def test_biRNN_bprop(backend_default, fargs, deltas_buffer):
    """
    Sanity-check BiRNN bprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop followed by bprop (the fprop output is fed back as the error).
    With the backward-direction weights copied from the forward-direction
    ones, the input deltas of the first run must equal the time-reversed
    input deltas of the second run.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
        deltas_buffer: pytest fixture used to allocate the layers' deltas.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    birnn_init = GlorotUniform()
    birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=birnn_init)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()

    birnn.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    birnn.set_deltas(deltas_buffer)

    # tie the backward-direction parameters to the forward-direction ones
    birnn.W_input_b[:] = birnn.W_input_f
    birnn.W_recur_b[:] = birnn.W_recur_f
    birnn.b_b[:] = birnn.b_f
    birnn.dW[:] = 0

    # uni-directional layer configured the same way; it is not referenced
    # by the assertions below but its setup touches the shared deltas_buffer
    rnn_init = GlorotUniform()
    rnn = Recurrent(hidden_size, activation=Rectlinclip(slope=0), init=rnn_init)
    rnn.configure(in_shape)
    rnn.prev_layer = True
    rnn.allocate()

    rnn.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    rnn.set_deltas(deltas_buffer)

    # host-side inputs: a random sequence and the same sequence with its
    # time steps in reverse order
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)

    # backend tensors
    dev_fwd = birnn.be.array(seq_fwd)
    dev_rev = birnn.be.array(seq_rev)

    # fprop + bprop on the original ordering
    act_fwd = birnn.fprop(dev_fwd)
    deltas_fwd = birnn.bprop(act_fwd).get().copy()

    # clear the hidden state, then repeat on the reversed ordering
    birnn.h_buffer[:] = 0
    act_rev = birnn.fprop(dev_rev)
    deltas_rev = birnn.bprop(act_rev).get().copy()

    # per-step comparison: step t of the first run vs. mirrored step of the second
    steps_fwd = get_steps(deltas_fwd, in_shape)
    steps_rev = get_steps(deltas_rev, in_shape)
    for d_fwd, d_rev in zip(steps_fwd, reversed(steps_rev)):
        assert np.allclose(d_fwd, d_rev, rtol=0.0, atol=1.0e-5)
def test_biLSTM_fprop(backend_default, fargs):
    """
    Sanity-check BiLSTM fprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop. The output is split at row ``hidden_size`` into an ``_f`` half and
    a ``_b`` half; with tied weights the ``_f`` half of one run must match
    the ``_b`` half of the other run at the mirrored time step.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(), init=weight_init,
                    activation=Tanh(), reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # tie the backward-direction parameters to the forward-direction ones
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)
    dev_fwd = bilstm.be.array(seq_fwd)
    dev_rev = bilstm.be.array(seq_rev)

    # run both orderings, clearing the hidden state in between
    res_fwd = bilstm.fprop(dev_fwd).get().copy()
    bilstm.h_buffer[:] = 0
    res_rev = bilstm.fprop(dev_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_biRNN_fprop(backend_default, fargs):
    """
    Sanity-check BiRNN fprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop. The output is split at row ``hidden_size`` into an ``_f`` half and
    a ``_b`` half; with tied weights the ``_f`` half of one run must match
    the ``_b`` half of the other run at the mirrored time step.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=weight_init)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()

    # hand the layer a single input-shaped buffer for its deltas
    birnn.set_deltas([birnn.be.iobuf(birnn.in_shape)])

    # tie the backward-direction parameters to the forward-direction ones
    birnn.W_input_b[:] = birnn.W_input_f
    birnn.W_recur_b[:] = birnn.W_recur_f
    birnn.b_b[:] = birnn.b_f
    birnn.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)
    dev_fwd = birnn.be.array(seq_fwd)
    dev_rev = birnn.be.array(seq_rev)

    # run both orderings, clearing the hidden state in between
    res_fwd = birnn.fprop(dev_fwd).get().copy()
    birnn.h_buffer[:] = 0
    res_rev = birnn.fprop(dev_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_biLSTM_fprop(backend_default, fargs):
    """
    Sanity-check BiLSTM fprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop. The output is split at row ``hidden_size`` into an ``_f`` half and
    a ``_b`` half; with tied weights the ``_f`` half of one run must match
    the ``_b`` half of the other run at the mirrored time step.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(), init=weight_init,
                    activation=Tanh(), reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # hand the layer a single input-shaped buffer for its deltas
    bilstm.set_deltas([bilstm.be.iobuf(bilstm.in_shape)])

    # tie the backward-direction parameters to the forward-direction ones
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)
    dev_fwd = bilstm.be.array(seq_fwd)
    dev_rev = bilstm.be.array(seq_rev)

    # run both orderings, clearing the hidden state in between
    res_fwd = bilstm.fprop(dev_fwd).get().copy()
    bilstm.h_buffer[:] = 0
    res_rev = bilstm.fprop(dev_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_biRNN_fprop(backend_default, fargs):
    """
    Sanity-check BiRNN fprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop. The output is split at row ``hidden_size`` into an ``_f`` half and
    a ``_b`` half; with tied weights the ``_f`` half of one run must match
    the ``_b`` half of the other run at the mirrored time step.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=weight_init)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()

    # tie the backward-direction parameters to the forward-direction ones
    birnn.W_input_b[:] = birnn.W_input_f
    birnn.W_recur_b[:] = birnn.W_recur_f
    birnn.b_b[:] = birnn.b_f
    birnn.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)
    dev_fwd = birnn.be.array(seq_fwd)
    dev_rev = birnn.be.array(seq_rev)

    # run both orderings, clearing the hidden state in between
    res_fwd = birnn.fprop(dev_fwd).get().copy()
    birnn.h_buffer[:] = 0
    res_rev = birnn.fprop(dev_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_biLSTM_bprop(backend_default, fargs):
    """
    Sanity-check BiLSTM fprop and bprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop followed by bprop (the fprop output is fed back as the error).
    Two properties are asserted:

    * the ``_f`` half of one run's output matches the ``_b`` half of the
      other run's output at the mirrored time step (halves are split at row
      ``hidden_size``);
    * the input deltas of the first run equal the time-reversed input deltas
      of the second run.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    activation=Tanh(), init=weight_init, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # hand the layer a single input-shaped buffer for its deltas
    bilstm.set_deltas([bilstm.be.iobuf(bilstm.in_shape)])

    # tie the backward-direction parameters to the forward-direction ones
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)

    # backend tensors
    dev_fwd = bilstm.be.array(seq_fwd)
    dev_rev = bilstm.be.array(seq_rev)

    # fprop + bprop on the original ordering; snapshot the output before
    # bprop runs
    act_fwd = bilstm.fprop(dev_fwd)
    res_fwd = act_fwd.get().copy()
    deltas_fwd = bilstm.bprop(act_fwd).get().copy()

    # clear the hidden state, then repeat on the reversed ordering
    bilstm.h_buffer[:] = 0
    act_rev = bilstm.fprop(dev_rev)
    res_rev = act_rev.get().copy()
    deltas_rev = bilstm.bprop(act_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # outputs: halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)

    # deltas: step t of the first run vs. mirrored step of the second
    steps_fwd = get_steps(deltas_fwd, in_shape)
    steps_rev = get_steps(deltas_rev, in_shape)
    for d_fwd, d_rev in zip(steps_fwd, reversed(steps_rev)):
        assert np.allclose(d_fwd, d_rev, rtol=0.0, atol=1.0e-5)
def init_buffers(self, inputs):
    """
    Remember the input tensor and build its per-time-step views.

    The views are rebuilt only when ``inputs`` is a different object from
    the tensor cached in ``self.x`` (or when nothing is cached yet);
    otherwise the existing ``self.xs`` list is left untouched.

    Arguments:
        inputs (Tensor): input data as 2D tensor. The dimension is
                         (input_size, sequence_length * batch_size)
    """
    already_cached = self.x is not None and self.x is inputs
    if already_cached:
        return

    self.x = inputs
    # one view per time step, sliced according to the layer's input shape
    self.xs = get_steps(inputs, self.in_shape)
def set_deltas(self, delta_buffers):
    """
    Use pre-allocated (by layer containers) list of buffers for backpropagated error.

    The parent class picks the buffer; this override keeps that whole buffer
    in ``self.deltas_buffer`` and replaces ``self.deltas`` with a list of
    per-time-step views on it. When the parent leaves ``self.deltas`` falsy,
    ``self.deltas`` becomes an empty list so bprop can iterate uniformly.

    Arguments:
        delta_buffers (list): list of pre-allocated tensors (provided by layer container)
    """
    super(TimeDistributedRecurrentOutput, self).set_deltas(delta_buffers)

    # keep a handle on the un-sliced buffer before overwriting self.deltas
    self.deltas_buffer = self.deltas
    self.deltas = (get_steps(self.deltas_buffer, self.in_shape)
                   if self.deltas else [])
def bprop(self, error, alpha=1.0, beta=0.0):
    """
    Backpropagation of errors, output delta for previous layer, and calculate the update on
    model params.

    The method zeroes ``self.dW``, lazily builds the per-step views it needs
    on the first call, then runs two passes over time: one for the
    forward-direction connections (iterating time steps in reverse) and one
    for the backward-direction connections (iterating time steps in order).
    Both passes reuse the same ``c_delta_buffer`` / ``ifog_delta_buffer``,
    which are cleared before each pass.

    Arguments:
        error (list[Tensor]): error tensors for each time step of unrolling.
            NOTE(review): the code slices ``error`` by rows at
            ``self.o_shape[0]`` and passes each part to ``get_steps``, so it
            looks like a single 2D tensor with the two directions stacked
            along the first axis -- confirm against callers.
        alpha (float, optional): scale to apply to input for activation
                                 gradient bprop.  Defaults to 1.0
        beta (float, optional): scale to apply to output activation
                                gradient bprop.  Defaults to 0.0

    Returns:
        Tensor: Backpropagated errors for each time step of model unrolling
                (``self.out_deltas_buffer``)
    """
    self.dW[:] = 0

    # One-time setup of the forward-direction views (cached on self).
    if self.in_deltas_f is None:
        self.in_deltas_f = get_steps(error[:self.o_shape[0]], self.o_shape)
        # in_deltas rotated by one step: entry t is the delta view of step t-1
        # (entry 0 wraps around to the last step)
        self.prev_in_deltas = self.in_deltas_f[-1:] + self.in_deltas_f[:-1]
        # gate deltas without the first step / hidden states without the last
        # step, paired up for the recurrent weight update below
        self.ifog_delta_last_steps = self.ifog_delta_buffer[:, self.be.bsz:]
        self.h_first_steps = self.h_buffer_f[:, :-self.be.bsz]
        # h_delta[5] * h[4] + h_delta[4] * h[3] + ... + h_delta[1] * h[0]

    # One-time setup of the backward-direction views (cached on self).
    if self.in_deltas_b is None:
        self.in_deltas_b = get_steps(error[self.o_shape[0]:], self.o_shape)
        # in_deltas rotated the other way: entry t is the delta view of step t+1
        # (the last entry wraps around to step 0)
        self.next_in_deltas = self.in_deltas_b[1:] + self.in_deltas_b[:1]
        # gate deltas without the last step / hidden states without the first
        # step, paired up for the recurrent weight update below
        self.ifog_delta_first_steps = self.ifog_delta_buffer[:, :-self.be.bsz]
        self.h_last_steps = self.h_buffer_b[:, self.be.bsz:]
        # h_delta[0] * h[1] + h_delta[1] * h[2] + ... + h_delta[4] * h[5]

    # Per-direction bundles that are zipped below to yield one tuple per time
    # step. The delta entries (ifog_delta ... c_delta) are the same objects in
    # both bundles.
    params_f = (self.in_deltas_f, self.prev_in_deltas,
                self.i_f, self.f_f, self.o_f, self.g_f,
                self.ifog_delta, self.i_delta, self.f_delta, self.o_delta, self.g_delta,
                self.c_delta, self.c_delta_prev, self.c_prev_bprop, self.c_act_f)

    params_b = (self.in_deltas_b, self.next_in_deltas,
                self.i_b, self.f_b, self.o_b, self.g_b,
                self.ifog_delta, self.i_delta, self.f_delta, self.o_delta, self.g_delta,
                self.c_delta, self.c_delta_next, self.c_next_bprop, self.c_act_b)

    # bprop for forward direction connections. Error flow from right to left
    self.c_delta_buffer[:] = 0
    self.ifog_delta_buffer[:] = 0
    self.ifog_delta_f = None
    self.ifog_delta_b = None
    # idx is not used inside the loop body
    for idx, (in_deltas, prev_in_deltas,
              i, f, o, g,
              ifog_delta, i_delta, f_delta, o_delta, g_delta,
              c_delta, c_delta_prev, c_prev, c_act) \
            in enumerate(reversed(list(zip(*params_f)))):

        # current cell delta
        c_delta[:] = c_delta + \
            self.activation.bprop(c_act) * (o * in_deltas)
        # presumably i/f/o/g_delta are views into ifog_delta, which is why
        # ifog_delta is consumed below without being assigned here -- TODO confirm
        i_delta[:] = self.gate_activation.bprop(i) * c_delta * g
        f_delta[:] = self.gate_activation.bprop(f) * c_delta * c_prev
        o_delta[:] = self.gate_activation.bprop(o) * in_deltas * c_act
        g_delta[:] = self.activation.bprop(g) * c_delta * i

        # bprop the errors to prev_in_delta and c_delta_prev
        # (beta=1.0: presumably accumulates into prev_in_deltas rather than
        # overwriting it -- confirm against the backend's compound_dot)
        self.be.compound_dot(self.W_recur_f.T, ifog_delta, prev_in_deltas, beta=1.0)
        # c_delta_prev may be None for a step; then there is nothing to write
        if c_delta_prev is not None:
            c_delta_prev[:] = c_delta * f

    # Weight deltas and accumulate
    self.be.compound_dot(self.ifog_delta_last_steps, self.h_first_steps.T, self.dW_recur_f)
    self.be.compound_dot(self.ifog_delta_buffer, self.x_f.T, self.dW_input_f)
    self.db_f[:] = self.be.sum(self.ifog_delta_buffer, axis=1)
    # out deltas to input units
    if self.out_deltas_buffer:
        self.be.compound_dot(self.W_input_f.T, self.ifog_delta_buffer,
                             self.out_deltas_buffer_f_v,
                             alpha=alpha, beta=beta)

    # bprop for backward direction connections. Error flow from left to right
    # (the shared delta buffers are cleared again before being reused)
    self.c_delta_buffer[:] = 0
    self.ifog_delta_buffer[:] = 0
    # idx is not used inside the loop body
    for idx, (in_deltas, next_in_deltas,
              i, f, o, g,
              ifog_delta, i_delta, f_delta, o_delta, g_delta,
              c_delta, c_delta_next, c_next, c_act) \
            in enumerate(zip(*params_b)):

        # current cell delta
        c_delta[:] = c_delta[:] + \
            self.activation.bprop(c_act) * (o * in_deltas)
        i_delta[:] = self.gate_activation.bprop(i) * c_delta * g
        f_delta[:] = self.gate_activation.bprop(f) * c_delta * c_next
        o_delta[:] = self.gate_activation.bprop(o) * in_deltas * c_act
        g_delta[:] = self.activation.bprop(g) * c_delta * i

        # bprop the errors to next_in_delta and c_next_delta
        self.be.compound_dot(self.W_recur_b.T, ifog_delta, next_in_deltas, beta=1.0)
        # c_delta_next may be None for a step; then there is nothing to write
        if c_delta_next is not None:
            c_delta_next[:] = c_delta * f

    # Weight deltas and accumulate
    self.be.compound_dot(self.ifog_delta_first_steps, self.h_last_steps.T, self.dW_recur_b)
    self.be.compound_dot(self.ifog_delta_buffer, self.x_b.T, self.dW_input_b)
    self.db_b[:] = self.be.sum(self.ifog_delta_buffer, axis=1)
    # out deltas to input units. bprop to the same inputs if
    # split_inputs=False
    # (when self.inputs is falsy beta is forced to 1.0, presumably so this
    # result is added onto what the forward-direction pass already wrote
    # -- confirm)
    if self.out_deltas_buffer:
        self.be.compound_dot(self.W_input_b.T, self.ifog_delta_buffer,
                             self.out_deltas_buffer_b_v,
                             alpha=alpha,
                             beta=beta if self.inputs else 1.0)

    return self.out_deltas_buffer
def test_biLSTM_bprop(backend_default, fargs, deltas_buffer):
    """
    Sanity-check BiLSTM fprop and bprop with tied forward/backward weights.

    A random input sequence and its time-reversed copy are each run through
    fprop followed by bprop (the fprop output is fed back as the error).
    Two properties are asserted:

    * the ``_f`` half of one run's output matches the ``_b`` half of the
      other run's output at the mirrored time step (halves are split at row
      ``hidden_size``);
    * the input deltas of the first run equal the time-reversed input deltas
      of the second run.

    Arguments:
        backend_default: pytest fixture; not referenced in the body
                         (presumably sets up the backend -- confirm).
        fargs (tuple): (seq_len, input_size, hidden_size, batch_size)
        deltas_buffer: pytest fixture used to allocate the layer's deltas.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    weight_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    activation=Tanh(), init=weight_init, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    bilstm.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    bilstm.set_deltas(deltas_buffer)

    # tie the backward-direction parameters to the forward-direction ones
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # a random sequence and the same sequence with its time steps reversed
    seq_fwd = np.random.random((input_size, seq_len * batch_size))
    steps_reversed = list(reversed(get_steps(seq_fwd.copy(), in_shape)))
    seq_rev = con(steps_reversed, axis=1)

    # backend tensors
    dev_fwd = bilstm.be.array(seq_fwd)
    dev_rev = bilstm.be.array(seq_rev)

    # fprop + bprop on the original ordering; snapshot the output before
    # bprop runs
    act_fwd = bilstm.fprop(dev_fwd)
    res_fwd = act_fwd.get().copy()
    deltas_fwd = bilstm.bprop(act_fwd).get().copy()

    # clear the hidden state, then repeat on the reversed ordering
    bilstm.h_buffer[:] = 0
    act_rev = bilstm.fprop(dev_rev)
    res_rev = act_rev.get().copy()
    deltas_rev = bilstm.bprop(act_rev).get().copy()

    # per-step views of each half of each output
    fwd_top = get_steps(res_fwd[:hidden_size], out_shape)
    fwd_bottom = get_steps(res_fwd[hidden_size:], out_shape)
    rev_top = get_steps(res_rev[:hidden_size], out_shape)
    rev_bottom = get_steps(res_rev[hidden_size:], out_shape)

    # outputs: halves must match cross-wise at mirrored time steps
    mirrored = zip(fwd_top, fwd_bottom, reversed(rev_top), reversed(rev_bottom))
    for x_f, x_b, y_f, y_b in mirrored:
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)

    # deltas: step t of the first run vs. mirrored step of the second
    steps_fwd = get_steps(deltas_fwd, in_shape)
    steps_rev = get_steps(deltas_rev, in_shape)
    for d_fwd, d_rev in zip(steps_fwd, reversed(steps_rev)):
        assert allclose_with_out(d_fwd, d_rev, rtol=0.0, atol=1.0e-5)
def bprop(self, error, alpha=1.0, beta=0.0):
    """
    Backpropagation of errors, output delta for previous layer, and calculate the update on
    model params.

    The method zeroes ``self.dW``, lazily builds the per-step views it needs
    on the first call, then runs two passes over time: one for the
    forward-direction connections (iterating time steps in reverse) and one
    for the backward-direction connections (iterating time steps in order).
    Both passes reuse the same ``c_delta_buffer`` / ``ifog_delta_buffer``,
    which are cleared before each pass.

    Arguments:
        error (list[Tensor]): error tensors for each time step of unrolling.
            NOTE(review): the code slices ``error`` by rows at
            ``self.o_shape[0]`` and passes each part to ``get_steps``, so it
            looks like a single 2D tensor with the two directions stacked
            along the first axis -- confirm against callers.
        alpha (float, optional): scale to apply to input for activation
                                 gradient bprop.  Defaults to 1.0
        beta (float, optional): scale to apply to output activation
                                gradient bprop.  Defaults to 0.0

    Returns:
        Tensor: Backpropagated errors for each time step of model unrolling
                (``self.out_deltas_buffer``)
    """
    self.dW[:] = 0

    # One-time setup of the forward-direction views (cached on self).
    if self.in_deltas_f is None:
        self.in_deltas_f = get_steps(error[:self.o_shape[0]], self.o_shape)
        # in_deltas rotated by one step: entry t is the delta view of step t-1
        # (entry 0 wraps around to the last step)
        self.prev_in_deltas = self.in_deltas_f[-1:] + self.in_deltas_f[:-1]
        # gate deltas without the first step / hidden states without the last
        # step, paired up for the recurrent weight update below
        self.ifog_delta_last_steps = self.ifog_delta_buffer[:, self.be.bsz:]
        self.h_first_steps = self.h_buffer_f[:, :-self.be.bsz]
        # h_delta[5] * h[4] + h_delta[4] * h[3] + ... + h_delta[1] * h[0]

    # One-time setup of the backward-direction views (cached on self).
    if self.in_deltas_b is None:
        self.in_deltas_b = get_steps(error[self.o_shape[0]:], self.o_shape)
        # in_deltas rotated the other way: entry t is the delta view of step t+1
        # (the last entry wraps around to step 0)
        self.next_in_deltas = self.in_deltas_b[1:] + self.in_deltas_b[:1]
        # gate deltas without the last step / hidden states without the first
        # step, paired up for the recurrent weight update below
        self.ifog_delta_first_steps = self.ifog_delta_buffer[:, :-self.be.bsz]
        self.h_last_steps = self.h_buffer_b[:, self.be.bsz:]
        # h_delta[0] * h[1] + h_delta[1] * h[2] + ... + h_delta[4] * h[5]

    # Per-direction bundles that are zipped below to yield one tuple per time
    # step. The delta entries (ifog_delta ... c_delta) are the same objects in
    # both bundles.
    params_f = (self.in_deltas_f, self.prev_in_deltas,
                self.i_f, self.f_f, self.o_f, self.g_f,
                self.ifog_delta, self.i_delta, self.f_delta, self.o_delta, self.g_delta,
                self.c_delta, self.c_delta_prev, self.c_prev_bprop, self.c_act_f)

    params_b = (self.in_deltas_b, self.next_in_deltas,
                self.i_b, self.f_b, self.o_b, self.g_b,
                self.ifog_delta, self.i_delta, self.f_delta, self.o_delta, self.g_delta,
                self.c_delta, self.c_delta_next, self.c_next_bprop, self.c_act_b)

    # bprop for forward direction connections. Error flow from right to left
    self.c_delta_buffer[:] = 0
    self.ifog_delta_buffer[:] = 0
    self.ifog_delta_f = None
    self.ifog_delta_b = None
    # idx is not used inside the loop body
    for idx, (in_deltas, prev_in_deltas,
              i, f, o, g,
              ifog_delta, i_delta, f_delta, o_delta, g_delta,
              c_delta, c_delta_prev, c_prev, c_act) \
            in enumerate(reversed(list(zip(*params_f)))):

        # current cell delta
        c_delta[:] = c_delta + \
            self.activation.bprop(c_act) * (o * in_deltas)
        # presumably i/f/o/g_delta are views into ifog_delta, which is why
        # ifog_delta is consumed below without being assigned here -- TODO confirm
        i_delta[:] = self.gate_activation.bprop(i) * c_delta * g
        f_delta[:] = self.gate_activation.bprop(f) * c_delta * c_prev
        o_delta[:] = self.gate_activation.bprop(o) * in_deltas * c_act
        g_delta[:] = self.activation.bprop(g) * c_delta * i

        # bprop the errors to prev_in_delta and c_delta_prev
        # (beta=1.0: presumably accumulates into prev_in_deltas rather than
        # overwriting it -- confirm against the backend's compound_dot)
        self.be.compound_dot(
            self.W_recur_f.T, ifog_delta, prev_in_deltas, beta=1.0)
        # c_delta_prev may be None for a step; then there is nothing to write
        if c_delta_prev is not None:
            c_delta_prev[:] = c_delta * f

    # Weight deltas and accumulate
    self.be.compound_dot(
        self.ifog_delta_last_steps, self.h_first_steps.T, self.dW_recur_f)
    self.be.compound_dot(
        self.ifog_delta_buffer, self.x_f.T, self.dW_input_f)
    self.db_f[:] = self.be.sum(self.ifog_delta_buffer, axis=1)
    # out deltas to input units
    if self.out_deltas_buffer:
        self.be.compound_dot(
            self.W_input_f.T, self.ifog_delta_buffer,
            self.out_deltas_buffer_f_v,
            alpha=alpha, beta=beta)

    # bprop for backward direction connections. Error flow from left to right
    # (the shared delta buffers are cleared again before being reused)
    self.c_delta_buffer[:] = 0
    self.ifog_delta_buffer[:] = 0
    # idx is not used inside the loop body
    for idx, (in_deltas, next_in_deltas,
              i, f, o, g,
              ifog_delta, i_delta, f_delta, o_delta, g_delta,
              c_delta, c_delta_next, c_next, c_act) \
            in enumerate(zip(*params_b)):

        # current cell delta
        c_delta[:] = c_delta[:] + \
            self.activation.bprop(c_act) * (o * in_deltas)
        i_delta[:] = self.gate_activation.bprop(i) * c_delta * g
        f_delta[:] = self.gate_activation.bprop(f) * c_delta * c_next
        o_delta[:] = self.gate_activation.bprop(o) * in_deltas * c_act
        g_delta[:] = self.activation.bprop(g) * c_delta * i

        # bprop the errors to next_in_delta and c_next_delta
        self.be.compound_dot(
            self.W_recur_b.T, ifog_delta, next_in_deltas, beta=1.0)
        # c_delta_next may be None for a step; then there is nothing to write
        if c_delta_next is not None:
            c_delta_next[:] = c_delta * f

    # Weight deltas and accumulate
    self.be.compound_dot(
        self.ifog_delta_first_steps, self.h_last_steps.T, self.dW_recur_b)
    self.be.compound_dot(
        self.ifog_delta_buffer, self.x_b.T, self.dW_input_b)
    self.db_b[:] = self.be.sum(self.ifog_delta_buffer, axis=1)
    # out deltas to input units. bprop to the same inputs if
    # split_inputs=False
    # (when self.inputs is falsy beta is forced to 1.0, presumably so this
    # result is added onto what the forward-direction pass already wrote
    # -- confirm)
    if self.out_deltas_buffer:
        self.be.compound_dot(self.W_input_b.T, self.ifog_delta_buffer,
                             self.out_deltas_buffer_b_v,
                             alpha=alpha,
                             beta=beta if self.inputs else 1.0)

    return self.out_deltas_buffer
def allocate_deltas(self, global_deltas=None):
    """
    Allocate the deltas via the parent class, then add an error buffer.

    After delegating to the parent implementation, an output-shaped buffer
    is created on the backend and stored as ``self.error_buf``, together
    with its per-time-step views in ``self.error_slices``.

    Arguments:
        global_deltas (optional): forwarded unchanged to the parent's
                                  allocate_deltas. Defaults to None
    """
    super(Encoder, self).allocate_deltas(global_deltas=global_deltas)

    shape = self.out_shape
    self.error_buf = self.be.iobuf(shape)
    # one view per time step on the buffer just allocated
    self.error_slices = get_steps(self.error_buf, shape)