def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    in_deltas = buffers.input_deltas.default
    out_deltas = buffers.output_deltas.default

    # reshape
    flat_inputs = flatten_time(inputs)
    flat_in_deltas = flatten_time(in_deltas)
    flat_out_deltas = flatten_time(out_deltas)
    flat_outputs = flatten_time(outputs)

    if self.type == 'max':
        argmax = buffers.internals.argmax
        flat_argmax = flatten_time(argmax)
        _h.maxpool2d_backward_batch(flat_inputs, self.kernel_size,
                                    flat_outputs, self.padding, self.stride,
                                    flat_argmax, flat_in_deltas,
                                    flat_out_deltas)
    elif self.type == 'avg':
        _h.avgpool2d_backward_batch(flat_inputs, self.kernel_size,
                                    flat_outputs, self.padding, self.stride,
                                    flat_in_deltas, flat_out_deltas)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    W, R, bias, timing = buffers.parameters
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    Ha = buffers.internals.Ha

    flat_inputs = flatten_time(inputs)
    flat_H = flatten_time(Ha[:-1])

    _h.dot_mm(flat_inputs, W, flat_H, transb=True)
    _h.add_mv(flat_H, bias.reshape((1, self.size)), flat_H)

    tmp = _h.zeros(timing.shape)
    cond = _h.zeros(outputs[0].shape)
    for t in range(inputs.shape[0]):
        _h.dot_add_mm(outputs[t - 1], R, Ha[t], transb=True)
        _h.act_func[self.activation](Ha[t], outputs[t])

        # Undo updates
        if t > 0:
            _h.fill(tmp, t)
            _h.modulo_tt(tmp, timing, tmp)
            _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)
            _h.copy_to_if(outputs[t - 1], outputs[t], cond)
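# A minimal plain-NumPy sketch of the clockwork update implemented above.
# It assumes `timing` holds one integer clock period per unit and that
# copy_to_if copies entries where `cond` is nonzero; clockwork_rnn_step is a
# hypothetical helper, not part of the layer API.
import numpy as np

def clockwork_rnn_step(x_t, y_prev, W, R, bias, timing, t, act=np.tanh):
    y_t = act(x_t @ W.T + y_prev @ R.T + bias)   # ordinary RNN update
    inactive = (t % timing) != 0                 # period does not divide t
    y_t[:, inactive] = y_prev[:, inactive]       # inactive units keep old output
    return y_t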
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    W, R, bias = buffers.parameters
    dW, dR, dbias = buffers.gradients
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    dinputs = buffers.input_deltas.default
    doutputs = buffers.output_deltas.default
    Ha, dHa, dHb = buffers.internals

    _h.copy_to(doutputs, dHb)
    T = inputs.shape[0] - 1
    _h.act_func_deriv[self.activation](Ha[T], outputs[T], dHb[T], dHa[T])
    for t in range(T - 1, -1, -1):
        _h.dot_add_mm(dHa[t + 1], R, dHb[t])
        _h.act_func_deriv[self.activation](Ha[t], outputs[t],
                                           dHb[t], dHa[t])

    flat_inputs = flatten_time_and_features(inputs)
    flat_dinputs = flatten_time_and_features(dinputs)
    flat_dHa = flatten_time(dHa[:-1])

    # calculate in_deltas and gradients
    _h.dot_add_mm(flat_dHa, W, flat_dinputs)
    _h.dot_add_mm(flat_dHa, flat_inputs, dW, transa=True)
    dbias_tmp = _h.allocate(dbias.shape)
    _h.sum_t(flat_dHa, axis=0, out=dbias_tmp)
    _h.add_tt(dbias, dbias_tmp, dbias)

    flat_outputs = flatten_time(outputs[:-2])
    flat_dHa = flatten_time(dHa[1:-1])
    _h.dot_add_mm(flat_dHa, flat_outputs, dR, transa=True)
    _h.dot_add_mm(dHa[0], outputs[-1], dR, transa=True)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    W_H, W_T, R_T, bias_T, R_H, bias_H = buffers.parameters
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default

    H_list = []
    T_list = []
    Y_list = []
    for i in range(self.recurrence_depth):
        H_list.append(buffers.internals['H_{}'.format(i)])
        T_list.append(buffers.internals['T_{}'.format(i)])
        Y_list.append(buffers.internals['Y_{}'.format(i)])

    flat_inputs = flatten_time_and_features(inputs)
    flat_H = flatten_time(H_list[0][:-1])
    flat_T = flatten_time(T_list[0][:-1])

    _h.dot_mm(flat_inputs, W_H, flat_H, transb=True)
    _h.dot_mm(flat_inputs, W_T, flat_T, transb=True)

    for t in range(inputs.shape[0]):
        for i in range(self.recurrence_depth):
            if i == 0:
                x = outputs[t - 1]
                _h.dot_add_mm(x, R_T[i], T_list[i][t], transb=True)
                _h.add_mv(T_list[i][t], bias_T[i].reshape((1, self.size)),
                          T_list[i][t])
                _h.inplace_act_func['sigmoid'](T_list[i][t])

                _h.dot_add_mm(x, R_H[i], H_list[i][t], transb=True)
                _h.add_mv(H_list[i][t], bias_H[i].reshape((1, self.size)),
                          H_list[i][t])
                _h.inplace_act_func[self.activation](H_list[i][t])
            else:
                x = Y_list[i - 1][t]
                _h.dot_mm(x, R_T[i], T_list[i][t], transb=True)
                _h.add_mv(T_list[i][t], bias_T[i].reshape((1, self.size)),
                          T_list[i][t])
                _h.inplace_act_func['sigmoid'](T_list[i][t])

                _h.dot_mm(x, R_H[i], H_list[i][t], transb=True)
                _h.add_mv(H_list[i][t], bias_H[i].reshape((1, self.size)),
                          H_list[i][t])
                _h.inplace_act_func[self.activation](H_list[i][t])

            if i == 0:
                _h.mult_tt(T_list[i][t], H_list[i][t], out=Y_list[i][t])
                tmp = _h.ones(H_list[i][t].shape)
                _h.subtract_tt(tmp, T_list[i][t], tmp)
                _h.mult_add_tt(tmp, outputs[t - 1], out=Y_list[i][t])
            else:
                _h.mult_tt(T_list[i][t], H_list[i][t], out=Y_list[i][t])
                tmp = _h.ones(H_list[i][t].shape)
                _h.subtract_tt(tmp, T_list[i][t], tmp)
                _h.mult_add_tt(tmp, Y_list[i - 1][t], out=Y_list[i][t])

        _h.copy_to(Y_list[self.recurrence_depth - 1][t], outputs[t])
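# Hedged plain-NumPy sketch of the recurrent-highway update above: each
# sub-layer i produces a candidate H_i and a transform gate T_i, and mixes
# them with the carried state as y_i = T_i * H_i + (1 - T_i) * carry. Only
# layer 0 sees the external input. rhn_step is a hypothetical helper.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def rhn_step(x_t, y_prev, W_H, W_T, R_H, R_T, b_H, b_T, act=np.tanh):
    carry = y_prev
    for i in range(len(R_H)):
        if i == 0:
            h = act(x_t @ W_H.T + carry @ R_H[0].T + b_H[0])
            tr = sigmoid(x_t @ W_T.T + carry @ R_T[0].T + b_T[0])
        else:
            h = act(carry @ R_H[i].T + b_H[i])
            tr = sigmoid(carry @ R_T[i].T + b_T[i])
        carry = tr * h + (1.0 - tr) * carry
    return carry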
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    (Wz, Wi, Wf, Wo,
     pi, pf, po,
     Rz, Ri, Rf, Ro,
     bz, bi, bf, bo) = buffers.parameters
    (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
     dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb, dCa, dCb) = buffers.internals
    x = buffers.inputs.default
    y = buffers.outputs.default
    time_size, batch_size, in_size = x.shape

    flat_x = flatten_time(x)
    flat_Za = flatten_time(Za[:-1])
    flat_Ia = flatten_time(Ia[:-1])
    flat_Fa = flatten_time(Fa[:-1])
    flat_Oa = flatten_time(Oa[:-1])
    _h.dot_mm(flat_x, Wz, flat_Za, transb=True)
    _h.dot_mm(flat_x, Wi, flat_Ia, transb=True)
    _h.dot_mm(flat_x, Wf, flat_Fa, transb=True)
    _h.dot_mm(flat_x, Wo, flat_Oa, transb=True)

    for t in range(time_size):
        # Block input
        _h.dot_add_mm(y[t - 1], Rz, Za[t], transb=True)
        _h.add_mv(Za[t], bz.reshape((1, self.size)), Za[t])
        _h.act_func[self.activation](Za[t], Zb[t])

        # Input Gate
        _h.dot_add_mm(y[t - 1], Ri, Ia[t], transb=True)
        _h.mult_add_mv(Ca[t - 1], pi, Ia[t])
        _h.add_mv(Ia[t], bi.reshape((1, self.size)), Ia[t])
        _h.sigmoid(Ia[t], Ib[t])

        # Forget Gate
        _h.dot_add_mm(y[t - 1], Rf, Fa[t], transb=True)
        _h.mult_add_mv(Ca[t - 1], pf, Fa[t])
        _h.add_mv(Fa[t], bf.reshape((1, self.size)), Fa[t])
        _h.sigmoid(Fa[t], Fb[t])

        # Cell
        _h.mult_tt(Ib[t], Zb[t], Ca[t])
        _h.mult_add_tt(Fb[t], Ca[t - 1], Ca[t])

        # Output Gate
        _h.dot_add_mm(y[t - 1], Ro, Oa[t], transb=True)
        _h.mult_add_mv(Ca[t], po, Oa[t])
        _h.add_mv(Oa[t], bo.reshape((1, self.size)), Oa[t])
        _h.sigmoid(Oa[t], Ob[t])

        # Block output
        _h.act_func[self.activation](Ca[t], Cb[t])
        _h.mult_tt(Ob[t], Cb[t], y[t])
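# Plain-NumPy reference for one peephole-LSTM step, mirroring the handler
# calls above. lstm_peephole_step and the parameter dict `p` are
# hypothetical; the gates use the logistic sigmoid, while the block input
# and block output use the layer's activation function.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def lstm_peephole_step(x_t, y_prev, c_prev, p, act=np.tanh):
    zb = act(x_t @ p['Wz'].T + y_prev @ p['Rz'].T + p['bz'])
    ib = sigmoid(x_t @ p['Wi'].T + y_prev @ p['Ri'].T + c_prev * p['pi'] + p['bi'])
    fb = sigmoid(x_t @ p['Wf'].T + y_prev @ p['Rf'].T + c_prev * p['pf'] + p['bf'])
    c = ib * zb + fb * c_prev
    ob = sigmoid(x_t @ p['Wo'].T + y_prev @ p['Ro'].T + c * p['po'] + p['bo'])
    y = ob * act(c)
    return y, c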
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    W, bias = buffers.parameters
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default

    # reshape
    flat_inputs = flatten_time(inputs)
    flat_outputs = flatten_time(outputs)

    # calculate outputs
    _h.conv2d_forward_batch(flat_inputs, W, bias, flat_outputs,
                            self.padding, self.stride)
    _h.inplace_act_func[self.activation](outputs)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    y = buffers.inputs.default
    t = buffers.inputs.targets
    cee = buffers.internals.cee
    cee_sum = buffers.outputs.default

    # the binomial cross entropy error is given by
    # - t * ln(y) - (1-t) * ln(1-y)
    tmp = _h.ones(cee.shape)
    _h.subtract_tt(tmp, y, cee)        # cee = 1-y
    _h.subtract_tt(tmp, t, tmp)        # tmp = 1-t
    _h.clip_t(cee, 1e-6, 1.0, cee)
    _h.log_t(cee, cee)                 # cee = ln(1-y)
    _h.mult_tt(tmp, cee, tmp)          # tmp = (1-t) * ln(1-y)

    _h.clip_t(y, 1e-6, 1.0, cee)
    _h.log_t(cee, cee)                 # cee = ln(y)
    _h.mult_tt(t, cee, cee)            # cee = t * ln(y)

    _h.add_tt(tmp, cee, cee)           # cee = (1-t) * ln(1-y) + t * ln(y)

    # reshape for summation
    cee = flatten_time_and_features(cee)
    cee_sum = flatten_time(cee_sum)
    _h.sum_t(cee, axis=1, out=cee_sum)
    _h.mult_st(-1, cee_sum, cee_sum)   # * -1
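# Reference computation in plain NumPy (hypothetical helper), matching the
# clipping to [1e-6, 1] and the per-sample summation over features above.
import numpy as np

def binomial_cross_entropy(y, t, eps=1e-6):
    ln_y = np.log(np.clip(y, eps, 1.0))
    ln_1my = np.log(np.clip(1.0 - y, eps, 1.0))
    cee = t * ln_y + (1 - t) * ln_1my
    return -cee.reshape(cee.shape[0] * cee.shape[1], -1).sum(axis=1)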
def test_flatten_time():
    # Testing for NumpyHandler only
    _h = NumpyHandler(np.float64)
    shape = (2, 3, 2, 4)
    x = np.random.randn(*shape)
    y = flatten_time(x).copy()
    yp = x.reshape((6, 2, 4))
    assert np.allclose(y, yp)
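# The tested behaviour: flatten_time merges the time and batch axes, e.g.
# (2, 3, 2, 4) -> (6, 2, 4). The sketches below show the assumed semantics
# of the two helpers used throughout; the library versions may differ in how
# they produce views on handler arrays.
def flatten_time_sketch(array):
    return array.reshape((array.shape[0] * array.shape[1],) + array.shape[2:])

def flatten_time_and_features_sketch(array):
    return array.reshape((array.shape[0] * array.shape[1], -1))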
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    W, R, bias = buffers.parameters
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    Ha = buffers.internals.Ha

    flat_inputs = flatten_time(inputs)
    flat_H = flatten_time(Ha[:-1])

    _h.dot_mm(flat_inputs, W, flat_H, transb=True)
    _h.add_mv(flat_H, bias.reshape((1, self.size)), flat_H)

    for t in range(inputs.shape[0]):
        _h.dot_add_mm(outputs[t - 1], R, Ha[t], transb=True)
        _h.act_func[self.activation](Ha[t], outputs[t])
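# Per-timestep recurrence implemented above, written as a plain-NumPy sketch
# (hypothetical helper): Ha[t] = x[t] @ W.T + y[t-1] @ R.T + bias and
# y[t] = act(Ha[t]).
import numpy as np

def rnn_step(x_t, y_prev, W, R, bias, act=np.tanh):
    return act(x_t @ W.T + y_prev @ R.T + bias)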
def forward_pass(self, buffers, training_pass=True):
    _h = self.handler
    flat_inp = flatten_time_and_features(buffers.inputs.default)
    flat_mask = flatten_time(buffers.inputs.mask)
    flat_out = flatten_time_and_features(buffers.outputs.default)
    _h.mult_mv(flat_inp, flat_mask, out=flat_out)
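# NumPy sketch of the masking (hypothetical helper). It assumes the mask has
# one value per (time, batch) entry and that mult_mv broadcasts that value
# across the feature axis.
def apply_mask_sketch(x, mask):
    flat = x.reshape(x.shape[0] * x.shape[1], -1)
    return (flat * mask.reshape(-1, 1)).reshape(x.shape)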
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    W, bias = buffers.parameters
    dW, dbias = buffers.gradients
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    in_deltas = buffers.input_deltas.default
    out_deltas = buffers.output_deltas.default

    # reshape
    flat_inputs = flatten_time(inputs)
    flat_in_deltas = flatten_time(in_deltas)
    flat_out_deltas = flatten_time(out_deltas)

    # calculate in_deltas and gradients
    _h.inplace_act_func_deriv[self.activation](outputs, out_deltas)
    _h.conv2d_backward_batch(flat_inputs, W, self.padding, self.stride,
                             flat_in_deltas, flat_out_deltas, dW, dbias)
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    W, R, bias, timing = buffers.parameters
    dW, dR, dbias, dtiming = buffers.gradients
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    dinputs = buffers.input_deltas.default
    doutputs = buffers.output_deltas.default
    Ha, dHa, dHb = buffers.internals

    tmp = _h.zeros(timing.shape)
    cond = _h.zeros(outputs[0].shape)

    _h.copy_to(doutputs, dHb)
    T = inputs.shape[0] - 1
    _h.act_func_deriv[self.activation](Ha[T], outputs[T], dHb[T], dHa[T])

    for t in range(T - 1, -1, -1):
        _h.fill(tmp, t + 1)
        _h.modulo_tt(tmp, timing, tmp)
        _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)
        _h.add_into_if(dHb[t + 1], dHb[t], cond)
        _h.fill_if(dHa[t + 1], 0.0, cond)

        _h.dot_add_mm(dHa[t + 1], R, dHb[t])
        _h.act_func_deriv[self.activation](Ha[t], outputs[t],
                                           dHb[t], dHa[t])

    flat_inputs = flatten_time(inputs)
    flat_dinputs = flatten_time(dinputs)
    flat_dHa = flatten_time(dHa[:-1])

    # Calculate in_deltas and gradients
    _h.dot_add_mm(flat_dHa, W, flat_dinputs)
    _h.dot_add_mm(flat_dHa, flat_inputs, dW, transa=True)
    dbias_tmp = _h.allocate(dbias.shape)
    _h.sum_t(flat_dHa, axis=0, out=dbias_tmp)
    _h.add_tt(dbias, dbias_tmp, dbias)

    flat_outputs = flatten_time(outputs[:-2])
    flat_dHa = flatten_time(dHa[1:-1])
    _h.dot_add_mm(flat_dHa, flat_outputs, dR, transa=True)
    _h.dot_add_mm(dHa[0], outputs[-1], dR, transa=True)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default

    # reshape
    flat_inputs = flatten_time(inputs)
    flat_outputs = flatten_time(outputs)

    # calculate outputs
    if self.type == 'max':
        argmax = buffers.internals.argmax
        flat_argmax = flatten_time(argmax)
        _h.maxpool2d_forward_batch(flat_inputs, self.kernel_size,
                                   flat_outputs, self.padding,
                                   self.stride, flat_argmax)
    elif self.type == 'avg':
        _h.avgpool2d_forward_batch(flat_inputs, self.kernel_size,
                                   flat_outputs, self.padding, self.stride)
def backward_pass(self, buffers):
    _h = self.handler
    flat_out_deltas = flatten_time_and_features(
        buffers.output_deltas.default)
    tmp = self.handler.allocate(flat_out_deltas.shape)
    flat_mask = flatten_time(buffers.inputs.mask)
    flat_in_deltas = flatten_time_and_features(
        buffers.input_deltas.default)

    _h.mult_mv(flat_out_deltas, flat_mask, tmp)
    _h.add_tt(tmp, flat_in_deltas, flat_in_deltas)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    inputs_1 = flatten_time_and_features(buffers.inputs.inputs_1)
    inputs_2 = flatten_time_and_features(buffers.inputs.inputs_2)
    diff = flatten_time_and_features(buffers.internals.squared_diff)
    diff_sum = flatten_time(buffers.outputs.default)

    # calculate
    _h.subtract_tt(inputs_1, inputs_2, out=diff)
    _h.mult_tt(diff, diff, out=diff)
    _h.sum_t(diff, axis=1, out=diff_sum)
    _h.mult_st(0.5, diff_sum, out=diff_sum)
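# Plain-NumPy equivalent of the forward computation (hypothetical helper):
# 0.5 * sum((a - b)^2) over all features, one value per (time, batch) entry.
import numpy as np

def squared_difference_sketch(a, b):
    diff = (a - b).reshape(a.shape[0] * a.shape[1], -1)
    return 0.5 * np.sum(diff * diff, axis=1)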
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    assert isinstance(_h, Handler)
    dinputs = flatten_time_and_features(buffers.input_deltas.default)
    dloss = flatten_time(buffers.output_deltas.loss)
    dcee = flatten_time_and_features(buffers.internals.cee)
    targets = flatten_time_and_features(buffers.inputs.targets)
    prob = flatten_time_and_features(buffers.outputs.probabilities)

    _h.subtract_tt(prob, targets, dcee)  # y - t
    _h.mult_mv(dcee, dloss, dcee)        # out_delta * (y - t)
    _h.add_tt(dcee, dinputs, dinputs)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    assert isinstance(_h, Handler)
    inputs = buffers.inputs.default
    tmp = buffers.internals.tmp
    outputs = buffers.outputs.loss

    # reshape
    flat_inputs = flatten_time_and_features(inputs)
    flat_tmp = flatten_time_and_features(tmp)
    flat_outputs = flatten_time(outputs)

    # compute
    _h.abs_t(flat_inputs, flat_tmp)
    _h.sum_t(flat_tmp, 1, flat_outputs)
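# Plain-NumPy equivalent of the forward computation (hypothetical helper):
# the loss per (time, batch) entry is the sum of absolute feature values.
import numpy as np

def l1_loss_sketch(x):
    return np.abs(x.reshape(x.shape[0] * x.shape[1], -1)).sum(axis=1)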
def backward_pass(self, buffers):
    _h = self.handler
    assert isinstance(_h, Handler)
    inputs = buffers.inputs.default
    tmp = buffers.internals.tmp
    output_deltas = buffers.output_deltas.loss
    input_deltas = buffers.input_deltas.default

    # reshape
    flat_inputs = flatten_time_and_features(inputs)
    flat_tmp = flatten_time_and_features(tmp)
    flat_output_deltas = flatten_time(output_deltas)
    flat_input_deltas = flatten_time_and_features(input_deltas)

    # compute
    _h.mult_mv(flat_inputs, flat_output_deltas, flat_tmp)
    _h.add_tt(flat_tmp, flat_input_deltas, flat_input_deltas)
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    inputs_1 = flatten_time_and_features(buffers.inputs.inputs_1)
    inputs_2 = flatten_time_and_features(buffers.inputs.inputs_2)
    out_deltas = buffers.output_deltas.default
    grad_diff = buffers.internals.grad_diff
    dinputs_1 = flatten_time_and_features(buffers.input_deltas.inputs_1)
    dinputs_2 = flatten_time_and_features(buffers.input_deltas.inputs_2)
    tmp = _h.allocate(inputs_2.shape)

    # out_deltas has only one feature dimension due to summation,
    # so we broadcast to all feature dimensions
    _h.broadcast_t(out_deltas, 2, grad_diff)
    grad_diff = flatten_time(grad_diff)

    # calculate
    _h.subtract_tt(inputs_1, inputs_2, out=tmp)
    _h.mult_add_tt(grad_diff, tmp, dinputs_1)
    _h.subtract_tt(inputs_2, inputs_1, out=tmp)
    _h.mult_add_tt(grad_diff, tmp, dinputs_2)
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    W_H, W_T, R_T, bias_T, R_H, bias_H = buffers.parameters
    dW_H, dW_T, dR_T, dbias_T, dR_H, dbias_H = buffers.gradients
    inputs = buffers.inputs.default
    outputs = buffers.outputs.default
    dinputs = buffers.input_deltas.default
    doutputs = buffers.output_deltas.default

    H_list = []
    T_list = []
    Y_list = []
    dH_list = []
    dT_list = []
    dY_list = []
    for i in range(self.recurrence_depth):
        H_list.append(buffers.internals['H_{}'.format(i)])
        T_list.append(buffers.internals['T_{}'.format(i)])
        Y_list.append(buffers.internals['Y_{}'.format(i)])
        dH_list.append(buffers.internals['dH_{}'.format(i)])
        dT_list.append(buffers.internals['dT_{}'.format(i)])
        dY_list.append(buffers.internals['dY_{}'.format(i)])

    t = inputs.shape[0] - 1
    _h.copy_to(doutputs[t], dY_list[self.recurrence_depth - 1][t])
    for i in range(self.recurrence_depth - 1, -1, -1):
        if i == 0:
            _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
            tmp = _h.ones(dH_list[i][t].shape)
            _h.subtract_tt(H_list[i][t], outputs[t - 1], tmp)
            _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
            _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                 dT_list[i][t])
            _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                       dH_list[i][t])
        else:
            _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
            tmp = _h.ones(dH_list[i][t].shape)
            _h.subtract_tt(tmp, T_list[i][t], tmp)
            _h.mult_tt(dY_list[i][t], tmp, dY_list[i - 1][t])
            _h.subtract_tt(H_list[i][t], Y_list[i - 1][t], tmp)
            _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
            _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                 dT_list[i][t])
            _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                       dH_list[i][t])
            _h.dot_add_mm(dT_list[i][t], R_T[i], dY_list[i - 1][t])
            _h.dot_add_mm(dH_list[i][t], R_H[i], dY_list[i - 1][t])

    for t in range(inputs.shape[0] - 2, -1, -1):
        _h.dot_add_mm(dT_list[0][t + 1], R_T[0], doutputs[t])
        _h.dot_add_mm(dH_list[0][t + 1], R_H[0], doutputs[t])
        tmp = _h.ones(dH_list[0][t + 1].shape)
        _h.subtract_tt(tmp, T_list[0][t + 1], tmp)
        _h.mult_add_tt(dY_list[0][t + 1], tmp, doutputs[t])
        _h.copy_to(doutputs[t], dY_list[self.recurrence_depth - 1][t])

        for i in range(self.recurrence_depth - 1, -1, -1):
            if i == 0:
                _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                tmp = _h.ones(dH_list[i][t].shape)
                _h.subtract_tt(H_list[i][t], outputs[t - 1], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                     dT_list[i][t])
                _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                           dH_list[i][t])
            else:
                _h.mult_tt(dY_list[i][t], T_list[i][t], dH_list[i][t])
                tmp = _h.ones(dH_list[i][t].shape)
                _h.subtract_tt(tmp, T_list[i][t], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dY_list[i - 1][t])
                _h.subtract_tt(H_list[i][t], Y_list[i - 1][t], tmp)
                _h.mult_tt(dY_list[i][t], tmp, dT_list[i][t])
                _h.inplace_act_func_deriv['sigmoid'](T_list[i][t],
                                                     dT_list[i][t])
                _h.inplace_act_func_deriv[self.activation](H_list[i][t],
                                                           dH_list[i][t])
                _h.dot_add_mm(dT_list[i][t], R_T[i], dY_list[i - 1][t])
                _h.dot_add_mm(dH_list[i][t], R_H[i], dY_list[i - 1][t])

    flat_inputs = flatten_time_and_features(inputs)
    flat_dinputs = flatten_time_and_features(dinputs)
    flat_dH = flatten_time(dH_list[0][:-1])
    flat_dT = flatten_time(dT_list[0][:-1])

    # calculate in_deltas and gradients
    _h.dot_add_mm(flat_dH, W_H, flat_dinputs)
    _h.dot_add_mm(flat_dH, flat_inputs, dW_H, transa=True)
    _h.dot_add_mm(flat_dT, W_T, flat_dinputs)
    _h.dot_add_mm(flat_dT, flat_inputs, dW_T, transa=True)

    for i in range(self.recurrence_depth):
        dbias_tmp = _h.allocate(dbias_H[i].shape)
        flat_dH = flatten_time(dH_list[i][:-1])
        flat_dT = flatten_time(dT_list[i][:-1])
        _h.sum_t(flat_dT, axis=0, out=dbias_tmp)
        _h.add_tt(dbias_T[i], dbias_tmp, dbias_T[i])
        _h.sum_t(flat_dH, axis=0, out=dbias_tmp)
        _h.add_tt(dbias_H[i], dbias_tmp, dbias_H[i])

    for i in range(self.recurrence_depth):
        if i == 0:
            flat_outputs = flatten_time(outputs[:-2])
            flat_dH = flatten_time(dH_list[i][1:-1])
            flat_dT = flatten_time(dT_list[i][1:-1])
            _h.dot_add_mm(flat_dT, flat_outputs, dR_T[i], transa=True)
            _h.dot_add_mm(dT_list[i][0], outputs[-1], dR_T[i], transa=True)
            _h.dot_add_mm(flat_dH, flat_outputs, dR_H[i], transa=True)
            _h.dot_add_mm(dH_list[i][0], outputs[-1], dR_H[i], transa=True)
        else:
            flat_outputs = flatten_time(Y_list[i - 1][:-1])
            flat_dH = flatten_time(dH_list[i][:-1])
            flat_dT = flatten_time(dT_list[i][:-1])
            _h.dot_add_mm(flat_dT, flat_outputs, dR_T[i], transa=True)
            _h.dot_add_mm(flat_dH, flat_outputs, dR_H[i], transa=True)
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    (Wz, Wi, Wf, Wo,
     pi, pf, po,
     Rz, Ri, Rf, Ro,
     bz, bi, bf, bo) = buffers.parameters
    (dWz, dWi, dWf, dWo,
     dpi, dpf, dpo,
     dRz, dRi, dRf, dRo,
     dbz, dbi, dbf, dbo) = buffers.gradients
    (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
     dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb, dCa, dCb) = buffers.internals
    x = buffers.inputs.default
    dx = buffers.input_deltas.default
    y = buffers.outputs.default
    deltas = buffers.output_deltas.default

    dy = _h.allocate(y.shape)

    _h.fill(dCa, 0.0)
    time_size, batch_size, in_size = x.shape
    for t in range(time_size - 1, -1, -1):
        # Accumulate recurrent deltas
        _h.copy_to(deltas[t], dy[t])
        _h.dot_add_mm(dIa[t + 1], Ri, dy[t])
        _h.dot_add_mm(dFa[t + 1], Rf, dy[t])
        _h.dot_add_mm(dOa[t + 1], Ro, dy[t])
        _h.dot_add_mm(dZa[t + 1], Rz, dy[t])

        # Peephole connection part:
        _h.mult_add_mv(dIa[t + 1], pi, dCa[t])
        _h.mult_add_mv(dFa[t + 1], pf, dCa[t])

        # Output Gate
        _h.mult_tt(dy[t], Cb[t], dOb[t])
        _h.sigmoid_deriv(Oa[t], Ob[t], dOb[t], dOa[t])

        # Peephole connection
        _h.mult_add_mv(dOa[t], po, dCa[t])

        # Cell
        _h.mult_tt(dy[t], Ob[t], dCb[t])
        _h.act_func_deriv[self.activation](Ca[t], Cb[t], dCb[t], dCb[t])
        _h.add_tt(dCa[t], dCb[t], dCa[t])
        _h.mult_add_tt(dCa[t + 1], Fb[t + 1], dCa[t])

        # Forget Gate
        _h.mult_tt(dCa[t], Ca[t - 1], dFb[t])
        _h.sigmoid_deriv(Fa[t], Fb[t], dFb[t], dFa[t])

        # Input Gate
        _h.mult_tt(dCa[t], Zb[t], dIb[t])
        _h.sigmoid_deriv(Ia[t], Ib[t], dIb[t], dIa[t])

        # Block Input
        _h.mult_tt(dCa[t], Ib[t], dZb[t])
        _h.act_func_deriv[self.activation](Za[t], Zb[t], dZb[t], dZa[t])

    flat_inputs = flatten_time(x)
    flat_dinputs = flatten_time(dx)
    flat_dIa = flatten_time(dIa[:-1])
    flat_dFa = flatten_time(dFa[:-1])
    flat_dOa = flatten_time(dOa[:-1])
    flat_dZa = flatten_time(dZa[:-1])

    # Calculate in_deltas and gradients
    _h.dot_add_mm(flat_dIa, Wi, flat_dinputs)
    _h.dot_add_mm(flat_dFa, Wf, flat_dinputs)
    _h.dot_add_mm(flat_dOa, Wo, flat_dinputs)
    _h.dot_add_mm(flat_dZa, Wz, flat_dinputs)

    _h.dot_add_mm(flat_dIa, flat_inputs, dWi, transa=True)
    _h.dot_add_mm(flat_dFa, flat_inputs, dWf, transa=True)
    _h.dot_add_mm(flat_dOa, flat_inputs, dWo, transa=True)
    _h.dot_add_mm(flat_dZa, flat_inputs, dWz, transa=True)

    dbias_tmp = _h.allocate(dbz.shape)
    _h.sum_t(flat_dIa, axis=0, out=dbias_tmp)
    _h.add_tt(dbi, dbias_tmp, dbi)
    _h.sum_t(flat_dFa, axis=0, out=dbias_tmp)
    _h.add_tt(dbf, dbias_tmp, dbf)
    _h.sum_t(flat_dOa, axis=0, out=dbias_tmp)
    _h.add_tt(dbo, dbias_tmp, dbo)
    _h.sum_t(flat_dZa, axis=0, out=dbias_tmp)
    _h.add_tt(dbz, dbias_tmp, dbz)

    flat_outputs = flatten_time(y[:-2])
    flat_cell = flatten_time(Ca[:-2])
    flat_cell2 = flatten_time(Ca[:-1])

    dWco_tmp = _h.allocate(flat_cell2.shape)
    dWc_tmp = _h.allocate(dpo.shape)
    # Output gate Peephole
    _h.mult_tt(flat_cell2, flat_dOa, dWco_tmp)
    _h.sum_t(dWco_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpo, dWc_tmp, dpo)

    flat_dIa = flatten_time(dIa[1:-1])
    flat_dFa = flatten_time(dFa[1:-1])
    flat_dOa = flatten_time(dOa[1:-1])
    flat_dZa = flatten_time(dZa[1:-1])

    _h.dot_add_mm(flat_dIa, flat_outputs, dRi, transa=True)
    _h.dot_add_mm(flat_dFa, flat_outputs, dRf, transa=True)
    _h.dot_add_mm(flat_dOa, flat_outputs, dRo, transa=True)
    _h.dot_add_mm(flat_dZa, flat_outputs, dRz, transa=True)

    _h.dot_add_mm(dIa[0], dy[-1], dRi, transa=True)
    _h.dot_add_mm(dFa[0], dy[-1], dRf, transa=True)
    _h.dot_add_mm(dOa[0], dy[-1], dRo, transa=True)
    _h.dot_add_mm(dZa[0], dy[-1], dRz, transa=True)

    # Other Peephole connections
    dWcif_tmp = _h.allocate(flat_cell.shape)
    _h.mult_tt(flat_cell, flat_dIa, dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpi, dWc_tmp, dpi)
    _h.mult_tt(flat_cell, flat_dFa, dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpf, dWc_tmp, dpf)

    dWcif_tmp = _h.allocate(dIa[0].shape)
    _h.mult_tt(dCa[-1], dIa[0], dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpi, dWc_tmp, dpi)
    _h.mult_tt(dCa[-1], dFa[0], dWcif_tmp)  # forget-gate term uses dFa[0]
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpf, dWc_tmp, dpf)
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    (dWz, dWi, dWf, dWo,
     dpi, dpf, dpo,
     dRz, dRi, dRf, dRo,
     dbz, dbi, dbf, dbo, dtiming) = buffers.gradients
    (Wz, Wi, Wf, Wo,
     pi, pf, po,
     Rz, Ri, Rf, Ro,
     bz, bi, bf, bo, timing) = buffers.parameters
    (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
     dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb, dCa, dCb) = buffers.internals
    x = buffers.inputs.default
    dx = buffers.input_deltas.default
    y = buffers.outputs.default
    deltas = buffers.output_deltas.default

    dy = _h.allocate(y.shape)

    time_size, batch_size = x.shape[0], x.shape[1]

    # Temporary variable to be filled with the current value of time t
    tmp = _h.zeros(timing.shape)

    _h.fill(dCa, 0.0)
    cond = _h.zeros(y[0].shape)
    for t in range(time_size - 1, -1, -1):
        # Accumulate recurrent deltas
        _h.add_tt(dy[t], deltas[t], dy[t])

        _h.fill(tmp, t)
        _h.modulo_tt(tmp, timing, tmp)
        _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)

        _h.dot_add_mm(dIa[t + 1], Ri, dy[t])
        _h.dot_add_mm(dFa[t + 1], Rf, dy[t])
        _h.dot_add_mm(dOa[t + 1], Ro, dy[t])
        _h.dot_add_mm(dZa[t + 1], Rz, dy[t])
        _h.mult_add_mv(dIa[t + 1], pi, dCa[t])
        _h.mult_add_mv(dFa[t + 1], pf, dCa[t])

        # Output Gate
        _h.mult_tt(dy[t], Cb[t], dOb[t])
        _h.fill_if(dOb[t], 0, cond)  # Set inactive to 0
        _h.sigmoid_deriv(Oa[t], Ob[t], dOb[t], dOa[t])

        # Output influence on peephole:
        _h.mult_add_mv(dOa[t], po, dCa[t])

        # Cell
        _h.mult_tt(dy[t], Ob[t], dCb[t])
        _h.act_func_deriv[self.activation](Ca[t], Cb[t], dCb[t], dCb[t])
        _h.fill_if(dCb[t], 0, cond)
        _h.add_tt(dCa[t], dCb[t], dCa[t])
        _h.mult_add_tt(dCa[t + 1], Fb[t + 1], dCa[t])

        # Forget Gate
        _h.mult_tt(dCa[t], Ca[t - 1], dFb[t])
        _h.sigmoid_deriv(Fa[t], Fb[t], dFb[t], dFa[t])

        # Input Gate
        _h.mult_tt(dCa[t], Zb[t], dIb[t])
        _h.sigmoid_deriv(Ia[t], Ib[t], dIb[t], dIa[t])

        # Block Input
        _h.mult_tt(dCa[t], Ib[t], dZb[t])
        _h.act_func_deriv[self.activation](Za[t], Zb[t], dZb[t], dZa[t])

        # Copy over the error from previous inactive nodes
        _h.add_into_if(dy[t], dy[t - 1], cond)
        _h.add_into_if(dCa[t], dCa[t - 1], cond)

        # Undo updates to inactive nodes:
        _h.fill_if(dIa[t], 0, cond)
        _h.fill_if(dFa[t], 0, cond)
        _h.fill_if(dZa[t], 0, cond)
        _h.fill_if(Fb[t], 0, cond)

    # Same as for the standard LSTM:
    flat_inputs = flatten_time_and_features(x)
    flat_dinputs = flatten_time_and_features(dx)
    flat_dIa = flatten_time(dIa[:-1])
    flat_dFa = flatten_time(dFa[:-1])
    flat_dOa = flatten_time(dOa[:-1])
    flat_dZa = flatten_time(dZa[:-1])

    # calculate in_deltas and gradients
    _h.dot_add_mm(flat_dIa, Wi, flat_dinputs)
    _h.dot_add_mm(flat_dFa, Wf, flat_dinputs)
    _h.dot_add_mm(flat_dOa, Wo, flat_dinputs)
    _h.dot_add_mm(flat_dZa, Wz, flat_dinputs)

    _h.dot_add_mm(flat_dIa, flat_inputs, dWi, transa=True)
    _h.dot_add_mm(flat_dFa, flat_inputs, dWf, transa=True)
    _h.dot_add_mm(flat_dOa, flat_inputs, dWo, transa=True)
    _h.dot_add_mm(flat_dZa, flat_inputs, dWz, transa=True)

    dbias_tmp = _h.allocate(dbz.shape)
    _h.sum_t(flat_dIa, axis=0, out=dbias_tmp)
    _h.add_tt(dbi, dbias_tmp, dbi)
    _h.sum_t(flat_dFa, axis=0, out=dbias_tmp)
    _h.add_tt(dbf, dbias_tmp, dbf)
    _h.sum_t(flat_dOa, axis=0, out=dbias_tmp)
    _h.add_tt(dbo, dbias_tmp, dbo)
    _h.sum_t(flat_dZa, axis=0, out=dbias_tmp)
    _h.add_tt(dbz, dbias_tmp, dbz)

    flat_outputs = flatten_time(y[:-2])
    flat_cell = flatten_time(Ca[:-2])
    flat_cell2 = flatten_time(Ca[:-1])

    dWco_tmp = _h.allocate(flat_cell2.shape)
    dWc_tmp = _h.allocate(dpo.shape)
    # Peephole connection output weight:
    _h.mult_tt(flat_cell2, flat_dOa, dWco_tmp)
    _h.sum_t(dWco_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpo, dWc_tmp, dpo)

    flat_dIa = flatten_time(dIa[1:-1])
    flat_dFa = flatten_time(dFa[1:-1])
    flat_dOa = flatten_time(dOa[1:-1])
    flat_dZa = flatten_time(dZa[1:-1])

    _h.dot_add_mm(flat_dIa, flat_outputs, dRi, transa=True)
    _h.dot_add_mm(flat_dFa, flat_outputs, dRf, transa=True)
    _h.dot_add_mm(flat_dOa, flat_outputs, dRo, transa=True)
    _h.dot_add_mm(flat_dZa, flat_outputs, dRz, transa=True)

    _h.dot_add_mm(dIa[0], dy[-1], dRi, transa=True)
    _h.dot_add_mm(dFa[0], dy[-1], dRf, transa=True)
    _h.dot_add_mm(dOa[0], dy[-1], dRo, transa=True)
    _h.dot_add_mm(dZa[0], dy[-1], dRz, transa=True)

    # Other Peephole connections
    dWcif_tmp = _h.allocate(flat_cell.shape)
    _h.mult_tt(flat_cell, flat_dIa, dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpi, dWc_tmp, dpi)
    _h.mult_tt(flat_cell, flat_dFa, dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpf, dWc_tmp, dpf)

    dWcif_tmp = _h.allocate(dIa[0].shape)
    _h.mult_tt(dCa[-1], dIa[0], dWcif_tmp)
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpi, dWc_tmp, dpi)
    _h.mult_tt(dCa[-1], dFa[0], dWcif_tmp)  # forget-gate term uses dFa[0]
    _h.sum_t(dWcif_tmp, axis=0, out=dWc_tmp)
    _h.add_tt(dpf, dWc_tmp, dpf)
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    (Wz, Wi, Wf, Wo,
     pi, pf, po,
     Rz, Ri, Rf, Ro,
     bz, bi, bf, bo, timing) = buffers.parameters
    (Za, Zb, Ia, Ib, Fa, Fb, Oa, Ob, Ca, Cb,
     dZa, dZb, dIa, dIb, dFa, dFb, dOa, dOb, dCa, dCb) = buffers.internals
    x = buffers.inputs.default
    y = buffers.outputs.default
    time_size, batch_size = x.shape[0], x.shape[1]

    # Temporary variable to be filled with the current value of time t
    tmp = _h.zeros(timing.shape)
    cond = _h.zeros(y[0].shape)

    flat_x = flatten_time_and_features(x)
    flat_Za = flatten_time(Za[:-1])
    flat_Ia = flatten_time(Ia[:-1])
    flat_Fa = flatten_time(Fa[:-1])
    flat_Oa = flatten_time(Oa[:-1])
    _h.dot_mm(flat_x, Wz, flat_Za, transb=True)
    _h.dot_mm(flat_x, Wi, flat_Ia, transb=True)
    _h.dot_mm(flat_x, Wf, flat_Fa, transb=True)
    _h.dot_mm(flat_x, Wo, flat_Oa, transb=True)

    for t in range(time_size):
        # Block input
        _h.dot_add_mm(y[t - 1], Rz, Za[t], transb=True)
        _h.add_mv(Za[t], bz.reshape((1, self.size)), Za[t])
        _h.act_func[self.activation](Za[t], Zb[t])

        # Input Gate
        _h.dot_add_mm(y[t - 1], Ri, Ia[t], transb=True)
        _h.mult_add_mv(Ca[t - 1], pi, Ia[t])  # ADDED PEEPHOLE CONNECTION
        _h.add_mv(Ia[t], bi.reshape((1, self.size)), Ia[t])
        _h.sigmoid(Ia[t], Ib[t])

        # Forget Gate
        _h.dot_add_mm(y[t - 1], Rf, Fa[t], transb=True)
        _h.mult_add_mv(Ca[t - 1], pf, Fa[t])  # ADDED PEEPHOLE CONNECTION
        _h.add_mv(Fa[t], bf.reshape((1, self.size)), Fa[t])
        _h.sigmoid(Fa[t], Fb[t])

        # Cell
        _h.mult_tt(Ib[t], Zb[t], Ca[t])
        _h.mult_add_tt(Fb[t], Ca[t - 1], Ca[t])

        # Output Gate
        _h.dot_add_mm(y[t - 1], Ro, Oa[t], transb=True)
        _h.mult_add_mv(Ca[t], po, Oa[t])  # ADDED PEEPHOLE CONNECTION
        _h.add_mv(Oa[t], bo.reshape((1, self.size)), Oa[t])
        _h.sigmoid(Oa[t], Ob[t])

        # Block output
        _h.act_func[self.activation](Ca[t], Cb[t])
        _h.mult_tt(Ob[t], Cb[t], y[t])

        if t > 0:
            _h.fill(tmp, t)
            _h.modulo_tt(tmp, timing, tmp)
            _h.broadcast_t(tmp.reshape((1, tmp.shape[0])), 0, cond)

            # Reset Cell
            _h.copy_to_if(Ca[t - 1], Ca[t], cond)

            # Reset Block output
            _h.copy_to_if(y[t - 1], y[t], cond)