def back_propagate_through_time(self, X, Y):
    """Run truncated BPTT over the sequence X, Y and return the gradients for
    every gate's parameters plus the gradient with respect to the input X."""
    X = self._assert_numpy(X)
    Y = self._assert_numpy(Y)
    self.reset()
    # forward pass, caching every gate's pre-activation (*_args) and activation
    F_args, Fs, I_args, Is, C_args, Cs, Hf_args, Hfs, Hs, O_args, Os, Ss = \
        self.feed_forward_and_cache(X)
    # pad the state and hidden histories so that index -1 reads as zeros at t = 0
    Ss = numpy.concatenate((Ss, numpy.zeros((1, self.hidden_size))))
    Hs = numpy.concatenate((Hs, numpy.zeros((1, self.hidden_size))))
    self.reset()
    # output-layer error at every time step
    delta = numpy.multiply(self.compute_error_vector(Os, Y),
                           self.afunc_primes[2](O_args))
    dLdfs, dLdis, dLdcs, dLdhfs, dLdOs = self.init_gradients(delta, Hs)
    dLdX = numpy.zeros(X.shape)

    def update(params, H, X, d):
        # accumulate one gate's gradients; params is its (W, U, b) triple
        W_, U_, b_ = params
        b_ += d
        U_ += numpy.outer(d, H)
        W_ += numpy.outer(d, X)

    for t in range(X.shape[0]):
        delta_s_t = numpy.zeros(self.S.shape)
        # error flowing into h_t from the output layer
        delta_h_t = numpy.dot(delta[t], self.W_o)
        for bptt_step in reversed(range(max(0, t - self.bptt_truncate), t + 1)):
            # h_t = Hf * afuncs[1](s_t): split the error between s_t and Hf
            delta_s_t += numpy.multiply(
                numpy.multiply(delta_h_t, Hfs[bptt_step]),
                self.afunc_primes[1](Ss[bptt_step]))
            delta_h_t = numpy.multiply(delta_h_t, self.afuncs[1](Ss[bptt_step]))
            delta_h_t = numpy.multiply(
                delta_h_t, af.sigmoid_prime(Hf_args[bptt_step]))
            # s_t = F * s_{t-1} + I * C: distribute delta_s_t to each factor
            delta_c = numpy.multiply(
                numpy.multiply(delta_s_t, Is[bptt_step]),
                self.afunc_primes[0](C_args[bptt_step]))
            delta_i = numpy.multiply(
                numpy.multiply(delta_s_t, Cs[bptt_step]),
                af.sigmoid_prime(I_args[bptt_step]))
            delta_f = numpy.multiply(
                numpy.multiply(delta_s_t, Ss[bptt_step - 1]),
                af.sigmoid_prime(F_args[bptt_step]))
            update(dLdhfs, Hs[bptt_step - 1], X[bptt_step], delta_h_t)
            update(dLdcs, Hs[bptt_step - 1], X[bptt_step], delta_c)
            update(dLdis, Hs[bptt_step - 1], X[bptt_step], delta_i)
            update(dLdfs, Hs[bptt_step - 1], X[bptt_step], delta_f)
            # propagate the error back to h_{t-1}: the hidden-filter delta goes
            # through U_hf, the other gate deltas through their own U matrices
            delta_h_t = numpy.dot(delta_h_t, self.U_hf) + \
                numpy.dot(delta_c, self.U_c) + \
                numpy.dot(delta_i, self.U_i) + \
                numpy.dot(delta_f, self.U_f)
            dLdX[bptt_step] += numpy.dot(delta_c, self.W_c) + \
                numpy.dot(delta_i, self.W_i) + \
                numpy.dot(delta_f, self.W_f)
            # the forget gate carries the cell-state error back one time step
            delta_s_t = numpy.multiply(delta_s_t, Fs[bptt_step])
    return (dLdfs, dLdis, dLdcs, dLdhfs, dLdOs, dLdX)
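# A minimal usage sketch (not part of the class): one plain gradient-descent
# step driven by back_propagate_through_time. The attribute names for the gate
# parameters (W_f/U_f/b_f, W_i/U_i/b_i, W_c/U_c/b_c, W_hf/U_hf/b_hf) are
# assumptions read off the code above; the structure of dLdOs depends on
# init_gradients, so the output-layer update is left out here.
def sgd_step(lstm, X, Y, learning_rate=0.01):
    dLdfs, dLdis, dLdcs, dLdhfs, dLdOs, dLdX = \
        lstm.back_propagate_through_time(X, Y)
    gate_params = [
        ((lstm.W_f, lstm.U_f, lstm.b_f), dLdfs),
        ((lstm.W_i, lstm.U_i, lstm.b_i), dLdis),
        ((lstm.W_c, lstm.U_c, lstm.b_c), dLdcs),
        ((lstm.W_hf, lstm.U_hf, lstm.b_hf), dLdhfs),
    ]
    for params, grads in gate_params:
        for param, grad in zip(params, grads):
            param -= learning_rate * grad  # in-place, so lstm keeps the update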
def _back_prop(self, x, y):
    """Backpropagate the error for a single (x, y) pair and return the
    per-layer gradients for the biases and weights."""
    nabla_b = [np.zeros(bias.shape) for bias in self.biases]
    nabla_w = [np.zeros(weight.shape) for weight in self.weights]
    # error at the output layer
    error = (self._activations[-1] - y) * sigmoid_prime(self._zs[-1])
    nabla_b[-1] = error
    nabla_w[-1] = error.dot(self._activations[-2].transpose())
    # walk the error back through the hidden layers
    for l in range(self.num_layers - 2, 0, -1):
        error = np.multiply(self.weights[l + 1].transpose().dot(error),
                            sigmoid_prime(self._zs[l]))
        nabla_b[l] = error
        nabla_w[l] = error.dot(self._activations[l - 1].transpose())
    return nabla_b, nabla_w
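# A minimal usage sketch, assuming the network exposes `weights`, `biases`, and
# a forward pass that fills `_activations` and `_zs` before _back_prop runs.
# The cache names come from the code above; the `_feed_forward` method name and
# the learning rate are assumptions for illustration.
def sgd_update(net, x, y, learning_rate=0.1):
    net._feed_forward(x)                      # populate _activations and _zs
    nabla_b, nabla_w = net._back_prop(x, y)   # per-layer gradients
    net.biases = [b - learning_rate * nb
                  for b, nb in zip(net.biases, nabla_b)]
    net.weights = [w - learning_rate * nw
                   for w, nw in zip(net.weights, nabla_w)]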
def back_prop_input_gate(self, delta_s_t, delta_h_t, C, I_arg, H, X,
                         dLdis, dLdX):
    dLdW_i, dLdU_i, dLdb_i = dLdis
    # The "input" gate. We know s_t = F * s_t-1 + I * C,
    # so delta_i = delta_s_t * C. Both parts have shape (hidden_size,).
    delta_i = numpy.multiply(delta_s_t, C)
    # I = sigmoid(dot(W_i, x_t) + dot(U_i, h_t-1) + b_i);
    # caching this error: delta_i = delta_i * sigmoid_prime(I_args[bptt_step])
    delta_i = numpy.multiply(delta_i, af.sigmoid_prime(I_arg))
    dLdb_i += delta_i
    dLdU_i += numpy.outer(delta_i, H)
    dLdW_i += numpy.outer(delta_i, X)
    # need to update delta_h_t: each element += delta_i * U_i
    delta_h_t += numpy.dot(delta_i, self.U_i)
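# A tiny standalone check of the shapes used above: the rank-1 update
# numpy.outer(delta_i, H) has exactly the shape of U_i, and
# numpy.outer(delta_i, X) has exactly the shape of W_i. The sizes below are
# arbitrary illustration values, not taken from the model.
def _outer_product_shape_demo(hidden_size=4, input_size=3):
    delta_i = numpy.ones(hidden_size)   # gate error, shape (hidden_size,)
    H = numpy.ones(hidden_size)         # h_{t-1},    shape (hidden_size,)
    X = numpy.ones(input_size)          # x_t,        shape (input_size,)
    assert numpy.outer(delta_i, H).shape == (hidden_size, hidden_size)  # like U_i
    assert numpy.outer(delta_i, X).shape == (hidden_size, input_size)   # like W_i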
def back_prop_forget_gate(self, delta_s_t, delta_h_t, S, F_arg, H, X,
                          dLdfs, dLdX):
    dLdW_f, dLdU_f, dLdb_f = dLdfs
    # Last gate: the "forget" gate. We know s_t = F * s_t-1 + I * C,
    # so delta_f = delta_s_t * s_t-1; both elements have shape (hidden_size,).
    delta_f = numpy.multiply(delta_s_t, S)
    # we also know F = sigmoid(dot(W_f, x_t) + dot(U_f, h_t-1) + b_f),
    # so we can cache some of this information.
    delta_f = numpy.multiply(delta_f, af.sigmoid_prime(F_arg))
    dLdb_f += delta_f
    dLdU_f += numpy.outer(delta_f, H)
    dLdW_f += numpy.outer(delta_f, X)
    # need to update delta_h_t: each element += delta_f * U_f.T
    delta_h_t += numpy.dot(delta_f, self.U_f)
def back_prop_hidden_filter_gate(self, delta_s_t, delta_h_t, Hf_arg, H, X,
                                 dLdhfs, dLdX):
    dLdW_hf, dLdU_hf, dLdb_hf = dLdhfs
    # The "hidden filter" gate: Hf = sigmoid(dot(W_hf, x_t) + dot(U_hf, h_t-1) + b_hf).
    # Differentiating with respect to the argument gives
    # delta_h_t * sigmoid_prime(Hf_args[t]), each having shape (hidden_size,).
    delta_hf = numpy.multiply(delta_h_t, af.sigmoid_prime(Hf_arg))
    # time to update W_hf, U_hf, and b_hf
    dLdb_hf += delta_hf
    # each element of dLdU_hf should be += delta_hf * h_t-1;
    # delta_hf has shape (hidden_size,), h_t-1 has shape (hidden_size,),
    # and dLdU_hf has shape (hidden_size, hidden_size)
    dLdU_hf += numpy.outer(delta_hf, H)
    # each element of dLdW_hf should be += delta_hf * x_t;
    # delta_hf has shape (hidden_size,), x_t has shape (input_size,),
    # and dLdW_hf has shape (hidden_size, input_size)
    dLdW_hf += numpy.outer(delta_hf, X)
    # We also need to compute the first part of delta_h_t: the derivative of
    # the "hf" gate with respect to h_t-1, i.e. delta_hf * U_hf.
    # delta_hf has shape (hidden_size,) and U_hf is (hidden_size, hidden_size),
    # so we only need to compute delta_h_t = dot(delta_hf, U_hf.T).
    # The transpose is important. This is because we multiplied forward:
    #   [U_hf_00, U_hf_01, ..., U_hf_0hidden      ]   [ h_t-1_0      ]
    #   [U_hf_10, U_hf_11, ...                    ] * [ h_t-1_1      ]
    #   [ ...                                     ]   [ ...          ]
    #   [U_hf_hidden0, U_hf_hidden1, ...          ]   [ h_t-1_hidden ]
    # so, when we have error, delta_h_t holds the errors for each element of
    # h_t due to h_t-1. We want to multiply the error for each component by
    # the one weight that contributed to predicting that element. This can
    # only be done by reorganizing U_hf so that, when we perform the dot
    # operation, each element of delta_hf is multiplied by the row (now a
    # column in the transpose) that predicted that error unit in the forward
    # direction. Putting the vector on the left of numpy.dot gives exactly
    # dot(U_hf.T, delta_hf). Assign in place so the caller sees the new value.
    delta_h_t[:] = numpy.dot(delta_hf, self.U_hf)
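# A sketch (under stated assumptions) of how the three gate helpers above could
# replace the inlined gate math in back_propagate_through_time's inner loop.
# The caller still handles what the helpers do not cover: routing the incoming
# error between the cell state and the hidden-filter gate, and the candidate
# ("C") term, for which no helper appears above. Ordering matters: the
# hidden-filter helper overwrites delta_h_t, so it must run first, and the
# other gates then accumulate into it. All loop bookkeeping here is an
# assumption, not code from the model; dLdX accumulation is omitted.
def _bptt_step_with_helpers(self, bptt_step, delta_s_t, delta_h_t,
                            F_args, Fs, I_args, Is, C_args, Cs,
                            Hf_args, Hfs, Hs, Ss, X,
                            dLdfs, dLdis, dLdcs, dLdhfs, dLdX):
    H_prev = Hs[bptt_step - 1]
    x_t = X[bptt_step]
    # h_t = Hf * afuncs[1](s_t): route part of the incoming error to the cell
    # state, and scale what is left for the hidden-filter helper
    delta_s_t += numpy.multiply(
        numpy.multiply(delta_h_t, Hfs[bptt_step]),
        self.afunc_primes[1](Ss[bptt_step]))
    delta_h_t[:] = numpy.multiply(delta_h_t, self.afuncs[1](Ss[bptt_step]))
    # updates dLdhfs and overwrites delta_h_t with dot(delta_hf, U_hf)
    self.back_prop_hidden_filter_gate(delta_s_t, delta_h_t,
                                      Hf_args[bptt_step], H_prev, x_t,
                                      dLdhfs, dLdX)
    # candidate term, still inline: delta_c = delta_s_t * I * afunc_prime(C_arg)
    delta_c = numpy.multiply(
        numpy.multiply(delta_s_t, Is[bptt_step]),
        self.afunc_primes[0](C_args[bptt_step]))
    dLdW_c, dLdU_c, dLdb_c = dLdcs
    dLdb_c += delta_c
    dLdU_c += numpy.outer(delta_c, H_prev)
    dLdW_c += numpy.outer(delta_c, x_t)
    delta_h_t += numpy.dot(delta_c, self.U_c)
    # the input and forget helpers accumulate their contributions into delta_h_t
    self.back_prop_input_gate(delta_s_t, delta_h_t, Cs[bptt_step],
                              I_args[bptt_step], H_prev, x_t, dLdis, dLdX)
    self.back_prop_forget_gate(delta_s_t, delta_h_t, Ss[bptt_step - 1],
                               F_args[bptt_step], H_prev, x_t, dLdfs, dLdX)
    # the forget gate carries the cell-state error back one time step
    delta_s_t[:] = numpy.multiply(delta_s_t, Fs[bptt_step])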