Example #1
0
    def back_propagate_through_time(self, X, Y):
        X = self._assert_numpy(X)
        Y = self._assert_numpy(Y)
        self.reset()
        F_args, Fs, I_args, Is, C_args, Cs, Hf_args, Hfs, Hs, O_args, Os, Ss = self.feed_forward_and_cache(
            X)
        Ss = numpy.concatenate((Ss, numpy.zeros((1, self.hidden_size))))
        Hs = numpy.concatenate((Hs, numpy.zeros((1, self.hidden_size))))
        self.reset()
        delta = numpy.multiply(self.compute_error_vector(Os, Y),
                               self.afunc_primes[2](O_args))
        dLdfs, dLdis, dLdcs, dLdhfs, dLdOs = self.init_gradients(delta, Hs)
        dLdX = numpy.zeros(X.shape)

        def update(params, H, X, d):
            W_, U_, b_ = params
            b_ += d
            U_ += numpy.outer(d, H)
            W_ += numpy.outer(d, X)

        for t in range(X.shape[0]):
            delta_s_t = numpy.zeros(self.S.shape)
            delta_h_t = numpy.dot(delta[t], self.W_o)
            for bptt_step in range(max(0, t - self.bptt_truncate),
                                   t + 1)[::-1]:
                delta_s_t += numpy.multiply(
                    numpy.multiply(delta_h_t, Hfs[bptt_step]),
                    self.afunc_primes[1](Ss[bptt_step]))
                delta_h_t = numpy.multiply(delta_h_t,
                                           self.afuncs[1](Ss[bptt_step]))
                delta_h_t = numpy.multiply(
                    delta_h_t, af.sigmoid_prime(Hf_args[bptt_step]))
                delta_c = numpy.multiply(
                    numpy.multiply(delta_s_t, Is[bptt_step]),
                    self.afunc_primes[0](C_args[bptt_step]))
                delta_i = numpy.multiply(
                    numpy.multiply(delta_s_t, Cs[bptt_step]),
                    af.sigmoid_prime(I_args[bptt_step]))
                delta_f = numpy.multiply(
                    numpy.multiply(delta_s_t, Ss[bptt_step - 1]),
                    af.sigmoid_prime(F_args[bptt_step]))
                update(dLdhfs, Hs[bptt_step - 1], X[bptt_step], delta_h_t)
                update(dLdcs, Hs[bptt_step - 1], X[bptt_step], delta_c)
                update(dLdis, Hs[bptt_step - 1], X[bptt_step], delta_i)
                update(dLdfs, Hs[bptt_step - 1], X[bptt_step], delta_f)
                delta_h_t += numpy.dot(delta_c, self.U_c) +\
                    numpy.dot(delta_i, self.U_i) + numpy.dot(delta_f, self.U_f)
                dLdX[bptt_step] += numpy.dot(delta_c, self.W_c) +\
                    numpy.dot(delta_i, self.W_i) + numpy.dot(delta_f, self.W_f)
                delta_s_t = numpy.multiply(delta_s_t, Fs[bptt_step])
        return (
            dLdfs,
            dLdis,
            dLdcs,
            dLdhfs,
            dLdOs,
            dLdX,
        )
Example #2
0
    def _back_prop(self, x, y):
        nabla_b = [np.zeros(bias.shape) for bias in self.biases]
        nabla_w = [np.zeros(weight.shape) for weight in self.weights]

        error = (self._activations[-1] - y) * sigmoid_prime(self._zs[-1])
        nabla_b[-1] = error
        nabla_w[-1] = error.dot(self._activations[-2].transpose())

        for l in range(self.num_layers - 2, 0, -1):
            error = np.multiply(self.weights[l + 1].transpose().dot(error),
                                sigmoid_prime(self._zs[l]))
            nabla_b[l] = error
            nabla_w[l] = error.dot(self._activations[l - 1].transpose())

        return nabla_b, nabla_w
Example #3
0
    def back_prop_input_gate(self, delta_s_t, delta_h_t, C, I_arg, H, X, dLdis,
                             dLdX):
        dLdW_i, dLdU_i, dLdb_i = dLdis

        # The "input gate." We know s_t = F * s_t-1 + I * C
        # so delta_i = delta_s_t * C. Both parts have shape (hidden_state)
        delta_i = numpy.multiply(delta_s_t, C)
        # I = sigmoid(dot(W_i, x_t) + dot(U_i, h_t-1) + b_c)
        # caching this error: delta_i = delta_i * sigmoid_prime(I_args[bptt_step])
        delta_i = numpy.multiply(delta_i, af.sigmoid_prime(I_arg))
        dLdb_i += delta_i
        dLdU_i += numpy.outer(delta_i, H)
        dLdW_i += numpy.outer(delta_i, X)
        # need to update delta_h_t: each element += delta_i * U_i
        delta_h_t += numpy.dot(delta_i, self.U_i)
Example #4
0
    def back_prop_forget_gate(self, delta_s_t, delta_h_t, S, F_arg, H, X,
                              dLdfs, dLdX):
        dLdW_f, dLdU_f, dLdb_f = dLdfs

        # last gate: The "forget" gate. We know s_t = F * s_t-1 + I * C
        # so delta_f = delta_s_t * s_t-1, both element have size (hidden_size)
        delta_f = numpy.multiply(delta_s_t, S)
        # we also know F = sigmoid(dot(W_f, x_t) + dot(U_f, h_t-1) + b_f)
        # we can cache some of this information.
        delta_f = numpy.multiply(delta_f, af.sigmoid_prime(F_arg))
        dLdb_f += delta_f
        dLdU_f += numpy.outer(delta_f, H)
        dLdW_f += numpy.outer(delta_f, X)
        # need to update delta_h_t: each element += delta_f * U_f.T
        delta_h_t += numpy.dot(delta_f, self.U_f)
Example #5
0
    def back_prop_hidden_filter_gate(self, delta_s_t, delta_h_t, Hf_arg, H, X,
                                     dLdhfs, dLdX):
        """Accumulate hidden-filter ("output") gate gradients in place and
        seed delta_h_t with this gate's contribution w.r.t. h_{t-1}.

        The gate is Hf = sigmoid(dot(W_hf, x_t) + dot(U_hf, h_{t-1}) + b_hf).
        On entry, delta_h_t holds the error arriving at h_t; on exit it
        holds that error routed back to h_{t-1} through U_hf.
        """
        dLdW_hf, dLdU_hf, dLdb_hf = dLdhfs

        # Error at the gate's pre-activation: delta_h_t * sigmoid'(Hf_arg),
        # each of shape (hidden_size,).
        delta_hf = numpy.multiply(delta_h_t, af.sigmoid_prime(Hf_arg))
        # Accumulate into the caller's gradient buffers (in place).
        dLdb_hf += delta_hf
        # dLdU_hf has shape (hidden_size, hidden_size): outer(delta_hf, h_{t-1}).
        dLdU_hf += numpy.outer(delta_hf, H)
        # dLdW_hf has shape (hidden_size, input_size): outer(delta_hf, x_t).
        dLdW_hf += numpy.outer(delta_hf, X)
        # Seed delta_h_t with the gate's derivative w.r.t. h_{t-1}.
        # numpy.dot(vec, U_hf) computes U_hf.T @ vec — the transpose matters:
        # each error component must be routed back through the weight row
        # that produced it in the forward direction.
        # BUG FIX: the original rebound the local name
        # (delta_h_t = numpy.dot(...)), which silently discarded the result —
        # the caller never saw the update, unlike the sibling gate methods
        # that mutate delta_h_t in place. Assign through the array instead so
        # the caller observes the new value.
        delta_h_t[:] = numpy.dot(delta_hf, self.U_hf)