Example #1
    def _bpts_step(self, i, gradient_reg, seqs, reps, inter_reps, left_subreps, right_subreps, rep_gradients):
        # BPTS
        seq = seqs[i]
        left, right, target = seq[0], seq[1], seq[2]

        left_is_token = T.lt(left, 0)
        right_is_token = T.lt(right, 0)

        bpts_gradient = gradient_reg[target]
        rep_gradient = rep_gradients[i] + bpts_gradient

        if self.deep:
            # Implementation note:
            # The gradient of the deep encoding function w.r.t. W_ee includes the input representation.
            # If we let T.grad locate that input representation directly, it gets stuck in an infinite loop,
            # so we must use SRG in this case.
            _fake_input_rep, = make_float_vectors("_fake_input_rep")
            deep_rep = self._deep_encode(_fake_input_rep)

            node_map = {deep_rep: reps[i], _fake_input_rep: inter_reps[i]}

            g_wee = SRG(T.grad(T.sum(deep_rep), self.W_ee), node_map) * rep_gradient
            g_bee = SRG(T.grad(T.sum(deep_rep), self.B_ee), node_map) * rep_gradient
            g_inter_rep = SRG(T.grad(T.sum(deep_rep), _fake_input_rep), node_map) * rep_gradient
            inter_rep = inter_reps[i]

        else:
            g_wee = T.constant(0)
            g_bee = T.constant(0)
            g_inter_rep = rep_gradient
            inter_rep = reps[i]

        # Accelerate the computation by reusing saved internal values.
        # Due to a limitation of SRG, known_grads cannot be used here.
        _fake_left_rep, _fake_right_rep = make_float_vectors("_fake_left_rep", "_fake_right_rep")
        rep_node = self._encode_computation(_fake_left_rep, _fake_right_rep)
        if self.deep:
            rep_node = self._deep_encode(rep_node)

        node_map = {_fake_left_rep: left_subreps[i], _fake_right_rep: right_subreps[i], rep_node: inter_rep}

        g_we1 = SRG(T.grad(T.sum(rep_node), self.W_e1), node_map) * g_inter_rep
        g_we2 = SRG(T.grad(T.sum(rep_node), self.W_e2), node_map) * g_inter_rep
        g_be = SRG(T.grad(T.sum(rep_node), self.B_e), node_map) * g_inter_rep

        g_left_p = SRG(T.grad(T.sum(rep_node), _fake_left_rep), node_map) * g_inter_rep
        g_right_p = SRG(T.grad(T.sum(rep_node), _fake_right_rep), node_map) * g_inter_rep

        gradient_reg = ifelse(left_is_token, gradient_reg, T.set_subtensor(gradient_reg[left], g_left_p))
        gradient_reg = ifelse(right_is_token, gradient_reg, T.set_subtensor(gradient_reg[right], g_right_p))

        return g_we1, g_we2, g_be, g_wee, g_bee, gradient_reg
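
All of these snippets use the same trick: build a small virtual graph on placeholder variables (make_float_vectors), take T.grad on that graph, and then substitute the values cached during the forward pass. Below is a minimal self-contained sketch of the idea, assuming SRG behaves like Theano's theano.clone with a replace map (the helper itself is not shown in these examples):

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones((3, 3), dtype=theano.config.floatX), name="W")
_fake_in = T.vector("_fake_in")                   # placeholder standing in for the real input
out = T.tanh(T.dot(_fake_in, W))                  # virtual encoding graph built on the placeholder

g_W = T.grad(T.sum(out), W)                       # symbolic gradient w.r.t. W on the virtual graph

saved_in = T.vector("saved_in")                   # representation cached during the forward pass
g_W_cheap = theano.clone(g_W, replace={_fake_in: saved_in})

f = theano.function([saved_in], g_W_cheap)
print(f(np.ones(3, dtype=theano.config.floatX)))  # gradient evaluated on the cached value
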
Example #2
    def _lstm_full_gradient_step(self, i,
                                 g_h, g_c,
                                 xs, ks, s_ts, h_ts, c_ts, o_ts, f_ts, i_ts,
                                 i_t_preacts, f_t_preacts, c_t_preacts, o_t_preacts, s_t_preacts,
                                 tanh_c_ts, tanh_c_t_rights, h_ps, c_ps):
        _s, = make_float_vectors("_s")
        _k = T.iscalar("_k")
        _softmax_s = self._softmax(_s)
        _g_cost = T.grad(self._cost_func(_softmax_s, _k), _s)
        g_s_t_preact = SRG(_g_cost, {_s: s_t_preacts[i], _softmax_s: s_ts[i], _k: ks[i]})

        g_wos = T.outer(o_ts[i], g_s_t_preact)
        g_o_t = T.dot(g_s_t_preact, T.transpose(self.W_os))  # + 0.0002 * o_ts[i]
        g_o_t += tanh_c_ts[i] * g_h
        g_tanh_c_t = o_ts[i] * g_h
        g_c_t = self._tanh_grad(tanh_c_ts[i]) * g_tanh_c_t + g_c
        g_o_t_preact = self._sigmoid_grad(o_ts[i]) * g_o_t
        g_bo = g_o_t_preact
        g_h_p = T.dot(g_o_t_preact, T.transpose(self.W_ho))
        g_c_t += T.dot(g_o_t_preact, T.transpose(self.W_co))
        g_wxo = T.outer(xs[i], g_o_t_preact)
        g_who = T.outer(h_ps[i], g_o_t_preact)
        g_wco = T.outer(c_ts[i], g_o_t_preact)
        g_f_t = c_ps[i] * g_c_t
        g_c_p = f_ts[i] * g_c_t
        g_i_t = tanh_c_t_rights[i] * g_c_t
        g_tanh_c_t_right = i_ts[i] * g_c_t
        g_c_t_preact = self._tanh_grad(tanh_c_t_rights[i]) * g_tanh_c_t_right
        g_bc = g_c_t_preact
        g_wxc = T.outer(xs[i], g_c_t_preact)
        g_whc = T.outer(h_ps[i], g_c_t_preact)
        g_h_p += T.dot(g_c_t_preact, T.transpose(self.W_hc))
        g_f_t_preact = self._sigmoid_grad(f_ts[i]) * g_f_t
        g_wxf = T.outer(xs[i], g_f_t_preact)
        g_whf = T.outer(h_ps[i], g_f_t_preact)
        g_wcf = T.outer(c_ps[i], g_f_t_preact)
        g_bf = g_f_t_preact
        g_h_p += T.dot(g_f_t_preact, T.transpose(self.W_hf))
        g_c_p += T.dot(g_f_t_preact, T.transpose(self.W_cf))
        g_i_t_preact = self._sigmoid_grad(i_ts[i]) * g_i_t
        g_wxi = T.outer(xs[i], g_i_t_preact)
        g_whi = T.outer(h_ps[i], g_i_t_preact)
        g_wci = T.outer(c_ps[i], g_i_t_preact)
        g_bi = g_i_t_preact
        g_h_p += T.dot(g_i_t_preact, T.transpose(self.W_hi))
        g_c_p += T.dot(g_i_t_preact, T.transpose(self.W_ci))

        # Params:
        # self.W_xi,self.W_hi,self.W_ci,self.W_xf,self.W_hf,
        # self.W_cf,self.W_xc,self.W_hc,self.W_xo,self.W_ho,self.W_co,self.W_os
        return g_h_p, g_c_p, g_wxi, g_whi, g_wci, g_wxf, g_whf, g_wcf, g_wxc, g_whc, g_wxo, g_who, g_wco, g_wos
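
For orientation, here is a sketch of the peephole-LSTM forward step that the hand-written backward pass above differentiates, reconstructed from the gradient expressions. The bias names (b_i, b_f, b_c, b_o) and the parameter container p are assumptions; only the gradients g_bi/g_bf/g_bc/g_bo appear in the code above.

import theano.tensor as T

def lstm_forward_step(x, h_p, c_p, p):
    # p is a hypothetical container for the parameters listed in the comment above.
    i_t_preact = T.dot(x, p.W_xi) + T.dot(h_p, p.W_hi) + T.dot(c_p, p.W_ci) + p.b_i
    i_t = T.nnet.sigmoid(i_t_preact)          # input gate (peephole on the previous cell)
    f_t_preact = T.dot(x, p.W_xf) + T.dot(h_p, p.W_hf) + T.dot(c_p, p.W_cf) + p.b_f
    f_t = T.nnet.sigmoid(f_t_preact)          # forget gate (peephole on the previous cell)
    c_t_preact = T.dot(x, p.W_xc) + T.dot(h_p, p.W_hc) + p.b_c
    tanh_c_t_right = T.tanh(c_t_preact)       # candidate cell value
    c_t = f_t * c_p + i_t * tanh_c_t_right    # new cell state
    o_t_preact = T.dot(x, p.W_xo) + T.dot(h_p, p.W_ho) + T.dot(c_t, p.W_co) + p.b_o
    o_t = T.nnet.sigmoid(o_t_preact)          # output gate (peephole on the new cell)
    tanh_c_t = T.tanh(c_t)
    h_t = o_t * tanh_c_t                      # new hidden state
    s_t_preact = T.dot(o_t, p.W_os)           # output preactivation, fed by o_t as g_wos implies
    return h_t, c_t, s_t_preact               # the step above applies a softmax to s_t_preact
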
Example #3
    def _build_gradient_func(self):
        self._preact_t, = make_float_vectors("_preact")

        self._sigmoid_grad_act_t = self._sigmoid(self._preact_t)
        self._sigmoid_grad_t = T.grad(T.sum(self._sigmoid_grad_act_t), self._preact_t)
        self._sigmoid_grad = lambda act: SRG(self._sigmoid_grad_t, {self._sigmoid_grad_act_t: act})

        self._tanh_grad_act_t = self._tanh(self._preact_t)
        self._tanh_grad_t = T.grad(T.sum(self._tanh_grad_act_t), self._preact_t)
        self._tanh_grad = lambda act: SRG(self._tanh_grad_t, {self._tanh_grad_act_t: act})

        self._softmax_grad_act_t = self._softmax(self._preact_t)
        self._softmax_grad_t = T.grad(T.sum(self._softmax_grad_act_t), self._preact_t)
        self._softmax_grad = lambda preact, act: SRG(self._softmax_grad_t,
                                                     {self._softmax_grad_act_t: act, self._preact_t: preact})
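
These helpers work because the derivatives of sigmoid and tanh can be written purely in terms of the saved activation a: sigmoid' = a * (1 - a) and tanh' = 1 - a**2, which is what substituting the activation into the SRG graphs above yields. A quick NumPy check of the closed forms against finite differences (not part of the original code):

import numpy as np

z = np.linspace(-3.0, 3.0, 7)
a_sig = 1.0 / (1.0 + np.exp(-z))              # sigmoid activations
a_tanh = np.tanh(z)                           # tanh activations

eps = 1e-5
num_sig = (1.0 / (1.0 + np.exp(-(z + eps))) - 1.0 / (1.0 + np.exp(-(z - eps)))) / (2 * eps)
num_tanh = (np.tanh(z + eps) - np.tanh(z - eps)) / (2 * eps)

assert np.allclose(a_sig * (1.0 - a_sig), num_sig, atol=1e-6)   # sigmoid' from the activation only
assert np.allclose(1.0 - a_tanh ** 2, num_tanh, atol=1e-6)      # tanh' from the activation only
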
Example #4
    def _unfold_gradients_func(self, rep, dec, g_dec, target_tok, tok, w, b, unfold_idx=0):
        distance = T.sum((target_tok - dec)**2)
        g_cost_dec = T.grad(distance, dec)

        tok_is_token = T.lt(tok, 0)
        g_dec_switcher = ifelse(tok_is_token, g_cost_dec, g_dec)

        output_distance = ifelse(tok_is_token, distance, T.constant(0.0, dtype=FLOATX))

        _rep, = make_float_vectors("_rep")
        _dec = self._decode_computation(_rep)[unfold_idx]
        node_map = {_rep: rep, _dec: dec}

        g_dec_rep = SRG(T.grad(T.sum(_dec), _rep), node_map) * g_dec_switcher
        g_dec_w = SRG(T.grad(T.sum(_dec), w), node_map) * g_dec_switcher
        g_dec_b = SRG(T.grad(T.sum(_dec), b), node_map) * g_dec_switcher

        return g_dec_rep, g_dec_w, g_dec_b, output_distance
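
The switch above relies on the gradient of the squared reconstruction distance: d/d(dec) of sum((target - dec)**2) equals 2 * (dec - target). A small standalone check with plain Theano (variable names here are illustrative only):

import numpy as np
import theano
import theano.tensor as T

dec = T.vector("dec")
target = T.vector("target")
g = T.grad(T.sum((target - dec) ** 2), dec)   # gradient of the reconstruction distance
f = theano.function([dec, target], g)

d = np.array([0.1, 0.4], dtype=theano.config.floatX)
t = np.array([1.0, -1.0], dtype=theano.config.floatX)
assert np.allclose(f(d, t), 2.0 * (d - t))    # matches the closed form
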
Example #5
        def bptt_step(i, wi, wr, bptt_error, xs, h_list, h_errs):

            # Make virtual graph
            _wi, _wr, _ws = make_float_matrices("_wi", "_wr", "_ws")
            _h, _r, _x, _s, _xz = make_float_vectors("_h", "_r", "_x", "_s", "_xz")

            _z = T.dot(_x, _wi) + T.dot(_r, _wr)
            _h = self._activation_func(_z)

            _xh = self._activation_func(_xz)

            node_map = {_h: h_list[i], _r: ifelse(T.eq(i, 0), self.h0, h_list[i-1]),
                        _x: xs[i], _wi: self.W_i, _wr: self.W_r}

            # Backpropagate

            g_z_wi = RG(T.grad(T.sum(_z), _wi), node_map)
            g_z_wr = RG(T.grad(T.sum(_z), _wr), node_map)

            error = h_errs[i] + bptt_error

            # Make sure g_h_z only relies on _xh!!!
            g_h_z = SRG(T.grad(T.sum(_xh), _xz), {_xh: h_list[i]})

            g_wi = (g_z_wi * (g_h_z * error))
            g_wr = (g_z_wr * (g_h_z * error))

            # TODO: Updating weights in each step is slow, move it out
            w_updates = OrderedDict(optimize_parameters([wi, wr], [g_wi, g_wr], shapes=[self.W_i, self.W_r],
                                                        method=self.optimization, lr=self.learning_rate, beta=self.beta))

            wi_out = w_updates[wi]
            wr_out = w_updates[wr]
            del w_updates[wr]
            del w_updates[wi]
            self._assistive_params.extend(w_updates.keys())

            bptt_error = T.dot(self.W_r, g_h_z * error)

            return [wi_out, wr_out, bptt_error], w_updates
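
A step with this signature is normally driven backwards over the sequence by theano.scan. The fragment below is a hypothetical driver, not code from the library; it only illustrates how the sequence index, the threaded weights and error, and the extra arguments line up with scan's argument order (sequences, then outputs_info, then non_sequences).

        # Hypothetical driver for the step above (an illustration, not original code).
        results, scan_updates = theano.scan(
            fn=bptt_step,
            sequences=T.arange(xs.shape[0] - 1, -1, -1),       # walk the sequence backwards
            outputs_info=[self.W_i, self.W_r, T.zeros_like(self.h0)],
            non_sequences=[xs, h_list, h_errs])
        wi_final, wr_final = results[0][-1], results[1][-1]     # weights after the last BPTT step
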