Example #1
 # Assumes module-level imports: import theano; import theano.tensor as T;
 # from theano.tensor import ivector, scalar
 def __build_theano__(self):
     x = ivector(name="x")  # sequence of input word indices
     y = ivector(name="y")  # sequence of target word indices
     U, V, W = self.U, self.V, self.W
     
     def forward_prop_step(x_t, s_t_prev, U, V, W):
         # x_t is a word index, so U[:, x_t] is a column lookup (one-hot input)
         s_t = T.tanh(U[:, x_t] + V.dot(s_t_prev))
         # softmax returns a 1 x word_dim matrix, hence o_t[0] below
         o_t = T.nnet.softmax(W.dot(s_t))
         return [o_t[0], s_t]
     
     [o, s], updates = theano.scan(forward_prop_step, sequences=x,
                                   outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
                                   non_sequences=[U, V, W], truncate_gradient=4, strict=True)
     
     prediction = T.argmax(o, axis=1)
     o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
     
     dU = T.grad(o_error, U)
     dV = T.grad(o_error, V)
     dW = T.grad(o_error, W)
     
     self.forward = theano.function([x], o)
     self.predict = theano.function([x], prediction)
     self.c_error = theano.function([x, y], o_error)
     self.bptt = theano.function([x, y], [dU, dV, dW])
     
     learning_rate = scalar(name="learning_rate")
     self.sgd_step = theano.function([x, y, learning_rate], [], 
                                     updates=[(self.U, self.U-learning_rate*dU),
                                              (self.V, self.V-learning_rate*dV),
                                              (self.W, self.W-learning_rate*dW)])
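This build method assumes that self.U, self.V, self.W already exist as Theano shared variables and that self.hidden_dim is set; the shapes follow from the expressions above (U[:, x_t], V.dot(s_t_prev), W.dot(s_t)). A minimal setup sketch with hypothetical sizes and initialisation, not part of the original class:

import numpy as np
import theano

word_dim, hidden_dim = 8000, 100  # hypothetical vocabulary and hidden-layer sizes

def uniform_shared(shape, name):
    # small-range uniform initialisation, returned as a Theano shared variable
    values = np.random.uniform(-0.01, 0.01, shape).astype(theano.config.floatX)
    return theano.shared(values, name=name)

# Shapes implied by the graph: U[:, x_t] makes U (hidden_dim, word_dim),
# V.dot(s_t_prev) makes V (hidden_dim, hidden_dim), W.dot(s_t) makes W (word_dim, hidden_dim).
U = uniform_shared((hidden_dim, word_dim), "U")
V = uniform_shared((hidden_dim, hidden_dim), "V")
W = uniform_shared((word_dim, hidden_dim), "W")

Once __build_theano__ has run, a single training update is then a call along the lines of self.sgd_step(x_indices, y_indices, 0.005), and self.predict(x_indices) returns the argmax word index at each time step.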
Example #2
    def test_grad_log1msigm(self):
        # At some point, this returned nan, because (1 - sigm(x)) was
        # on both the numerator and the denominator of a fraction,
        # but the two nodes in question had not been merged.
        x = tensor.matrix('x')
        lr = tensor.scalar('lr')

        s = sigmoid(x)
        l = T.log(1 - s)
        c = l.mean()
        ux = x - lr * theano.grad(c, x)

        # Before the optimization, inf and NaN will be produced in the graph,
        # and DebugMode will complain. Everything is fine afterwards.
        mode = self.get_mode()
        if not isinstance(mode, theano.compile.DebugMode):
            f = theano.function([x, lr], ux, mode=mode)
            ux_v = f([[50]], 0.1)
            assert not np.isnan(ux_v)
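The instability this test guards against is easy to see directly: for large x, 1 - sigmoid(x) underflows to zero, so log(1 - sigmoid(x)) is evaluated as log(0), while the algebraically equivalent form -softplus(x) = -log(1 + exp(x)) stays finite (the "log1msigm" in the test name presumably refers to this rewrite). A small NumPy sketch of the two formulations, for illustration only and not part of the test suite:

import numpy as np

x = 50.0
naive = np.log(1.0 - 1.0 / (1.0 + np.exp(-x)))  # 1 - sigmoid(50) underflows, so this is log(0) = -inf
stable = -np.logaddexp(0.0, x)                  # -softplus(x), the rewritten form; evaluates to -50.0
print(naive, stable)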
Example #3
 # Assumes module-level imports: import theano; import theano.tensor as T;
 # from theano.tensor import ivector, scalar
 def __build_theano__(self):
     x = ivector("x")  # sequence of input word indices
     y = ivector("y")  # sequence of target word indices
     hidden_dim = self.hidden_dim
     word_dim = self.word_dim
     
     Wxi, Whi, Wci = self.Wxi, self.Whi, self.Wci
     Wxf, Whf, Wcf = self.Wxf, self.Whf, self.Wcf
     Wxc, Whc = self.Wxc, self.Whc
     Wxo, Who, Wco, Wo = self.Wxo, self.Who, self.Wco, self.Wo
     
     def forward_prop(x_t, c_prev_t, h_prev_t,
                      Wxi, Whi, Wci, Wxf, Whf, Wcf, Wxc, Whc, Wxo, Who, Wco, Wo):
         # x_t is a word index, so the input-to-gate weights are indexed by
         # column (one-hot input), as in Example #1. Gates use the sigmoid so
         # they stay in (0, 1); the cell candidate uses tanh (standard
         # peephole-LSTM gating).
         input_gate = T.nnet.sigmoid(Wxi[:, x_t] + Whi.dot(h_prev_t) + Wci*c_prev_t)
         forget_gate = T.nnet.sigmoid(Wxf[:, x_t] + Whf.dot(h_prev_t) + Wcf*c_prev_t)
         
         a_c_t = Wxc[:, x_t] + Whc.dot(h_prev_t)
         c_t = input_gate * T.tanh(a_c_t) + forget_gate * c_prev_t
         
         output_gate = T.nnet.sigmoid(Wxo[:, x_t] + Who.dot(h_prev_t) + Wco*c_t)
         h_t = output_gate * T.tanh(c_t)
         # softmax gives a 1 x word_dim row, hence o_t[0] below; without it the
         # categorical cross-entropy and argmax further down would not apply
         o_t = T.nnet.softmax(Wo.dot(h_t))
         
         return [o_t[0], c_t, h_t]
     
     [o, c, h], updates = theano.scan(forward_prop, sequences = x, 
                             outputs_info = [None, dict(initial=T.zeros(hidden_dim)), dict(initial=T.zeros(hidden_dim))],
                             non_sequences = [Wxi, Whi, Wci, Wxf, Whf, Wcf, Wxc, Whc, Wxo, Who, Wco, Wo], 
                             strict = True)
     
     prediction = T.argmax(o, axis=1)
     c_error = T.sum(T.nnet.categorical_crossentropy(o, y))
     
     dWxi = T.grad(c_error, Wxi)
     dWhi = T.grad(c_error, Whi)
     dWci = T.grad(c_error, Wci)
     dWxf = T.grad(c_error, Wxf)
     dWhf = T.grad(c_error, Whf)
     dWcf = T.grad(c_error, Wcf)
     dWxc = T.grad(c_error, Wxc)
     dWhc = T.grad(c_error, Whc)
     dWxo = T.grad(c_error, Wxo)
     dWho = T.grad(c_error, Who)
     dWco = T.grad(c_error, Wco)
     dWo = T.grad(c_error, Wo)
     
     # store the compiled functions on the instance, as in Example #1
     self.forward = theano.function([x], o)
     self.predict = theano.function([x], prediction)
     
     learning_rate = scalar("learning_rate")
     
     # learning_rate must be an explicit input for the updates below to compile
     self.sgd_step = theano.function([x, y, learning_rate], [],
                                updates = [(self.Wxi, self.Wxi-learning_rate*dWxi),
                                           (self.Whi, self.Whi-learning_rate*dWhi),
                                           (self.Wci, self.Wci-learning_rate*dWci),
                                           (self.Wxf, self.Wxf-learning_rate*dWxf),
                                           (self.Whf, self.Whf-learning_rate*dWhf),
                                           (self.Wcf, self.Wcf-learning_rate*dWcf),
                                           (self.Wxo, self.Wxo-learning_rate*dWxo),
                                           (self.Who, self.Who-learning_rate*dWho),
                                           (self.Wco, self.Wco-learning_rate*dWco),
                                           (self.Wxc, self.Wxc-learning_rate*dWxc),
                                           (self.Whc, self.Whc-learning_rate*dWhc),
                                           (self.Wo, self.Wo-learning_rate*dWo)])
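As with Example #1, this method assumes the twelve weight matrices already live on self as shared variables. Their shapes follow from the graph: input-to-gate weights are indexed by word id, hidden-to-gate weights act on the hidden state, the peephole weights Wci, Wcf, Wco multiply the cell state elementwise, and Wo maps the hidden state back to the vocabulary. A setup sketch with hypothetical sizes (the attribute names mirror the ones used above; everything else is an assumption):

import numpy as np
import theano

word_dim, hidden_dim = 8000, 100  # hypothetical sizes

def uniform_shared(shape, name):
    # small-range uniform initialisation, returned as a Theano shared variable
    values = np.random.uniform(-0.01, 0.01, shape).astype(theano.config.floatX)
    return theano.shared(values, name=name)

# input-to-gate weights: (hidden_dim, word_dim), indexed by column with the word id
Wxi, Wxf, Wxc, Wxo = [uniform_shared((hidden_dim, word_dim), n) for n in ("Wxi", "Wxf", "Wxc", "Wxo")]
# hidden-to-gate weights: (hidden_dim, hidden_dim)
Whi, Whf, Whc, Who = [uniform_shared((hidden_dim, hidden_dim), n) for n in ("Whi", "Whf", "Whc", "Who")]
# peephole weights multiply the cell state elementwise: (hidden_dim,)
Wci, Wcf, Wco = [uniform_shared((hidden_dim,), n) for n in ("Wci", "Wcf", "Wco")]
# output projection back to the vocabulary: (word_dim, hidden_dim)
Wo = uniform_shared((word_dim, hidden_dim), "Wo")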