# assumes module-level: import theano; import theano.tensor as T
def __build_theano__(self):
    # Symbolic inputs: x is a sequence of word indices, y the target indices.
    x = T.ivector(name="x")
    y = T.ivector(name="y")
    U, V, W = self.U, self.V, self.W

    def forward_prop_step(x_t, s_t_prev, U, V, W):
        # U[:, x_t] selects the embedding column for word x_t,
        # equivalent to multiplying U by a one-hot input vector.
        s_t = T.tanh(U[:, x_t] + V.dot(s_t_prev))
        o_t = T.nnet.softmax(W.dot(s_t))
        return [o_t[0], s_t]

    [o, s], updates = theano.scan(
        forward_prop_step,
        sequences=x,
        outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
        non_sequences=[U, V, W],
        truncate_gradient=4,
        strict=True)

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Gradients via backpropagation through time.
    dU = T.grad(o_error, U)
    dV = T.grad(o_error, V)
    dW = T.grad(o_error, W)

    # Compiled functions.
    self.forward = theano.function([x], o)
    self.predict = theano.function([x], prediction)
    self.c_error = theano.function([x, y], o_error)
    self.bptt = theano.function([x, y], [dU, dV, dW])

    learning_rate = T.scalar(name="learning_rate")
    self.sgd_step = theano.function(
        [x, y, learning_rate], [],
        updates=[(self.U, self.U - learning_rate * dU),
                 (self.V, self.V - learning_rate * dV),
                 (self.W, self.W - learning_rate * dW)])
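A minimal sketch of how the compiled functions might be driven. The class name `RNNTheano`, its constructor signature, and the toy index sequences are assumptions for illustration, not part of the code above; only `c_error`, `sgd_step`, and `predict` come from the snippet.

# Hypothetical usage sketch: class name, constructor arguments, and data
# are illustrative assumptions, not taken from the snippet above.
model = RNNTheano(word_dim=8000, hidden_dim=100)  # assumed constructor

x_example = [0, 179, 341, 416]   # word indices of one sentence
y_example = [179, 341, 416, 1]   # next-word targets, shifted by one

loss_before = model.c_error(x_example, y_example)
model.sgd_step(x_example, y_example, 0.005)       # one SGD update
loss_after = model.c_error(x_example, y_example)
print(loss_before, loss_after, model.predict(x_example))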
# assumes module-level: import numpy as np; import theano;
# from theano import tensor; from theano.tensor.nnet import sigmoid
def test_grad_log1msigm(self):
    # At some point, this returned nan, because (1 - sigm(x)) was
    # on both the numerator and the denominator of a fraction,
    # but the two nodes in question had not been merged.
    x = tensor.matrix('x')
    lr = tensor.scalar('lr')
    s = sigmoid(x)
    l = tensor.log(1 - s)
    c = l.mean()
    ux = x - lr * theano.grad(c, x)
    # Before the optimization, inf and NaN will be produced in the graph,
    # and DebugMode will complain. Everything is fine afterwards.
    mode = self.get_mode()
    if not isinstance(mode, theano.compile.DebugMode):
        f = theano.function([x, lr], ux, mode=mode)
        ux_v = f([[50]], 0.1)
        assert not np.isnan(ux_v)
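Why x = 50 is the interesting probe value: in float64, sigmoid(50) rounds to exactly 1.0, so the naive graph hits log(0). A standalone NumPy sketch (illustration only, not from the test suite) of the identity log(1 - sigmoid(x)) = -softplus(x), which is the kind of rewrite the test exercises:

import numpy as np

def log1msigm_naive(x):
    # Naive form: log(1 - sigmoid(x)); 1 - sigmoid(50) underflows to 0.0,
    # so this yields -inf (and NaN once it meets the gradient's fraction).
    return np.log(1.0 - 1.0 / (1.0 + np.exp(-x)))

def log1msigm_stable(x):
    # Stable identity: log(1 - sigmoid(x)) = -log(1 + exp(x)) = -softplus(x).
    return -np.logaddexp(0.0, x)

print(log1msigm_naive(50.0))   # -inf
print(log1msigm_stable(50.0))  # -50.0000...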
# assumes module-level: import theano; import theano.tensor as T
def __build_theano__(self):
    x = T.ivector("x")
    y = T.ivector("y")
    hidden_dim = self.hidden_dim
    word_dim = self.word_dim
    Wxi, Whi, Wci = self.Wxi, self.Whi, self.Wci
    Wxf, Whf, Wcf = self.Wxf, self.Whf, self.Wcf
    Wxc, Whc = self.Wxc, self.Whc
    Wxo, Who, Wco, Wo = self.Wxo, self.Who, self.Wco, self.Wo

    def forward_prop(x_t, c_prev_t, h_prev_t,
                     Wxi, Whi, Wci, Wxf, Whf, Wcf,
                     Wxc, Whc, Wxo, Who, Wco, Wo):
        # Wx*[:, x_t] selects the column for word index x_t, equivalent to
        # multiplying by a one-hot input vector (x_t is an integer, so
        # Wxi.dot(x_t) would only scale the matrix).
        # Gates use the sigmoid nonlinearity and the cell candidate uses
        # tanh, per the standard LSTM formulation; Wci/Wcf/Wco are
        # elementwise peephole connections to the cell state.
        input_gate = T.nnet.sigmoid(Wxi[:, x_t] + Whi.dot(h_prev_t) + Wci * c_prev_t)
        forget_gate = T.nnet.sigmoid(Wxf[:, x_t] + Whf.dot(h_prev_t) + Wcf * c_prev_t)
        a_c_t = Wxc[:, x_t] + Whc.dot(h_prev_t)
        c_t = input_gate * T.tanh(a_c_t) + forget_gate * c_prev_t
        output_gate = T.nnet.sigmoid(Wxo[:, x_t] + Who.dot(h_prev_t) + Wco * c_t)
        h_t = output_gate * T.tanh(c_t)
        # Softmax over the vocabulary so the cross-entropy below is well defined.
        o_t = T.nnet.softmax(Wo.dot(h_t))
        return [o_t[0], c_t, h_t]

    [o, c, h], updates = theano.scan(
        forward_prop,
        sequences=x,
        outputs_info=[None,
                      dict(initial=T.zeros(hidden_dim)),
                      dict(initial=T.zeros(hidden_dim))],
        non_sequences=[Wxi, Whi, Wci, Wxf, Whf, Wcf,
                       Wxc, Whc, Wxo, Who, Wco, Wo],
        strict=True)

    prediction = T.argmax(o, axis=1)
    c_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # One gradient per weight matrix.
    dWxi = T.grad(c_error, Wxi)
    dWhi = T.grad(c_error, Whi)
    dWci = T.grad(c_error, Wci)
    dWxf = T.grad(c_error, Wxf)
    dWhf = T.grad(c_error, Whf)
    dWcf = T.grad(c_error, Wcf)
    dWxc = T.grad(c_error, Wxc)
    dWhc = T.grad(c_error, Whc)
    dWxo = T.grad(c_error, Wxo)
    dWho = T.grad(c_error, Who)
    dWco = T.grad(c_error, Wco)
    dWo = T.grad(c_error, Wo)

    # Stored on self so they survive this method, matching the RNN version.
    self.forward = theano.function([x], o)
    self.predict = theano.function([x], prediction)

    # learning_rate must be an explicit input of sgd_step so it can vary per call.
    learning_rate = T.scalar("learning_rate")
    self.sgd_step = theano.function(
        [x, y, learning_rate], [],
        updates=[(self.Wxi, self.Wxi - learning_rate * dWxi),
                 (self.Whi, self.Whi - learning_rate * dWhi),
                 (self.Wci, self.Wci - learning_rate * dWci),
                 (self.Wxf, self.Wxf - learning_rate * dWxf),
                 (self.Whf, self.Whf - learning_rate * dWhf),
                 (self.Wcf, self.Wcf - learning_rate * dWcf),
                 (self.Wxo, self.Wxo - learning_rate * dWxo),
                 (self.Who, self.Who - learning_rate * dWho),
                 (self.Wco, self.Wco - learning_rate * dWco),
                 (self.Wxc, self.Wxc - learning_rate * dWxc),
                 (self.Whc, self.Whc - learning_rate * dWhc),
                 (self.Wo, self.Wo - learning_rate * dWo)])
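For reference, the step function above follows the standard peephole LSTM formulation (Graves-style), written here in the snippet's own weight names; the code's column indexing Wx*[:, x_t] corresponds to multiplying by a one-hot x_t:

\begin{aligned}
i_t &= \sigma\!\left(W_{xi}\,x_t + W_{hi}\,h_{t-1} + W_{ci} \odot c_{t-1}\right) \\
f_t &= \sigma\!\left(W_{xf}\,x_t + W_{hf}\,h_{t-1} + W_{cf} \odot c_{t-1}\right) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tanh\!\left(W_{xc}\,x_t + W_{hc}\,h_{t-1}\right) \\
o_t &= \sigma\!\left(W_{xo}\,x_t + W_{ho}\,h_{t-1} + W_{co} \odot c_t\right) \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}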