def target_step(x_t, h_tm1, h2_tm1, c, c2):
    #print 'z_t and r_t are combined!'
    all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                           T.dot(c, self.Wgc) + self.bg)
    z_t = myutil.slice(all_t, 0, nh)
    r_t = myutil.slice(all_t, 1, nh)
    # candidate h_t
    ch_t = myutil.activation(activation,
                             T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                             T.dot(c, self.Whc) + self.bh)
    h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
    # second layer
    all2_t = T.nnet.sigmoid(T.dot(h_t, self.Wg2h) + T.dot(h2_tm1, self.Ug2h2) +
                            T.dot(c2, self.Wg2c2) + self.bg2)
    z2_t = myutil.slice(all2_t, 0, nh2)
    r2_t = myutil.slice(all2_t, 1, nh2)
    ch2_t = myutil.activation(activation,
                              T.dot(h_t, self.Wh2h) + T.dot(r2_t * h2_tm1, self.Uh2h2) +
                              T.dot(c2, self.Wh2c2) + self.bh2)
    h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
    return [h_t, h2_t]
def source_step(x_t, h_tm1, h2_tm1):
    #print 'z_t and r_t are combined!'
    all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx_src) +
                           T.dot(h_tm1, self.Ugh_src) + self.bg_src)
    z_t = myutil.slice(all_t, 0, nh)
    r_t = myutil.slice(all_t, 1, nh)
    # candidate h_t
    ch_t = myutil.activation(activation,
                             T.dot(x_t, self.Whx_src) +
                             T.dot(r_t * h_tm1, self.Uhh_src) + self.bh_src)
    h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
    # second layer
    all2_t = T.nnet.sigmoid(T.dot(h_t, self.Wg2h_src) +
                            T.dot(h2_tm1, self.Ug2h2_src) + self.bg2_src)
    z2_t = myutil.slice(all2_t, 0, nh2)
    r2_t = myutil.slice(all2_t, 1, nh2)
    ch2_t = myutil.activation(activation,
                              T.dot(h_t, self.Wh2h_src) +
                              T.dot(r2_t * h2_tm1, self.Uh2h2_src) + self.bh2_src)
    h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
    return [h_t, h2_t]
def greedy_search_step(h_tm1, h2_tm1, y_tm1, h_src, h2_src):
    x_t = self.emb[y_tm1]
    c = h_src[-1]
    c2 = h2_src[-1]
    #print 'z_t and r_t are combined!'
    all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                           T.dot(c, self.Wgc) + self.bg)
    z_t = myutil.slice(all_t, 0, nh)
    r_t = myutil.slice(all_t, 1, nh)
    # candidate h_t
    ch_t = myutil.activation(activation,
                             T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                             T.dot(c, self.Whc) + self.bh)
    h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
    # second layer
    all2_t = T.nnet.sigmoid(T.dot(h_t, self.Wg2h) + T.dot(h2_tm1, self.Ug2h2) +
                            T.dot(c2, self.Wg2c2) + self.bg2)
    z2_t = myutil.slice(all2_t, 0, nh2)
    r2_t = myutil.slice(all2_t, 1, nh2)
    ch2_t = myutil.activation(activation,
                              T.dot(h_t, self.Wh2h) + T.dot(r2_t * h2_tm1, self.Uh2h2) +
                              T.dot(c2, self.Wh2c2) + self.bh2)
    h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
    # score (log-softmax with max-subtraction for numerical stability)
    s = T.dot(h2_t, self.Wyh2) + T.dot(x_t, self.Wyy) + T.dot(c2, self.Wyc2) + self.by
    max_s, y_t = T.max_and_argmax(s)
    exp_s = T.exp(s - max_s)
    log_p_y = T.log(exp_s / exp_s.sum())
    return [h_t, h2_t, y_t, log_p_y], theano.scan_module.until(T.eq(y_t, 1))  # 1 --> '</s>'
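# Hedged sketch (not in the original source): greedy_search_step is written to
# be driven by theano.scan, with the until() condition above stopping the loop
# once y_t == 1 ('</s>'). The initial states h0 and h2_0 and the encoder
# outputs h_src and h2_src are assumed to be defined elsewhere in the class;
# n_steps=100 is an arbitrary length cap chosen for illustration only.
[h_greedy, h2_greedy, y_greedy, log_p_y_greedy], _ = theano.scan(
    fn=greedy_search_step,
    outputs_info=[h0, h2_0, T.constant(1, dtype='int64'), None],
    non_sequences=[h_src, h2_src],
    n_steps=100)  # upper bound; scan exits early via until()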
def __init__(self, hyper_param, word2idx_dic):
    '''
    nh :: dimension of the hidden layer
    ne :: number of word embeddings in the vocabulary
    de :: dimension of the word embeddings
    nf :: number of features
    nfe :: number of feature embeddings in the vocabulary - by leeck
    dfe :: dimension of the feature embeddings - by leeck
    cs :: word window context size
    emb_file :: word embedding file
    weight_decay :: weight decay
    dropout_rate :: dropout rate
    activation :: activation function: sigm, tanh, relu
    word2idx_dic :: word to index dictionary
    '''
    self.hyper_param = hyper_param
    nh = hyper_param['nhidden']
    ne = hyper_param['vocsize']
    de = hyper_param['emb_dimension']
    weight_decay = hyper_param['weight_decay']
    dropout_rate = hyper_param['dropout_rate']
    activation = hyper_param['activation']
    learning_method = hyper_param['learning_method']
    verbose = False
    # parameters of the model
    if hyper_param['load_model'] != '':
        self.load_param(hyper_param, hyper_param['load_model'])
    else:
        self.build_param(hyper_param, word2idx_dic)
    # parameters
    self.params = [self.emb, self.Wgx_src, self.Wgx, self.Wgc,
                   self.Whx_src, self.Whx, self.Whc, self.Wh0c,
                   self.Ugh_src, self.Ugh, self.Uhh_src, self.Uhh,
                   self.Wyh, self.Wyc, self.Wyy,
                   self.bg_src, self.bg, self.bh_src, self.bh, self.bh0,
                   self.by, self.h0_src]
    if hyper_param['fixed_emb']:
        print 'fixed embedding.'
        self.params.remove(self.emb)
    # as many columns as context window size
    # as many lines as words in the sentence
    x_sentence = T.ivector('x_sentence')  # x_sentence : n_steps
    x_org = self.emb[x_sentence].reshape((x_sentence.shape[0], de))
    x = x_org[:-1]  # remove '</s>'
    x_reverse = x[::-1]  # reverse for backward
    y_sentence = T.ivector('y_sentence')  # labels
    y_input_sentence = T.concatenate([y_sentence[-1:], y_sentence[:-1]],
                                     axis=0)  # move '</s>' to first position
    y = self.emb[y_input_sentence].reshape((y_input_sentence.shape[0], de))

    # for scan
    def source_step(x_t, h_tm1):
        # z_t and r_t are computed with one combined matrix product
        all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx_src) +
                               T.dot(h_tm1, self.Ugh_src) + self.bg_src)
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(activation,
                                 T.dot(x_t, self.Whx_src) +
                                 T.dot(r_t * h_tm1, self.Uhh_src) + self.bh_src)
        h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
        return h_t

    def target_step(x_t, h_tm1, c):
        # z_t and r_t are computed with one combined matrix product
        all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                               T.dot(c, self.Wgc) + self.bg)
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(activation,
                                 T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                                 T.dot(c, self.Whc) + self.bh)
        h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
        return h_t

    # make score, h_src, h0 (for beam search)
    def make_score(x, y, use_noise):
        # input layer dropout: ex. [0.2, 0.2, 0.5]
        if use_noise:
            print "X's projection layer dropout:", dropout_rate[0]
            dropout_x = myutil.dropout_from_layer(x, dropout_rate[0])
        else:
            dropout_x = x * (1.0 - dropout_rate[0])
        # recurrent for source language
        h_src, _ = theano.scan(fn=source_step,
                               sequences=dropout_x,
                               outputs_info=self.h0_src,
                               n_steps=dropout_x.shape[0])
        # context
        c = h_src[-1]
        h0 = myutil.activation(activation, T.dot(c, self.Wh0c) + self.bh0)
        # output layer dropout: ex. [0.2, 0.2, 0.5]
        if use_noise:
            print "Y's projection layer dropout:", dropout_rate[1]
            dropout_y = myutil.dropout_from_layer(y, dropout_rate[1])
        else:
            dropout_y = y * (1.0 - dropout_rate[1])
        # forward recurrent for target language
        h, _ = theano.scan(fn=target_step,
                           sequences=dropout_y,
                           outputs_info=h0,
                           non_sequences=[c],
                           n_steps=dropout_y.shape[0])
        # hidden layer dropout
        if use_noise:
            print "Y's hidden layer dropout:", dropout_rate[2]
            dropout_h = myutil.dropout_from_layer(h, dropout_rate[2])
        else:
            dropout_h = h * (1.0 - dropout_rate[2])
        # score
        score = T.dot(dropout_h, self.Wyh) + T.dot(dropout_y, self.Wyy) + \
                T.dot(c, self.Wyc) + self.by
        return score, h_src, h0

    # dropout version (for training)
    if 'reverse_input' in hyper_param and hyper_param['reverse_input']:
        print 'reverse input.'
        dropout_score, _, _ = make_score(x_reverse, y, True)
    else:
        dropout_score, _, _ = make_score(x, y, True)
    dropout_p_y_given_x = myutil.mysoftmax(dropout_score)
    # scaled version (for prediction)
    if 'reverse_input' in hyper_param and hyper_param['reverse_input']:
        print 'reverse input.'
        score, h_src, h0 = make_score(x_reverse, y, False)
    else:
        score, h_src, h0 = make_score(x, y, False)
    p_y_given_x = myutil.mysoftmax(score)
    # prediction
    y_pred = T.argmax(p_y_given_x, axis=1)
    test_nll = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y_sentence])
    # beam search decoding: input=[c, h_tm1, y_tm1], output=[h_t, log_p_y_t]
    input_h_src = T.fmatrix('input_h_src')
    input_h_tm1 = T.fvector('input_h_tm1')
    input_y_tm1 = T.iscalar('input_y_tm1')  # input_y_tm1 == x_t
    x_t = self.emb[input_y_tm1]
    c = input_h_src[-1]
    all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx) + T.dot(input_h_tm1, self.Ugh) +
                           T.dot(c, self.Wgc) + self.bg)
    z_t = myutil.slice(all_t, 0, nh)
    r_t = myutil.slice(all_t, 1, nh)
    # candidate h_t
    ch_t = myutil.activation(activation,
                             T.dot(x_t, self.Whx) + T.dot(r_t * input_h_tm1, self.Uhh) +
                             T.dot(c, self.Whc) + self.bh)
    h_t = (1.0 - z_t) * input_h_tm1 + z_t * ch_t
    # prediction (log-softmax with max-subtraction for numerical stability)
    score_y_t = T.dot(h_t, self.Wyh) + T.dot(x_t, self.Wyy) + \
                T.dot(c, self.Wyc) + self.by
    max_s = T.max(score_y_t)
    exp_s = T.exp(score_y_t - max_s)
    log_p_y_t = T.log(exp_s / exp_s.sum())
    # cost and gradients and learning rate
    lr = T.scalar('lr')  # for SGD
    # NLL + L2-norm
    nll = -T.mean(T.log(dropout_p_y_given_x)[T.arange(y.shape[0]), y_sentence])
    cost = nll
    for param in self.params:
        if param.name == 'emb':
            continue
        cost += weight_decay * T.sum(param ** 2)
    # SGD
    sgd_updates = myutil.sgd_updates(self.params, cost, lr)
    # SGD + momentum (0.9)
    momentum_updates = myutil.sgd_updates_momentum(self.params, cost, lr, 0.9)
    # RMSProp (rho = 0.9)
    rmsprop_updates = myutil.sgd_updates_rmsprop(self.params, cost, lr, 0.9, 1)
    # AdaDelta (lr --> rho = 0.95)
    adadelta_updates = myutil.sgd_updates_adadelta(self.params, cost, lr, 1e-6, 9)
    # theano functions to compile
    self.classify = theano.function(inputs=[x_sentence, y_sentence],
                                    outputs=[y_pred, test_nll])
    # for beam search
    self.encoding_src_lang = theano.function(inputs=[x_sentence],
                                             outputs=[h_src, h0])
    self.search_next_word = theano.function(
        inputs=[input_h_src, input_h_tm1, input_y_tm1],
        outputs=[log_p_y_t, h_t])
    # for reranking
    self.get_nll = theano.function(
        inputs=[x_sentence, input_h_src, input_h_tm1, y_sentence],
        outputs=test_nll,
        on_unused_input='ignore')
    # SGD
    self.train_sgd = theano.function(inputs=[x_sentence, y_sentence, lr],
                                     outputs=[cost, nll],
                                     updates=sgd_updates)
    # SGD with momentum
    self.train_momentum = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=momentum_updates)
    # RMSProp
    self.train_rmsprop = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=rmsprop_updates)
    # AdaDelta
    self.train_adadelta = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=adadelta_updates)
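# Hedged usage sketch (illustrative, not part of the original class): one SGD
# epoch over (source, target) index-vector pairs using the compiled functions
# above. `model`, `train_data`, and the learning rate `clr` are hypothetical
# names introduced only for this example.
for x_idx, y_idx in train_data:
    cost, nll = model.train_sgd(x_idx, y_idx, clr)
pred, test_nll = model.classify(x_idx, y_idx)
print 'last sentence: cost=%.4f nll=%.4f test_nll=%.4f' % (cost, nll, test_nll)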
def __init__(self, hyper_param, word2idx_dic):
    '''
    nh :: dimension of the hidden layer
    ne :: number of word embeddings in the vocabulary
    de :: dimension of the word embeddings
    nf :: number of features
    nfe :: number of feature embeddings in the vocabulary - by leeck
    dfe :: dimension of the feature embeddings - by leeck
    cs :: word window context size
    emb_file :: word embedding file
    weight_decay :: weight decay
    dropout_rate :: dropout rate
    activation :: activation function: sigm, tanh, relu
    word2idx_dic :: word to index dictionary
    '''
    self.hyper_param = hyper_param
    nh = hyper_param['nhidden']
    ne = hyper_param['vocsize']
    de = hyper_param['emb_dimension']
    weight_decay = hyper_param['weight_decay']
    dropout_rate = hyper_param['dropout_rate']
    activation = hyper_param['activation']
    learning_method = hyper_param['learning_method']
    # parameters of the model
    if hyper_param['load_model'] != '':
        self.load_param(hyper_param, hyper_param['load_model'])
    else:
        self.build_param(hyper_param, word2idx_dic)
    # parameters
    self.params = [self.emb, self.Wgx_src, self.Wgxb_src, self.Wgx, self.Wgc,
                   self.Whx_src, self.Whxb_src, self.Whx, self.Whc, self.Wh0c,
                   self.Ugh_src, self.Ughb_src, self.Ugh,
                   self.Uhh_src, self.Uhhb_src, self.Uhh,
                   self.bg_src, self.bgb_src, self.bg,
                   self.bh_src, self.bhb_src, self.bh, self.bh2, self.bh0,
                   self.Wah, self.Wax, self.Ua, self.ba, self.va,
                   self.Wh2h, self.Wyh2, self.Wyh, self.Wyc, self.Wyy, self.by,
                   self.h0_src, self.h0b_src]
    if hyper_param['fixed_emb']:
        print 'fixed embedding.'
        self.params.remove(self.emb)
    # as many lines as words in the sentence
    x_sentence = T.ivector('x_sentence')  # x_sentence : n_steps
    x = self.emb[x_sentence].reshape((x_sentence.shape[0], de))  # don't remove '</s>'
    x_reverse = x[::-1]  # reverse for backward
    y_sentence = T.ivector('y_sentence')  # labels
    y_input_sentence = T.concatenate([y_sentence[-1:], y_sentence[:-1]],
                                     axis=0)  # move '</s>' to first position
    y = self.emb[y_input_sentence].reshape((y_input_sentence.shape[0], de))

    # for scan: the input projections (dot_x_t_*) are precomputed outside scan
    def source_step(dot_x_t_Wgx_src, dot_x_t_Whx_src, h_tm1):
        # z_t and r_t are computed with one combined matrix product
        all_t = T.nnet.sigmoid(dot_x_t_Wgx_src + T.dot(h_tm1, self.Ugh_src))
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(activation,
                                 dot_x_t_Whx_src + T.dot(r_t * h_tm1, self.Uhh_src))
        h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
        return h_t

    def source_backward_step(dot_x_t_Wgxb_src, dot_x_t_Whxb_src, h_tm1):
        # z_t and r_t are computed with one combined matrix product
        all_t = T.nnet.sigmoid(dot_x_t_Wgxb_src + T.dot(h_tm1, self.Ughb_src))
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(activation,
                                 dot_x_t_Whxb_src + T.dot(r_t * h_tm1, self.Uhhb_src))
        h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
        return h_t

    def target_step(dot_x_t_Wgx, dot_x_t_Whx, dot_x_t_Wax,
                    h_tm1, h_src, dot_h_src_Ua):
        # search c_t (attention over the source hidden states)
        z = T.tanh(T.dot(h_tm1, self.Wah) + dot_x_t_Wax + dot_h_src_Ua)
        e = T.dot(self.va, z.T)
        max_e = T.max(e)
        exp_e = T.exp(e - max_e)
        a = exp_e / exp_e.sum()
        c_t = T.dot(a, h_src)
        # z_t and r_t are computed with one combined matrix product
        all_t = T.nnet.sigmoid(dot_x_t_Wgx + T.dot(h_tm1, self.Ugh) +
                               T.dot(c_t, self.Wgc))
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(activation,
                                 dot_x_t_Whx + T.dot(r_t * h_tm1, self.Uhh) +
                                 T.dot(c_t, self.Whc))
        h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
        return [h_t, c_t]

    # make score, h_src, h0 (for beam search)
    def make_score(x, y, use_noise):
        # input layer dropout: ex. [0.2, 0.2, 0.5]
        if use_noise:
            print "X's projection layer dropout:", dropout_rate[0]
            dropout_x = myutil.dropout_from_layer(x, dropout_rate[0])
        else:
            dropout_x = x * (1.0 - dropout_rate[0])
        dropout_x_reverse = dropout_x[::-1]  # reverse for backward
        # RNN encoder: precompute the input projections once, outside of scan
        dot_x_Wgx_src = T.dot(dropout_x, self.Wgx_src) + self.bg_src
        dot_x_Whx_src = T.dot(dropout_x, self.Whx_src) + self.bh_src
        dot_x_rev_Wgx_src = T.dot(dropout_x_reverse, self.Wgxb_src) + self.bgb_src
        dot_x_rev_Whx_src = T.dot(dropout_x_reverse, self.Whxb_src) + self.bhb_src
        # forward recurrent for source language
        hf_src, _ = theano.scan(fn=source_step,
                                sequences=[dot_x_Wgx_src, dot_x_Whx_src],
                                outputs_info=self.h0_src,
                                n_steps=dropout_x.shape[0])
        # backward recurrent for source language
        hb_src_reverse, _ = theano.scan(
            fn=source_backward_step,
            sequences=[dot_x_rev_Wgx_src, dot_x_rev_Whx_src],
            outputs_info=self.h0b_src,
            n_steps=dropout_x_reverse.shape[0])
        hb_src = hb_src_reverse[::-1]
        h_src = T.concatenate([hf_src, hb_src], axis=1)
        # global context
        #c_global = h_src[0]
        c_global = T.concatenate([hf_src[-1], hb_src[0]], axis=0)
        # output layer (target language input layer) dropout: ex. [0.2, 0.2, 0.5]
        if use_noise:
            print "Y's projection layer dropout:", dropout_rate[1]
            dropout_y = myutil.dropout_from_layer(y, dropout_rate[1])
        else:
            dropout_y = y * (1.0 - dropout_rate[1])
        # RNN decoder: precompute the input projections once, outside of scan
        dot_y_Wgx = T.dot(dropout_y, self.Wgx) + self.bg
        dot_y_Whx = T.dot(dropout_y, self.Whx) + self.bh
        dot_y_Wax = T.dot(dropout_y, self.Wax)
        dot_h_src_Ua = T.dot(h_src, self.Ua) + self.ba
        h0 = myutil.activation(activation, T.dot(c_global, self.Wh0c) + self.bh0)
        # forward recurrent for target language
        [h, c], _ = theano.scan(fn=target_step,
                                sequences=[dot_y_Wgx, dot_y_Whx, dot_y_Wax],
                                outputs_info=[h0, None],
                                non_sequences=[h_src, dot_h_src_Ua],
                                n_steps=dropout_y.shape[0])
        # hidden layer dropout
        if use_noise:
            print "Y's hidden layer dropout:", dropout_rate[2]
            dropout_h = myutil.dropout_from_layer(h, dropout_rate[2])
        else:
            dropout_h = h * (1.0 - dropout_rate[2])
        # h2 - Deep Output RNN
        print 'Deep Output RNN: ReLU'
        h2 = myutil.activation('relu', T.dot(dropout_h, self.Wh2h) + self.bh2)
        # score
        score = T.dot(h2, self.Wyh2) + T.dot(dropout_h, self.Wyh) + \
                T.dot(dropout_y, self.Wyy) + T.dot(c, self.Wyc) + self.by
        return score, h_src, h0

    # dropout version (for training)
    dropout_score, _, _ = make_score(x, y, True)
    dropout_p_y_given_x = myutil.mysoftmax(dropout_score)
    # scaled version (for prediction)
    score, h_src, h0 = make_score(x, y, False)
    p_y_given_x = myutil.mysoftmax(score)
    # prediction
    y_pred = T.argmax(p_y_given_x, axis=1)
    test_nll = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y_sentence])
    # beam search decoding: input=[h_src, h_tm1, y_tm1], output=[h_t, log_p_y_t, alignment]
    input_h_src = T.fmatrix('input_h_src')
    input_h_tm1 = T.fvector('input_h_tm1')
    input_y_tm1 = T.iscalar('input_y_tm1')  # input_y_tm1 == x_t
    x_t = self.emb[input_y_tm1]
    # search c_t (attention over the source hidden states)
    z = T.tanh(T.dot(input_h_tm1, self.Wah) + T.dot(x_t, self.Wax) +
               T.dot(input_h_src, self.Ua) + self.ba)
    e = T.dot(self.va, z.T)
    max_e = T.max(e)
    exp_e = T.exp(e - max_e)
    alignment = exp_e / exp_e.sum()
    c_t = T.dot(alignment, input_h_src)
    all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx) + T.dot(input_h_tm1, self.Ugh) +
                           T.dot(c_t, self.Wgc) + self.bg)
    z_t = myutil.slice(all_t, 0, nh)
    r_t = myutil.slice(all_t, 1, nh)
    # candidate h_t
    ch_t = myutil.activation(activation,
                             T.dot(x_t, self.Whx) + T.dot(r_t * input_h_tm1, self.Uhh) +
                             T.dot(c_t, self.Whc) + self.bh)
    h_t = (1.0 - z_t) * input_h_tm1 + z_t * ch_t
    # h2 - Deep Output RNN
    h2_t = myutil.activation('relu', T.dot(h_t, self.Wh2h) + self.bh2)
    # prediction (log-softmax with max-subtraction for numerical stability)
    score_y_t = T.dot(h2_t, self.Wyh2) + T.dot(h_t, self.Wyh) + T.dot(x_t, self.Wyy) + \
                T.dot(c_t, self.Wyc) + self.by
    max_s = T.max(score_y_t)
    exp_s = T.exp(score_y_t - max_s)
    log_p_y_t = T.log(exp_s / exp_s.sum())
    # cost and gradients and learning rate
    lr = T.scalar('lr')  # for SGD
    # NLL + L2-norm
    nll = -T.mean(T.log(dropout_p_y_given_x)[T.arange(y.shape[0]), y_sentence])
    cost = nll
    for param in self.params:
        if param.name == 'emb':
            continue
        cost += weight_decay * T.sum(param ** 2)
    # SGD
    sgd_updates = myutil.sgd_updates(self.params, cost, lr)
    # SGD + momentum (0.9)
    momentum_updates = myutil.sgd_updates_momentum(self.params, cost, lr, 0.9)
    # RMSProp (rho = 0.9)
    rmsprop_updates = myutil.sgd_updates_rmsprop(self.params, cost, lr, 0.9, 1)
    # AdaDelta (lr --> rho = 0.95)
    adadelta_updates = myutil.sgd_updates_adadelta(self.params, cost, lr, 1e-6, 9)
    # theano functions to compile
    self.classify = theano.function(inputs=[x_sentence, y_sentence],
                                    outputs=[y_pred, test_nll])
    # for beam search
    self.encoding_src_lang = theano.function(inputs=[x_sentence],
                                             outputs=[h_src, h0])
    self.search_next_word = theano.function(
        inputs=[input_h_src, input_h_tm1, input_y_tm1],
        outputs=[log_p_y_t, h_t, alignment])
    # for reranking
    self.get_nll = theano.function(
        inputs=[x_sentence, input_h_src, input_h_tm1, y_sentence],
        outputs=test_nll,
        on_unused_input='ignore')
    # SGD
    self.train_sgd = theano.function(inputs=[x_sentence, y_sentence, lr],
                                     outputs=[cost, nll],
                                     updates=sgd_updates)
    # SGD with momentum
    self.train_momentum = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=momentum_updates)
    # RMSProp
    self.train_rmsprop = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=rmsprop_updates)
    # AdaDelta
    self.train_adadelta = theano.function(
        inputs=[x_sentence, y_sentence, lr],
        outputs=[cost, nll],
        updates=adadelta_updates)
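# Hedged decoding sketch (illustrative, not the original decoder): greedy
# 1-best search built on the compiled encoding_src_lang/search_next_word
# functions above. `model`, `x_idx`, and `max_len` are hypothetical names;
# index 1 is '</s>', consistent with the comments above.
h_src_val, h_t = model.encoding_src_lang(x_idx)
y_t, output = 1, []  # start from '</s>'
for _ in xrange(max_len):
    log_p_y, h_t, alignment = model.search_next_word(h_src_val, h_t, y_t)
    y_t = int(log_p_y.argmax())  # greedy: pick the best-scoring next word
    if y_t == 1:  # stop when '</s>' is generated again
        break
    output.append(y_t)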