        # Two-layer (stacked) GRU variant: scan steps and greedy decoding.
        def target_step(x_t, h_tm1, h2_tm1, c, c2):
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                T.dot(c, self.Wgc) + self.bg)
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation,
                T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                T.dot(c, self.Whc) + self.bh)
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            # second layer
            all2_t = T.nnet.sigmoid(
                T.dot(h_t, self.Wg2h) + T.dot(h2_tm1, self.Ug2h2) +
                T.dot(c2, self.Wg2c2) + self.bg2)
            z2_t = myutil.slice(all2_t, 0, nh2)
            r2_t = myutil.slice(all2_t, 1, nh2)
            ch2_t = myutil.activation(
                activation,
                T.dot(h_t, self.Wh2h) + T.dot(r2_t * h2_tm1, self.Uh2h2) +
                T.dot(c2, self.Wh2c2) + self.bh2)
            h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
            return [h_t, h2_t]

        def source_step(x_t, h_tm1, h2_tm1):
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wgx_src) + T.dot(h_tm1, self.Ugh_src) +
                self.bg_src)
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation,
                T.dot(x_t, self.Whx_src) + T.dot(r_t * h_tm1, self.Uhh_src) +
                self.bh_src)
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            # second layer
            all2_t = T.nnet.sigmoid(
                T.dot(h_t, self.Wg2h_src) + T.dot(h2_tm1, self.Ug2h2_src) +
                self.bg2_src)
            z2_t = myutil.slice(all2_t, 0, nh2)
            r2_t = myutil.slice(all2_t, 1, nh2)
            ch2_t = myutil.activation(
                activation,
                T.dot(h_t, self.Wh2h_src) +
                T.dot(r2_t * h2_tm1, self.Uh2h2_src) + self.bh2_src)
            h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
            return [h_t, h2_t]

        def greedy_search_step(h_tm1, h2_tm1, y_tm1, h_src, h2_src):
            x_t = self.emb[y_tm1]
            c = h_src[-1]
            c2 = h2_src[-1]
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                T.dot(c, self.Wgc) + self.bg)
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation,
                T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                T.dot(c, self.Whc) + self.bh)
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            # second layer
            all2_t = T.nnet.sigmoid(
                T.dot(h_t, self.Wg2h) + T.dot(h2_tm1, self.Ug2h2) +
                T.dot(c2, self.Wg2c2) + self.bg2)
            z2_t = myutil.slice(all2_t, 0, nh2)
            r2_t = myutil.slice(all2_t, 1, nh2)
            ch2_t = myutil.activation(
                activation,
                T.dot(h_t, self.Wh2h) + T.dot(r2_t * h2_tm1, self.Uh2h2) +
                T.dot(c2, self.Wh2c2) + self.bh2)
            h2_t = (1.0 - z2_t) * h2_tm1 + z2_t * ch2_t
            # score over the output vocabulary
            s = (T.dot(h2_t, self.Wyh2) + T.dot(x_t, self.Wyy) +
                 T.dot(c2, self.Wyc2) + self.by)
            max_s, y_t = T.max_and_argmax(s)
            exp_s = T.exp(s - max_s)
            p_y = exp_s / exp_s.sum()
            log_p_y = T.log(p_y)
            # stop scanning once '</s>' (index 1) is produced
            return [h_t, h2_t, y_t, log_p_y], theano.scan_module.until(
                T.eq(y_t, 1))  # 1 --> '</s>'
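
        # A minimal NumPy sketch (an illustration added here, not part of the
        # original model) of one combined-gate GRU step, mirroring the steps
        # above: both gates come out of a single matrix product and are then
        # sliced apart, which is what myutil.slice(all_t, 0, nh) and
        # myutil.slice(all_t, 1, nh) are assumed to do for the vector case.
        def numpy_gru_step_sketch(x_t, h_tm1, Wgx, Ugh, bg, Whx, Uhh, bh):
            import numpy as np
            nh = h_tm1.shape[0]
            # combined update/reset gates: shapes Wgx (de, 2*nh), Ugh (nh, 2*nh)
            gates = 1.0 / (1.0 + np.exp(-(np.dot(x_t, Wgx) +
                                          np.dot(h_tm1, Ugh) + bg)))
            z_t, r_t = gates[:nh], gates[nh:2 * nh]
            # candidate state, then convex combination with the previous state
            ch_t = np.tanh(np.dot(x_t, Whx) + np.dot(r_t * h_tm1, Uhh) + bh)
            return (1.0 - z_t) * h_tm1 + z_t * ch_t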
    def __init__(self, hyper_param, word2idx_dic):
        '''
        nh :: dimension of the hidden layer
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        nf :: number of features
        nfe :: number of feature embeddings in the vocabulary - by leeck
        dfe :: dimension of the feature embeddings - by leeck
        cs :: word window context size
        emb_file :: word embedding file
        weight_decay :: weight decay
        dropout_rate :: dropout rate
        activation :: activation function: sigm, tanh, relu
        word2idx_dic :: word to index dictionary
        '''
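        # Example of the expected hyper_param keys (illustrative values only;
        # the actual settings come from the caller):
        #   hyper_param = {'nhidden': 1000, 'vocsize': 30000,
        #                  'emb_dimension': 620, 'weight_decay': 1e-6,
        #                  'dropout_rate': [0.2, 0.2, 0.5],
        #                  'activation': 'tanh', 'learning_method': 'adadelta',
        #                  'load_model': '', 'fixed_emb': False,
        #                  'reverse_input': False}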
        self.hyper_param = hyper_param
        nh = hyper_param['nhidden']
        ne = hyper_param['vocsize']
        de = hyper_param['emb_dimension']
        weight_decay = hyper_param['weight_decay']
        dropout_rate = hyper_param['dropout_rate']
        activation = hyper_param['activation']
        learning_method = hyper_param['learning_method']
        verbose = False
        # parameters of the model
        if hyper_param['load_model'] != '':
            self.load_param(hyper_param, hyper_param['load_model'])
        else:
            self.build_param(hyper_param, word2idx_dic)

        # parameters
        self.params = [self.emb, self.Wgx_src, self.Wgx, self.Wgc, \
          self.Whx_src, self.Whx, self.Whc, self.Wh0c, self.Ugh_src, self.Ugh, \
          self.Uhh_src, self.Uhh, self.Wyh, self.Wyc, self.Wyy, \
          self.bg_src, self.bg, self.bh_src, self.bh, self.bh0, self.by, \
          self.h0_src]

        if hyper_param['fixed_emb']:
            print 'fixed embedding.'
            self.params.remove(self.emb)

        # as many lines as words in the sentence
        x_sentence = T.ivector('x_sentence')  # x_sentence : n_steps
        x_org = self.emb[x_sentence].reshape((x_sentence.shape[0], de))
        x = x_org[:-1]  # remove '</s>'
        x_reverse = x[::-1]  # reverse for backward

        y_sentence = T.ivector('y_sentence')  # labels
        y_input_sentence = T.concatenate(
            [y_sentence[-1:], y_sentence[:-1]],
            axis=0)  # move '</s>' to first position
        y = self.emb[y_input_sentence].reshape((y_input_sentence.shape[0], de))

        # for scan
        def source_step(x_t, h_tm1):
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wgx_src) + T.dot(h_tm1, self.Ugh_src) +
                self.bg_src)
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation,
                T.dot(x_t, self.Whx_src) + T.dot(r_t * h_tm1, self.Uhh_src) +
                self.bh_src)
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            return h_t

        def target_step(x_t, h_tm1, c):
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wgx) + T.dot(h_tm1, self.Ugh) +
                T.dot(c, self.Wgc) + self.bg)
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation,
                T.dot(x_t, self.Whx) + T.dot(r_t * h_tm1, self.Uhh) +
                T.dot(c, self.Whc) + self.bh)
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            return h_t

        # make score, h_src, h0 (for beam search)
        def make_score(x, y, use_noise):
            # input layer dropout: ex. [0.2, 0.2, 0.5]
            if use_noise:
                print "X's projection layer dropout:", dropout_rate[0]
                dropout_x = myutil.dropout_from_layer(x, dropout_rate[0])
            else:
                dropout_x = x * (1.0 - dropout_rate[0])
            # recurrent for source language
            h_src, _ = theano.scan(fn=source_step,
                                   sequences=dropout_x,
                                   outputs_info=self.h0_src,
                                   n_steps=dropout_x.shape[0])
            # context
            c = h_src[-1]
            h0 = myutil.activation(activation, T.dot(c, self.Wh0c) + self.bh0)
            # output layer dropout: ex. [0.2, 0.2, 0.5]
            if use_noise:
                print "Y's projection layer dropout:", dropout_rate[1]
                dropout_y = myutil.dropout_from_layer(y, dropout_rate[1])
            else:
                dropout_y = y * (1.0 - dropout_rate[1])
            # forward recurrent for target language
            h, _ = theano.scan(fn=target_step,
                               sequences=dropout_y,
                               outputs_info=h0,
                               non_sequences=[c],
                               n_steps=dropout_y.shape[0])
            # hidden layer dropout
            if use_noise:
                print "Y's hidden layer dropout:", dropout_rate[2]
                dropout_h = myutil.dropout_from_layer(h, dropout_rate[2])
            else:
                dropout_h = h * (1.0 - dropout_rate[2])
            # score
            score = T.dot(dropout_h, self.Wyh) + T.dot(
                dropout_y, self.Wyy) + T.dot(c, self.Wyc) + self.by
            return score, h_src, h0
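
        # A minimal NumPy sketch (illustration only; myutil.dropout_from_layer's
        # internals are an assumption) of the dropout convention used in
        # make_score above: at training time units are zeroed by a random mask,
        # at prediction time activations are scaled by (1 - rate) so expected
        # magnitudes match.
        def numpy_dropout_sketch(x, rate, use_noise, rng=None):
            import numpy as np
            if use_noise:
                rng = rng or np.random.RandomState(0)
                mask = rng.binomial(n=1, p=1.0 - rate, size=x.shape)
                return x * mask
            return x * (1.0 - rate)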

        # dropout version (for training)
        if 'reverse_input' in hyper_param and hyper_param['reverse_input']:
            print 'reverse input.'
            dropout_score, _, _ = make_score(x_reverse, y, True)
        else:
            dropout_score, _, _ = make_score(x, y, True)
        dropout_p_y_given_x = myutil.mysoftmax(dropout_score)
        # scaled version (for prediction)
        if 'reverse_input' in hyper_param and hyper_param['reverse_input']:
            print 'reverse input.'
            score, h_src, h0 = make_score(x_reverse, y, False)
        else:
            score, h_src, h0 = make_score(x, y, False)
        p_y_given_x = myutil.mysoftmax(score)

        # prediction
        y_pred = T.argmax(p_y_given_x, axis=1)
        test_nll = -T.mean(
            T.log(p_y_given_x)[T.arange(y.shape[0]), y_sentence])

        # beam search decoding: input=[c, h_tm1, y_tm1], output=[h_t, log_p_y_t]
        input_h_src = T.fmatrix('input_h_src')
        input_h_tm1 = T.fvector('input_h_tm1')
        input_y_tm1 = T.iscalar('input_y_tm1')  # previous target word; its embedding becomes x_t
        x_t = self.emb[input_y_tm1]
        c = input_h_src[-1]
        all_t = T.nnet.sigmoid(
            T.dot(x_t, self.Wgx) + T.dot(input_h_tm1, self.Ugh) +
            T.dot(c, self.Wgc) + self.bg)
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(
            activation,
            T.dot(x_t, self.Whx) + T.dot(r_t * input_h_tm1, self.Uhh) +
            T.dot(c, self.Whc) + self.bh)
        h_t = (1.0 - z_t) * input_h_tm1 + z_t * ch_t
        # prediction
        score_y_t = T.dot(h_t, self.Wyh) + T.dot(x_t, self.Wyy) + T.dot(
            c, self.Wyc) + self.by
        max_s = T.max(score_y_t)
        exp_s = T.exp(score_y_t - max_s)
        log_p_y_t = T.log(exp_s / exp_s.sum())

        # cost and gradients and learning rate
        lr = T.scalar('lr')  # for SGD

        # NLL + L2-norm
        nll = -T.mean(
            T.log(dropout_p_y_given_x)[T.arange(y.shape[0]), y_sentence])
        cost = nll
        for param in self.params:
            if param.name == 'emb':
                continue
            cost += weight_decay * T.sum(param**2)

        # SGD
        #gradients = T.grad(cost, self.params)
        #sgd_updates = OrderedDict((p, p - lr*g) for p, g in zip(self.params, gradients))
        sgd_updates = myutil.sgd_updates(self.params, cost, lr)
        # SGD + momentum (0.9)
        momentum_updates = myutil.sgd_updates_momentum(self.params, cost, lr,
                                                       0.9)
        # RMSProp (rho = 0.9)
        rmsprop_updates = myutil.sgd_updates_rmsprop(self.params, cost, lr,
                                                     0.9, 1)
        # AdaDelta (lr --> rho = 0.95)
        adadelta_updates = myutil.sgd_updates_adadelta(self.params, cost, lr,
                                                       1e-6, 9)

        # theano functions to compile
        self.classify = theano.function(inputs=[x_sentence, y_sentence],
                                        outputs=[y_pred, test_nll])
        # for beam search
        self.encoding_src_lang = theano.function(inputs=[x_sentence],
                                                 outputs=[h_src, h0])
        self.search_next_word = theano.function(
            inputs=[input_h_src, input_h_tm1, input_y_tm1],
            outputs=[log_p_y_t, h_t])
        # for reranking
        self.get_nll = theano.function(
            inputs=[x_sentence, input_h_src, input_h_tm1, y_sentence],
            outputs=test_nll,
            on_unused_input='ignore')
        # SGD
        self.train_sgd = theano.function(inputs=[x_sentence, y_sentence, lr],
                                         outputs=[cost, nll],
                                         updates=sgd_updates)
        # SGD with momentum
        self.train_momentum = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=momentum_updates)
        # RMSProp
        self.train_rmsprop = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=rmsprop_updates)
        # AdaDelta
        self.train_adadelta = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=adadelta_updates)
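
    # A hedged usage sketch (this helper is not in the original code) showing
    # how the compiled functions above fit together for greedy, i.e.
    # beam-width-1, decoding: encode the source once, then repeatedly feed the
    # previous word and hidden state to search_next_word until '</s>' (index 1)
    # is produced.
    def greedy_decode_sketch(self, x_sentence, max_len=100, eos_idx=1):
        import numpy
        h_src, h0 = self.encoding_src_lang(x_sentence)
        h_t, y_t, output = h0, eos_idx, []  # decoding starts from '</s>'
        for _ in range(max_len):
            log_p_y, h_t = self.search_next_word(h_src, h_t, y_t)
            y_t = int(numpy.argmax(log_p_y))
            if y_t == eos_idx:
                break
            output.append(y_t)
        return output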
    def __init__(self, hyper_param, word2idx_dic):
        '''
        nh :: dimension of the hidden layer
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        nf :: number of features
        nfe :: number of feature embeddings in the vocabulary - by leeck
        dfe :: dimension of the feature embeddings - by leeck
        cs :: word window context size
        emb_file :: word embedding file
        weight_decay :: weight decay
        dropout_rate :: dropout rate
        activation :: activation function: sigm, tanh, relu
        word2idx_dic :: word to index dictionary
        '''
        self.hyper_param = hyper_param
        nh = hyper_param['nhidden']
        ne = hyper_param['vocsize']
        de = hyper_param['emb_dimension']
        weight_decay = hyper_param['weight_decay']
        dropout_rate = hyper_param['dropout_rate']
        activation = hyper_param['activation']
        learning_method = hyper_param['learning_method']
        # parameters of the model
        if hyper_param['load_model'] != '':
            self.load_param(hyper_param, hyper_param['load_model'])
        else:
            self.build_param(hyper_param, word2idx_dic)

        # parameters
        self.params = [self.emb, self.Wgx_src, self.Wgxb_src, self.Wgx, self.Wgc, \
          self.Whx_src, self.Whxb_src, self.Whx, self.Whc, self.Wh0c, \
          self.Ugh_src, self.Ughb_src, self.Ugh, self.Uhh_src, self.Uhhb_src, self.Uhh, \
          self.bg_src, self.bgb_src, self.bg, self.bh_src, self.bhb_src, self.bh, self.bh2, self.bh0, \
          self.Wah, self.Wax, self.Ua, self.ba, self.va, \
          self.Wh2h, self.Wyh2, self.Wyh, self.Wyc, self.Wyy, self.by, \
          self.h0_src, self.h0b_src]

        if hyper_param['fixed_emb']:
            print 'fixed embedding.'
            self.params.remove(self.emb)

        # as many lines as words in the sentence
        x_sentence = T.ivector('x_sentence')  # x_sentence : n_steps
        x = self.emb[x_sentence].reshape(
            (x_sentence.shape[0], de))  # don't remove '</s>'
        x_reverse = x[::-1]  # reverse for backward

        y_sentence = T.ivector('y_sentence')  # labels
        y_input_sentence = T.concatenate(
            [y_sentence[-1:], y_sentence[:-1]],
            axis=0)  # move '</s>' to first position
        y = self.emb[y_input_sentence].reshape((y_input_sentence.shape[0], de))

        # for scan
        #def source_step(x_t, h_tm1):
        def source_step(dot_x_t_Wgx_src, dot_x_t_Whx_src, h_tm1):
            #print 'z_t, r_t are combined!'
            #all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgx_src) + T.dot(h_tm1, self.Ugh_src) + self.bg_src)
            all_t = T.nnet.sigmoid(dot_x_t_Wgx_src +
                                   T.dot(h_tm1, self.Ugh_src))
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            #ch_t = myutil.activation(activation, T.dot(x_t, self.Whx_src) + T.dot(r_t * h_tm1, self.Uhh_src) + self.bh_src)
            ch_t = myutil.activation(
                activation, dot_x_t_Whx_src + T.dot(r_t * h_tm1, self.Uhh_src))
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            return h_t

        #def source_backward_step(x_t, h_tm1):
        def source_backward_step(dot_x_t_Wgxb_src, dot_x_t_Whxb_src, h_tm1):
            #print 'z_t and r_t are combined!'
            #all_t = T.nnet.sigmoid(T.dot(x_t, self.Wgxb_src) + T.dot(h_tm1, self.Ughb_src) + self.bgb_src)
            all_t = T.nnet.sigmoid(dot_x_t_Wgxb_src +
                                   T.dot(h_tm1, self.Ughb_src))
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            #ch_t = myutil.activation(activation, T.dot(x_t, self.Whxb_src) + T.dot(r_t * h_tm1, self.Uhhb_src) + self.bhb_src)
            ch_t = myutil.activation(
                activation,
                dot_x_t_Whxb_src + T.dot(r_t * h_tm1, self.Uhhb_src))
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            return h_t

        #def target_step(x_t, h_tm1, h_src, dot_h_src_Ua):
        def target_step(dot_x_t_Wgx, dot_x_t_Whx, dot_x_t_Wax, h_tm1, h_src,
                        dot_h_src_Ua):
            # attention: compute the context vector c_t from the source states
            #z = T.tanh(T.dot(h_tm1, self.Wa) + T.dot(h_src, self.Ua) + self.ba)
            #z = T.tanh(T.dot(h_tm1, self.Wa) + dot_h_src_Ua)
            z = T.tanh(T.dot(h_tm1, self.Wah) + dot_x_t_Wax + dot_h_src_Ua)
            #print 'z:', z.ndim
            e = T.dot(self.va, z.T)
            #print 'e:', e.ndim
            max_e = T.max(e)
            exp_e = T.exp(e - max_e)
            a = exp_e / exp_e.sum()
            #print 'a:', a.ndim
            c_t = T.dot(a, h_src)
            #print 'c_t:', c_t.ndim
            #print 'z_t and r_t are combined!'
            all_t = T.nnet.sigmoid(dot_x_t_Wgx + T.dot(h_tm1, self.Ugh) +
                                   T.dot(c_t, self.Wgc))
            z_t = myutil.slice(all_t, 0, nh)
            r_t = myutil.slice(all_t, 1, nh)
            # candidate h_t
            ch_t = myutil.activation(
                activation, dot_x_t_Whx + T.dot(r_t * h_tm1, self.Uhh) +
                T.dot(c_t, self.Whc))
            h_t = (1.0 - z_t) * h_tm1 + z_t * ch_t
            return [h_t, c_t]
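
        # A minimal NumPy sketch (an illustration added here, not used by the
        # model) of the attention block in target_step above: per-position
        # scores e, a max-shifted softmax giving alignment weights a, and the
        # context c_t as a weighted sum of source states. x_term stands for
        # the precomputed dot_x_t_Wax term.
        def numpy_attention_sketch(h_tm1, x_term, h_src, Wah, Ua, ba, va):
            import numpy as np
            z = np.tanh(np.dot(h_tm1, Wah) + x_term + np.dot(h_src, Ua) + ba)
            e = np.dot(va, z.T)          # one score per source position
            exp_e = np.exp(e - e.max())  # subtract max for numerical stability
            a = exp_e / exp_e.sum()      # softmax alignment weights
            return np.dot(a, h_src)      # context vector c_t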

        # make score, h_src, h0 (for beam search)
        def make_score(x, y, use_noise):
            # input layer dropout: ex. [0.2, 0.2, 0.5]
            if use_noise:
                print "X's projection layer dropout:", dropout_rate[0]
                dropout_x = myutil.dropout_from_layer(x, dropout_rate[0])
            else:
                dropout_x = x * (1.0 - dropout_rate[0])
            dropout_x_reverse = dropout_x[::-1]  # reverse for backward
            # RNN encoder
            dot_x_Wgx_src = T.dot(dropout_x, self.Wgx_src) + self.bg_src
            dot_x_Whx_src = T.dot(dropout_x, self.Whx_src) + self.bh_src
            dot_x_rev_Wgx_src = T.dot(dropout_x_reverse,
                                      self.Wgxb_src) + self.bgb_src
            dot_x_rev_Whx_src = T.dot(dropout_x_reverse,
                                      self.Whxb_src) + self.bhb_src
            # forward recurrent for source language
            hf_src, _ = theano.scan(fn=source_step,
                                    sequences=[dot_x_Wgx_src, dot_x_Whx_src],
                                    outputs_info=self.h0_src,
                                    n_steps=dropout_x.shape[0])
            # backward recurrent for source language
            hb_src_reverse, _ = theano.scan(
                fn=source_backward_step,
                sequences=[dot_x_rev_Wgx_src, dot_x_rev_Whx_src],
                outputs_info=self.h0b_src,
                n_steps=dropout_x_reverse.shape[0])
            hb_src = hb_src_reverse[::-1]
            h_src = T.concatenate([hf_src, hb_src], axis=1)
            # global context
            #c_global = h_src[0]
            c_global = T.concatenate([hf_src[-1], hb_src[0]], axis=0)
            # output layer (target language input layer) dropout: ex. [0.2, 0.2, 0.5]
            if use_noise:
                print "Y's projection layer dropout:", dropout_rate[1]
                dropout_y = myutil.dropout_from_layer(y, dropout_rate[1])
            else:
                dropout_y = y * (1.0 - dropout_rate[1])
            # RNN decoder
            dot_y_Wgx = T.dot(dropout_y, self.Wgx) + self.bg
            dot_y_Whx = T.dot(dropout_y, self.Whx) + self.bh
            dot_y_Wax = T.dot(dropout_y, self.Wax)
            dot_h_src_Ua = T.dot(h_src, self.Ua) + self.ba
            h0 = myutil.activation(activation,
                                   T.dot(c_global, self.Wh0c) + self.bh0)
            # forward recurrent for target language
            [h, c], _ = theano.scan(fn=target_step,
                                    sequences=[dot_y_Wgx, dot_y_Whx, dot_y_Wax],
                                    outputs_info=[h0, None],
                                    non_sequences=[h_src, dot_h_src_Ua],
                                    n_steps=dropout_y.shape[0])
            # hidden layer dropout
            if use_noise:
                print "Y's hidden layer dropout:", dropout_rate[2]
                dropout_h = myutil.dropout_from_layer(h, dropout_rate[2])
            else:
                dropout_h = h * (1.0 - dropout_rate[2])
            # h2 - Deep Output RNN
            print 'Deep Output RNN: ReLU'
            h2 = myutil.activation('relu',
                                   T.dot(dropout_h, self.Wh2h) + self.bh2)
            # score
            score = T.dot(h2, self.Wyh2) + T.dot(dropout_h, self.Wyh) + T.dot(dropout_y, self.Wyy) + \
              T.dot(c, self.Wyc) + self.by
            return score, h_src, h0
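
        # A minimal NumPy sketch (illustration only) of the bidirectional
        # encoder bookkeeping in make_score above: the backward scan runs over
        # the reversed input, is flipped back into source order, and is
        # concatenated with the forward states position by position.
        def numpy_bidir_concat_sketch(hf_src, hb_src_reverse):
            import numpy as np
            hb_src = hb_src_reverse[::-1]  # undo the time reversal
            return np.concatenate([hf_src, hb_src], axis=1)  # (n_steps, 2*nh)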

        # dropout version (for training)
        dropout_score, _, _ = make_score(x, y, True)
        dropout_p_y_given_x = myutil.mysoftmax(dropout_score)

        # scaled version (for prediction)
        score, h_src, h0 = make_score(x, y, False)
        p_y_given_x = myutil.mysoftmax(score)

        # prediction
        y_pred = T.argmax(p_y_given_x, axis=1)
        test_nll = -T.mean(
            T.log(p_y_given_x)[T.arange(y.shape[0]), y_sentence])

        # beam search decoding: input=[h_src, h_tm1, y_tm1], output=[h_t, log_p_y_t, alignment]
        input_h_src = T.fmatrix('input_h_src')
        input_h_tm1 = T.fvector('input_h_tm1')
        input_y_tm1 = T.iscalar('input_y_tm1')  # previous target word; its embedding becomes x_t
        x_t = self.emb[input_y_tm1]
        # attention: compute the context vector c_t from the source states
        #z = T.tanh(T.dot(input_h_tm1, self.Wa) + T.dot(input_h_src, self.Ua) + self.ba)
        z = T.tanh(
            T.dot(input_h_tm1, self.Wah) + T.dot(x_t, self.Wax) +
            T.dot(input_h_src, self.Ua) + self.ba)
        e = T.dot(self.va, z.T)
        max_e = T.max(e)
        exp_e = T.exp(e - max_e)
        alignment = exp_e / exp_e.sum()
        c_t = T.dot(alignment, input_h_src)
        all_t = T.nnet.sigmoid(
            T.dot(x_t, self.Wgx) + T.dot(input_h_tm1, self.Ugh) +
            T.dot(c_t, self.Wgc) + self.bg)
        z_t = myutil.slice(all_t, 0, nh)
        r_t = myutil.slice(all_t, 1, nh)
        # candidate h_t
        ch_t = myutil.activation(
            activation,
            T.dot(x_t, self.Whx) + T.dot(r_t * input_h_tm1, self.Uhh) +
            T.dot(c_t, self.Whc) + self.bh)
        h_t = (1.0 - z_t) * input_h_tm1 + z_t * ch_t
        # h2 - Deep Output RNN
        h2_t = myutil.activation('relu', T.dot(h_t, self.Wh2h) + self.bh2)
        # prediction
        score_y_t = T.dot(h2_t, self.Wyh2) + T.dot(h_t, self.Wyh) + T.dot(x_t, self.Wyy) + \
          T.dot(c_t, self.Wyc) + self.by
        max_s = T.max(score_y_t)
        exp_s = T.exp(score_y_t - max_s)
        log_p_y_t = T.log(exp_s / exp_s.sum())

        # cost and gradients and learning rate
        lr = T.scalar('lr')  # for SGD

        # NLL + L2-norm
        nll = -T.mean(
            T.log(dropout_p_y_given_x)[T.arange(y.shape[0]), y_sentence])
        cost = nll
        for param in self.params:
            if param.name == 'emb':
                continue
            cost += weight_decay * T.sum(param**2)

        # SGD
        sgd_updates = myutil.sgd_updates(self.params, cost, lr)
        # SGD + momentum
        momentum_updates = myutil.sgd_updates_momentum(self.params, cost, lr,
                                                       0.9)
        # RMSProp (rho = 0.9)
        rmsprop_updates = myutil.sgd_updates_rmsprop(self.params, cost, lr,
                                                     0.9, 1)
        # AdaDelta (lr --> rho = 0.95)
        adadelta_updates = myutil.sgd_updates_adadelta(self.params, cost, lr,
                                                       1e-6, 9)

        # theano functions to compile
        self.classify = theano.function(inputs=[x_sentence, y_sentence],
                                        outputs=[y_pred, test_nll])
        # for beam search
        self.encoding_src_lang = theano.function(inputs=[x_sentence],
                                                 outputs=[h_src, h0])
        self.search_next_word = theano.function(
            inputs=[input_h_src, input_h_tm1, input_y_tm1],
            outputs=[log_p_y_t, h_t, alignment])
        # for reranking
        self.get_nll = theano.function(
            inputs=[x_sentence, input_h_src, input_h_tm1, y_sentence],
            outputs=test_nll,
            on_unused_input='ignore')
        # SGD
        self.train_sgd = theano.function(inputs=[x_sentence, y_sentence, lr],
                                         outputs=[cost, nll],
                                         updates=sgd_updates)
        # SGD with momentum
        self.train_momentum = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=momentum_updates)
        # RMSProp
        self.train_rmsprop = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=rmsprop_updates)
        # AdaDelta
        self.train_adadelta = theano.function(
            inputs=[x_sentence, y_sentence, lr],
            outputs=[cost, nll],
            updates=adadelta_updates)
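
    # A hedged usage sketch (this helper is not in the original code) for the
    # attention model: greedy decoding that also collects the alignment vector
    # returned by search_next_word at each step, e.g. for inspecting soft word
    # alignments.
    def greedy_decode_with_alignment_sketch(self, x_sentence, max_len=100,
                                            eos_idx=1):
        import numpy
        h_src, h0 = self.encoding_src_lang(x_sentence)
        h_t, y_t = h0, eos_idx  # decoding starts from '</s>' (index 1)
        output, alignments = [], []
        for _ in range(max_len):
            log_p_y, h_t, a_t = self.search_next_word(h_src, h_t, y_t)
            y_t = int(numpy.argmax(log_p_y))
            if y_t == eos_idx:
                break
            output.append(y_t)
            alignments.append(a_t)
        return output, alignments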