def __init__(self, config, name='', fls=None):
    '''
        Store the constructor arguments and create the helper objects.

        :param config: the model configuration, kept as given

        :type name: string
        :param name: the model name, kept as given

        :param fls: optional features, kept as given (default None)
    '''
    seed_limit = int(10e6)
    self.config, self.name = config, name
    self.creater = LayerFactory()
    self.fls = fls
    # the random stream is seeded with a value drawn from numpy's global RNG
    self.trng = RandomStreams(numpy.random.randint(seed_limit))
Beispiel #2
0
def setup_LG(n, classsize=20, degree=5, p=1):
    """
    Construct the basic layered network.

    A ``LayerGraph`` is created for the categories "Kids", "Normal" and
    "Risk"; seven layers are then built through a ``LayerFactory`` and added
    to the graph one after another.

    :param n: first argument of ``LayerGraph``; also used to size the
              "R_Workplaces", "parties" and "basic" layers
    :param classsize: second argument of the "Schools" layer
    :param degree: third argument of the "Schools" layer
    :param p: value handed to every ``LG.add_layer`` call
    :returns: the ``LayerGraph`` with all layers added
    """
    categories = ["Kids", "Normal", "Risk"]
    percentage = [0.15, 0.48, 0.37]  # one share per category (statistics)
    kids_share, normal_share = percentage[0], percentage[1]

    LG = LayerGraph(n, categories, percentage)
    LF = LayerFactory(LG)

    # The list literal is evaluated top to bottom, so the layers are created
    # in exactly this order.
    layers = [
        LF.layer_dividing_Graph("Households", 2, None, categories,
                                fully_connected=True),
        LF.layer_dividing_Graph("Schools", classsize, degree, ["Kids"]),
        LF.layer_dividing_Graph("Workplaces", 6, 3, ["Normal"]),
        LF.create_layer("R_Workplaces", 6, int(normal_share * n / 3),
                        [0, 0.7, 0.3], 3),
        LF.layer_dividing_Graph("Social", 10, 3, categories),
        LF.create_layer("parties", 20, int(n * kids_share / 6),
                        [0.6, 0.4, 0], 6),
        LF.layer_dividing_Graph("basic", n, 1, categories),
    ]

    # Layers are added only after all of them have been created.
    for layer in layers:
        LG.add_layer(layer, p)

    return LG
Beispiel #3
0
    def build(self):
        '''
            Build the computational graph of the forward and backward models.

            Two ``RNNsearch`` models are built: one from ``self.config`` and
            one from a copy in which the source/target entries are exchanged.
            Their parameters, layers and inputs are merged on this object and
            the per-sample costs are handed to ``self.get_addition_grads``.
        '''
        # forward NMT: uses the configuration unchanged
        logging.info("Building forward NMT")
        self.fwd_nmt = RNNsearch(self.config, '')
        self.fwd_nmt.build()

        # backward NMT: works on a deep copy so self.config is not modified
        logging.info("Building backward NMT")
        inv_config = copy.deepcopy(self.config)
        swapped_keys = (
            ('index_unk_src', 'index_unk_trg'),
            ('index_eos_src', 'index_eos_trg'),
            ('num_vocab_src', 'num_vocab_trg'),
        )
        for src_key, trg_key in swapped_keys:
            inv_config[src_key], inv_config[trg_key] = (inv_config[trg_key],
                                                        inv_config[src_key])
        self.bwd_nmt = RNNsearch(inv_config, 'inv_')
        self.bwd_nmt.build()

        # merge parameters, layers and inputs (forward first, then backward)
        fwd, bwd = self.fwd_nmt, self.bwd_nmt
        self.creater = LayerFactory()
        self.creater.params = fwd.creater.params + bwd.creater.params
        self.creater.layers = fwd.creater.layers + bwd.creater.layers

        # extra float32 vector input appended after the inputs of both models
        valid = tensor.vector('valid', dtype='float32')
        self.inputs = fwd.inputs + bwd.inputs + [valid]
        self.get_addition_grads(fwd.cost_per_sample, bwd.cost_per_sample,
                                valid)
class RNNsearch(model):
    '''
		The attention-based NMT model
	'''
    def __init__(self, config, name='', fls=None):
        '''
            Store the constructor arguments and create the helper objects.

            :param config: the model configuration; read by ``build``

            :type name: string
            :param name: prefix put in front of every layer name in ``build``

            :param fls: optional features; when truthy, ``build`` collects
                        the ``feature_weight`` of each entry
        '''
        seed_limit = int(10e6)
        self.config, self.name = config, name
        self.creater = LayerFactory()
        self.fls = fls
        # the random stream is seeded with a value drawn from numpy's global RNG
        self.trng = RandomStreams(numpy.random.randint(seed_limit))

    def sampling_step(self, state, prev, context):
        '''
            Build the computational graph which samples the next word.

            :type state: theano variables
            :param state: the previous hidden state

            :type prev: theano variables
            :param prev: the last generated word

            :type context: theano variables
            :param context: the context vectors.

            :returns: the new hidden state, the sampled word and the
                      softmax output the word was drawn from
        '''
        prev_emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, prev_emb)
        probs = tensor.nnet.softmax(energy)

        # draw from the distribution, then turn the draw into a word index
        draw = self.trng.multinomial(pvals=probs, dtype='int64')
        sample = draw.argmax(axis=-1)

        # advance the decoder with the embedding of the sampled word
        sample_emb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, sample_emb)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
            Build the decoder graph for sampling.

            :type state_init: theano variables
            :param state_init: the initial state of decoder

            :type c: theano variables
            :param c: the context vectors

            :type length: int
            :param length: the limitation of sample length

            :type n_samples: int
            :param n_samples: the number of samples

            :returns: the sampled words, the softmax outputs of every step
                      (the second and third outputs of ``sampling_step``)
                      and the updates returned by ``theano.scan``
        '''
        # replicate the initial state (axis 0) and the context (axis 1)
        # once per requested sample
        state = tensor.repeat(state_init, n_samples, axis=0)
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        # state and sample are fed back at every step; the probabilities are
        # only collected (outputs_info entry None)
        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        return samples, probs, updates

    def build(self, verbose=False):
        '''
            Build the computational graph.

            Creates the layers, the symbolic inputs, the training cost and
            the sampling graph. The cost is one of three variants, chosen by
            the configuration: the MRT cost when ``config['MRT'] is True``,
            the PR cost when ``config['PR']`` is truthy and features were
            given, and the summed negative log-likelihood otherwise.
            Both ``'MRT'`` and ``'PR'`` may be absent from the configuration.

            :type verbose: bool
            :param verbose: only set to True on visualization
        '''
        config = self.config
        # Both switches are optional; a missing key means "disabled".
        use_mrt = 'MRT' in config and config['MRT'] is True
        use_pr = 'PR' in config and config['PR']

        # create layers
        logging.info('Initializing layers')
        self.emb_src = self.creater.createLookupTable(
            self.name + 'emb_src',
            config['num_vocab_src'],
            config['dim_emb_src'],
            offset=True)
        self.emb_trg = self.creater.createLookupTable(
            self.name + 'emb_trg',
            config['num_vocab_trg'],
            config['dim_emb_trg'],
            offset=True)
        self.encoderGRU = self.creater.createGRU(self.name + 'GRU_enc',
                                                 config['dim_emb_src'],
                                                 config['dim_rec_enc'],
                                                 verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(self.name +
                                                      'GRU_enc_back',
                                                      config['dim_emb_src'],
                                                      config['dim_rec_enc'],
                                                      verbose=verbose)
        self.decoderGRU = self.creater.createGRU_attention(
            self.name + 'GRU_dec',
            config['dim_emb_trg'],
            2 * config['dim_rec_enc'],
            config['dim_rec_dec'],
            config['num_vocab_trg'],
            verbose=verbose)
        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer',
            config['dim_rec_enc'],
            config['dim_rec_dec'],
            offset=True)

        if self.fls:
            # all feature weights are stored in one shared float32 vector
            # that is trained together with the other parameters
            fl_weight = numpy.concatenate(
                [fl.feature_weight for fl in self.fls])
            self.feature_weight = theano.shared(fl_weight.astype('float32'),
                                                name="feature_weight")
            self.creater.params += [self.feature_weight]
            # add a broadcastable leading axis: (n,) --> (1, n)
            self.feature_weight_dim = self.feature_weight.dimshuffle('x', 0)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix(
            'x_mask', dtype='float32')  # size: (length, batchsize)
        self.y = tensor.matrix('y', dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix(
            'y_mask', dtype='float32')  # size: (length, batchsize)

        if use_mrt:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y, self.ymask, self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [self.x, self.xmask, self.y, self.ymask]

        if use_pr:
            self.ans = tensor.scalar('ans', dtype='int64')
            self.features = tensor.matrix('features', dtype='float32')
            self.inputs += [self.features, self.ans]

        # create computational graph for training
        logging.info('Building computational graph')
        # ----encoder-----
        emb = self.emb_src.forward(
            self.x.flatten())  # size: (length, batch_size, dim_emb)
        # the backward encoder reads the sentence (and its mask) reversed
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb,
            self.x.shape[0],
            batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]  # only hiddens
        # undo the reversal so both directions are aligned in time
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate(
            (context_forward, context_backward),
            axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        # the decoder is initialised from the backward state at position 0
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(
            self.y.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb,
            self.y.shape[0],
            self.context,
            state_init=self.state_init,
            batch_size=self.y.shape[1],
            mask=self.ymask,
            cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)
        # compute costs and grads
        # flat index of the reference word inside the flattened softmax output
        y_idx = tensor.arange(self.y.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + self.y.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y.shape[0], self.y.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)
        if use_mrt:
            # softmax over the scaled negative costs (shifted by the minimum
            # before exponentiating), weighted by the given MRT loss
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        elif use_pr and self.fls:
            # calculate p: log-softmax over the scaled negative costs
            self.cost_per_sample *= config['alpha_PR']
            cost_min = self.cost_per_sample - self.cost_per_sample.min()
            probs = tensor.exp(-cost_min)
            log_probs = -cost_min - tensor.log(probs.sum())
            self.probs = log_probs
            # calculate q: log-softmax over the weighted feature sums
            energy_q = self.features * self.feature_weight_dim
            energy_q = energy_q.sum(axis=1)
            self.energy_q = energy_q
            energy_q_min = energy_q - energy_q.max()
            probs_q = tensor.exp(energy_q_min)
            log_probs_q = energy_q_min - tensor.log(probs_q.sum())
            self.probs_q = log_probs_q
            # calculate KL divergence
            cost_KL = tensor.exp(log_probs_q) * (log_probs_q - log_probs)
            self.cost_KLs = cost_KL
            self.cost_KL = cost_KL.sum()
            self.cost_NMT = self.cost_per_sample[self.ans]
            self.cost = config['lambda_PR'] * self.cost_KL + config[
                'lambda_MLE'] * self.cost_NMT
        else:
            self.cost = self.cost.sum()

        # build sampling graph (same layers as above, no masks)
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(
            self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        self.samples, self.probs_sample, self.updates_sample = self.decode_sample(
            state_init_sample, context_sample, self.length_sample,
            self.n_samples)

        # symbolic inputs of the step-wise decoding functions
        # (see get_trg_embedding, get_probs and get_next)
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
            Encode source sentence to context vector.

            :type x: numpy array
            :param x: the indexed source sentence; it is reshaped to a
                      single-column batch and paired with an all-ones mask

            :returns: the outputs of the compiled function (a list holding
                      the context)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "encoder"):
            self.encoder = theano.function(inputs=[self.x, self.xmask],
                                           outputs=[self.context])
        column = numpy.reshape(x, (x.shape[0], 1))
        ones_mask = numpy.ones(column.shape, dtype='float32')
        return self.encoder(column, ones_mask)

    def get_trg_embedding(self, y):
        '''
            Get the embedding of target sentence.

            :param y: value for the int64 vector ``self.y_decode``

            :returns: the outputs of the compiled function (a list holding
                      the embeddings)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_trg_embeddinger"):
            embedded = self.emb_trg.forward(self.y_decode)
            self.get_trg_embeddinger = theano.function(
                inputs=[self.y_decode], outputs=[embedded])
        return self.get_trg_embeddinger(y)

    def get_init(self, c):
        '''
            Get the initial decoder hidden state with context vector.

            :param c: value for ``self.context``, i.e. the concatenation of
                      the forward and backward encoder states built in
                      ``build``

            :returns: the outputs of the compiled function (a list holding
                      the initial decoder state)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_initer"):
            # build() initialises the decoder from the backward encoder
            # state at position 0. That state is the second half of the
            # last axis of the context, whose halves each have
            # config['dim_rec_enc'] entries.
            dim_enc = self.config['dim_rec_enc']
            backward_first = self.context[0][:, dim_enc:]
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(backward_first)])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
            Encode source sentence to context vectors and get the initial decoder hidden state.

            :type x: numpy array
            :param x: the indexed source sentence; it is reshaped to a
                      single-column batch and paired with an all-ones mask

            :returns: the outputs of the compiled function (context first,
                      initial decoder state second)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_context_and_initer"):
            wanted = [self.context, self.state_init]
            self.get_context_and_initer = theano.function(
                inputs=[self.x, self.xmask], outputs=wanted)
        column = numpy.reshape(x, (x.shape[0], 1))
        ones_mask = numpy.ones(column.shape, dtype='float32')
        return self.get_context_and_initer(column, ones_mask)

    def get_probs(self, c, state, emb):
        '''
            Get the output of the decoder for the next target word.

            :param c: value for ``self.context_decode``
            :param state: value for ``self.state_decode``
            :param emb: value for ``self.emb_decode``

            :returns: the evaluated outputs of ``decoderGRU.decode_probs``
                      (unpacked as an energy and a context in
                      ``sampling_step``)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_probser"):
            symbolic_args = [
                self.context_decode, self.state_decode, self.emb_decode
            ]
            self.get_probser = theano.function(
                inputs=symbolic_args,
                outputs=self.decoderGRU.decode_probs(self.context_decode,
                                                     self.state_decode,
                                                     self.emb_decode))
        return self.get_probser(c, state, emb)

    def get_next(self, c, state, emb):
        '''
            Get the next hidden state.

            :param c: value for ``self.c_decode``
            :param state: value for ``self.state_decode``
            :param emb: value for ``self.emb_decode``

            :returns: the evaluated output of ``decoderGRU.decode_next``
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_nexter"):
            symbolic_args = [self.c_decode, self.state_decode, self.emb_decode]
            self.get_nexter = theano.function(
                inputs=symbolic_args,
                outputs=self.decoderGRU.decode_next(self.c_decode,
                                                    self.state_decode,
                                                    self.emb_decode))
        return self.get_nexter(c, state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
            Get the cost of parallel sentences (``self.cost`` from ``build``).

            :param x: value for ``self.x``
            :param xmask: value for ``self.xmask``
            :param y: value for ``self.y``
            :param ymask: value for ``self.ymask``

            :returns: the outputs of the compiled function (a list holding
                      the cost)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_coster"):
            symbolic_inputs = [self.x, self.xmask, self.y, self.ymask]
            self.get_coster = theano.function(inputs=symbolic_inputs,
                                              outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
            Get sampling results.

            :param x: value for ``self.x_sample``
            :param length: value for ``self.length_sample``
            :param n_samples: value for ``self.n_samples``

            :returns: the outputs of the compiled function (samples first,
                      their probabilities second)
        '''
        # the compiled function is created on first use and then reused;
        # the scan updates collected in build() are passed along
        if not hasattr(self, "get_sampler"):
            symbolic_inputs = [
                self.x_sample, self.length_sample, self.n_samples
            ]
            self.get_sampler = theano.function(
                inputs=symbolic_inputs,
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
            Get the attention weight of parallel sentences.

            :param x: value for ``self.x``
            :param xmask: value for ``self.xmask``
            :param y: value for ``self.y``
            :param ymask: value for ``self.ymask``

            :returns: the outputs of the compiled function (a list holding
                      the attention)
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_attentioner"):
            symbolic_inputs = [self.x, self.xmask, self.y, self.ymask]
            self.get_attentioner = theano.function(inputs=symbolic_inputs,
                                                   outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
            Get the hidden states essential for visualization

            :param x: value for ``self.x``
            :param xmask: value for ``self.xmask``
            :param y: value for ``self.y``
            :param ymask: value for ``self.ymask``

            :returns: a dict mapping a prefixed name ('enc_for_', 'enc_back_'
                      or 'dec_') to the matching output of the compiled
                      function, assigned by position
        '''
        # the compiled function is created on first use and then reused
        if not hasattr(self, "get_layerer"):
            symbolic_inputs = [self.x, self.xmask, self.y, self.ymask]
            self.get_layerer = theano.function(
                inputs=symbolic_inputs,
                outputs=self.encode_forward + self.encode_backward +
                tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)

        # names are matched to the outputs purely by position
        enc_names = ('h', 'gate', 'reset', 'state', 'reseted', 'state_in',
                     'gate_in', 'reset_in')
        dec_names = ('h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate',
                     'reset_cin', 'reset_preactive', 'reset', 'state_cin',
                     'reseted', 'state_preactive', 'state', 'outenergy',
                     'state_in', 'gate_in', 'reset_in', 'state_in_prev',
                     'readout', 'maxout', 'outenergy_1', 'outenergy_2')
        value_name = (['enc_for_' + label for label in enc_names] +
                      ['enc_back_' + label for label in enc_names] +
                      ['dec_' + label for label in dec_names])

        result = {}
        for position, layer in enumerate(layers):
            label = value_name[position]
            if label != '':
                result[label] = layer
        return result
Beispiel #5
0
class RNNtsg(model):
    '''
		The attention-based NMT model for TSG
	'''
    def __init__(self, config, name=''):
        '''
            Store the constructor arguments and create the helper objects.

            :param config: the model configuration; read by the other
                           methods of this class

            :type name: string
            :param name: prefix put in front of every layer name in ``build``
        '''
        seed_limit = int(10e6)
        self.config, self.name = config, name
        self.creater = LayerFactory()
        # the random stream is seeded with a value drawn from numpy's global RNG
        self.trng = RandomStreams(numpy.random.randint(seed_limit))

    def translate(self, x, T, beam_size=10, return_array=False):
        '''
            Decode with beam search.

            Every hypothesis keeps a stack of open nonterminals (starting
            with 'S'). At each step only the rules whose root is the top of
            that stack are marked in the energy mask, and a hypothesis is
            finished once its stack is empty.

            :type x: numpy array
            :param x: the indexed source sentence

            :param T: the rule table; only ``rule_idx_with_root`` and
                      ``get_rule_from_idx`` are called on it

            :type beam_size: int
            :param beam_size: beam size

            :type return_array: bool
            :param return_array: return every finished hypothesis instead of
                                 the best one (only used in semi-supervised
                                 training)

            :returns: a list of rule indices (the finished hypothesis with
                      the lowest loss), or ``[]`` when nothing finished and
                      the beam already exceeds 100; with ``return_array`` a
                      list of hypotheses
        '''
        vocab_size = self.config['num_vocab_trg']

        # initialize variables
        result = [[]]
        loss = [0.]
        result_eos = []
        loss_eos = []
        beam = beam_size
        # open nonterminals of each hypothesis; the last one is expanded next
        nonterms = [['S']]
        # for each hypothesis: one parent-state time step per open nonterminal
        par_state_time = [[0]]
        # get encoder states
        c, state = self.get_context_and_init(x)
        emb_y = numpy.zeros((1, self.config['dim_emb_trg']), dtype='float32')
        # NOTE(review): this placeholder has dim_rec_enc columns but is later
        # copied into rows of width dim_rec_dec -- confirm both sizes agree.
        state_hist = [[
            numpy.zeros((1, self.config['dim_rec_enc']), dtype='float32')
        ]]

        for step in range(x.shape[0] * 3):
            # flat (hypothesis, rule) indices of the rules allowed this step
            cur_nonterm_idx = []
            for i in range(len(nonterms)):
                if len(nonterms[i]) > 0:
                    # potential rules with the given lhs as root
                    potent_rules = T.rule_idx_with_root(nonterms[i][-1])
                    cur_nonterm_idx += [
                        r + i * vocab_size for r in potent_rules
                    ]
                    nonterms[i].pop()
            # only take the first k results if we have k < beam_size
            # potential nonterms
            beam = min(beam_size, len(cur_nonterm_idx))

            # get word probability
            energy, ctx = self.get_probs(numpy.repeat(c, len(result), axis=1),
                                         state, emb_y)
            # multiply energy by cur_nonterm_idx mask
            # NOTE(review): masked entries get energy 0, not -inf, so they
            # still receive probability mass in the softmax -- confirm intent.
            energy_mask = numpy.zeros((energy.shape[0] * energy.shape[1]),
                                      dtype='float32')
            energy_mask[cur_nonterm_idx] = 1.
            energy_mask = energy_mask.reshape(
                (energy.shape[0], energy.shape[1]))
            energy = energy * energy_mask

            probs = tools.softmax(energy)
            losses = -numpy.log(probs)

            # prevent translation to be too short.
            if step < x.shape[0] / 2:
                losses[:, self.config['index_eos_trg']] = numpy.inf
            # accumulate the loss of the hypothesis each row extends
            for i in range(len(loss)):
                losses[i] += loss[i]

            # get the n-best partial translations
            best_index_flatten = numpy.argpartition(losses.flatten(),
                                                    beam)[:beam]
            # split each flat index into (hypothesis, rule); divmod uses
            # integer division so both parts stay valid indices
            best_index = [
                divmod(index, vocab_size) for index in best_index_flatten
            ]

            # save the partial translations in the beam
            new_ctx = numpy.zeros((beam, 2 * self.config['dim_rec_enc']),
                                  dtype='float32')
            new_y = []
            new_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                    dtype='float32')
            new_result = []
            new_loss = []
            new_nonterms = []
            new_par_state_time = []
            new_state_hist = []
            new_par_state = numpy.zeros((beam, self.config['dim_rec_dec']),
                                        dtype='float32')
            for i in range(beam):
                index = best_index[i]
                new_result.append(result[index[0]] + [index[1]])
                new_loss.append(losses[index[0], index[1]])
                new_ctx[i] = ctx[index[0]]
                new_y.append(index[1])
                new_state[i] = state[index[0]]
                par_state_t = par_state_time[index[0]][-1]

                new_par_state[i] = state_hist[index[0]][par_state_t]

                r = T.get_rule_from_idx(index[1])
                if r:
                    # reversed so that the list can be used as a stack
                    add_nonterms = r.get_expand_tags()[::-1]
                else:
                    add_nonterms = []
                new_nonterms.append(nonterms[index[0]] + add_nonterms)
                # set the parent of expanded tags to be current
                # do not include last par_state_time[] for current hyp
                new_par_state_time.append(par_state_time[index[0]][:-1] +
                                          [step + 1] * len(add_nonterms))
                new_state_hist.append(state_hist[index[0]] + [state[index[0]]])
            # get the next decoder hidden state
            new_emby = self.get_trg_embedding(
                numpy.asarray(new_y, dtype='int64'))[0]
            new_state = self.get_next(new_ctx, new_state, new_par_state,
                                      new_emby)

            # remove finished translation from the beam
            state = []
            emb_y = []
            result = []
            loss = []
            nonterms = []
            state_hist = []
            par_state_time = []
            for i in range(beam):
                if len(new_nonterms[i]) == 0:
                    # par_state_time and nonterms should have same length
                    # for each hyp; par_state_time records parent state
                    # timestep for each nonterm that needs to be expanded
                    assert len(new_par_state_time[i]) == 0
                    result_eos.append(new_result[i])
                    loss_eos.append(new_loss[i])
                    beam -= 1
                else:
                    result.append(new_result[i])
                    loss.append(new_loss[i])
                    state.append(new_state[i])
                    emb_y.append(new_emby[i])
                    nonterms.append(new_nonterms[i])
                    state_hist.append(new_state_hist[i])
                    par_state_time.append(new_par_state_time[i])
            if beam <= 0:
                break

            state = numpy.asarray(state, dtype='float32')
            emb_y = numpy.asarray(emb_y, dtype='float32')

        # only used in semi-supervised training
        if return_array:
            if len(result_eos) > 0:
                return result_eos
            else:
                return [result[-1][:1]]

        if len(result_eos) > 0:
            # return the best translation
            return result_eos[numpy.argmin(loss_eos)]
        elif beam_size > 100:
            # give up once the beam has grown past 100
            logging.warning('cannot find translation in beam size %d' %
                            beam_size)
            return []
        else:
            # double the beam size on failure; T must be passed on because
            # it is a required argument
            logging.info('cannot find translation in beam size %d, try %d' %
                         (beam_size, beam_size * 2))
            return self.translate(x, T, beam_size=beam_size * 2)

    def sampling_step(self, state, prev, context, par_state):
        '''
            Build the computational graph which samples the next word.

            :type state: theano variables
            :param state: the previous hidden state

            :type prev: theano variables
            :param prev: the last generated word

            :type context: theano variables
            :param context: the context vectors.

            :type par_state: theano variables
            :param par_state: passed as the last argument of
                              ``decoderGRU.decode_next``

            :returns: the new hidden state, the sampled word and the
                      softmax output the word was drawn from
        '''
        prev_emb = self.emb_trg.forward(prev)
        energy, c = self.decoderGRU.decode_probs(context, state, prev_emb)
        probs = tensor.nnet.softmax(energy)

        # draw from the distribution, then turn the draw into a word index
        draw = self.trng.multinomial(pvals=probs, dtype='int64')
        sample = draw.argmax(axis=-1)

        # advance the decoder with the embedding of the sampled word
        sample_emb = self.emb_trg.forward(sample)
        newstate = self.decoderGRU.decode_next(c, state, sample_emb,
                                               par_state)

        return newstate, sample, probs

    def decode_sample(self, state_init, c, length, n_samples):
        '''
            Build the decoder graph for sampling.

            :type state_init: theano variables
            :param state_init: the initial state of decoder

            :type c: theano variables
            :param c: the context vectors

            :type length: int
            :param length: the limitation of sample length

            :type n_samples: int
            :param n_samples: the number of samples

            :returns: the sampled words, the probability of each sampled
                      word (same shape as the samples) and the updates
                      returned by ``theano.scan``
        '''
        # replicate the initial state (axis 0) and the context (axis 1)
        # once per requested sample
        state = tensor.repeat(state_init, n_samples, axis=0)
        sample = tensor.zeros((n_samples, ), dtype='int64')
        c = tensor.repeat(c, n_samples, axis=1)

        # NOTE(review): scan calls the step with (state, sample, c), but
        # sampling_step of this class also expects par_state -- confirm how
        # the parent state is meant to be supplied here.
        result, updates = theano.scan(self.sampling_step,
                                      outputs_info=[state, sample, None],
                                      non_sequences=[c],
                                      n_steps=length)

        samples = result[1]
        probs = result[2]
        # flat index of every sampled word inside the flattened probabilities
        y_idx = tensor.arange(samples.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + samples.flatten()
        probs = probs.flatten()[y_idx]
        # reshape returns a new variable, so the result has to be kept
        probs = probs.reshape(samples.shape)
        return samples, probs, updates

    def build(self, verbose=False):
        '''
			Build the computational graph.

			:type verbose: bool
			:param verbose: only set to True on visualization
		'''
        config = self.config

        #create layers
        logging.info('initializing layers...')
        self.emb_src = self.creater.createLookupTable(self.name + 'emb_src',
                                                      config['num_vocab_src'],
                                                      config['dim_emb_src'],
                                                      offset=True)
        self.emb_trg = self.creater.createLookupTable(self.name + 'emb_trg',
                                                      config['num_vocab_trg'],
                                                      config['dim_emb_trg'],
                                                      offset=True)
        self.encoderGRU = self.creater.createGRU(self.name + 'GRU_enc',
                                                 config['dim_emb_src'],
                                                 config['dim_rec_enc'],
                                                 verbose=verbose)
        self.encoderGRU_back = self.creater.createGRU(self.name +
                                                      'GRU_enc_back',
                                                      config['dim_emb_src'],
                                                      config['dim_rec_enc'],
                                                      verbose=verbose)

        self.decoderGRU = self.creater.createGRU_tsg(self.name + 'GRU_dec',
                                                     config['dim_emb_trg'],
                                                     2 * config['dim_rec_enc'],
                                                     config['dim_rec_dec'],
                                                     config['num_vocab_trg'],
                                                     verbose=verbose)

        self.initer = self.creater.createFeedForwardLayer(
            self.name + 'initer',
            config['dim_rec_enc'],
            config['dim_rec_dec'],
            offset=True)

        # create input variables
        self.x = tensor.matrix('x', dtype='int64')  # size: (length, batchsize)
        self.xmask = tensor.matrix(
            'x_mask', dtype='float32')  # size: (length, batchsize)
        self.y_idx = tensor.matrix('y_idx',
                                   dtype='int64')  # size: (length, batchsize)
        self.ymask = tensor.matrix(
            'y_mask', dtype='float32')  # size: (length, batchsize)
        #self.y_parent_idx = tensor.matrix('y_parent_idx', dtype='int64') # size: (length, batchsize)
        self.y_parent_t = tensor.matrix(
            'y_parent_t', dtype='int64')  # size: (length, batchsize)

        if 'MRT' in config and config['MRT'] is True:
            self.MRTLoss = tensor.vector('MRTLoss')
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask,
                self.MRTLoss
            ]
        else:
            self.MRTLoss = None
            self.inputs = [
                self.x, self.xmask, self.y_idx, self.y_parent_t, self.ymask
            ]

        # create computational graph for training
        logging.info('building computational graph...')
        # ----encoder-----
        emb = self.emb_src.forward(
            self.x.flatten())  # size: (length, batch_size, dim_emb)
        back_emb = self.emb_src.forward(self.x[::-1].flatten())

        self.encode_forward = self.encoderGRU.forward(
            emb, self.x.shape[0], batch_size=self.x.shape[1],
            mask=self.xmask)  # size: (length, batch_size, dim)
        self.encode_backward = self.encoderGRU_back.forward(
            back_emb,
            self.x.shape[0],
            batch_size=self.x.shape[1],
            mask=self.xmask[::-1])  # size: (length, batch_size, dim)
        context_forward = self.encode_forward[0]
        context_backward = self.encode_backward[0][::-1]
        self.context = tensor.concatenate(
            (context_forward, context_backward),
            axis=2)  # size: (length, batch_size, 2*dim)

        # ----decoder----
        self.init_c = context_backward[0]
        self.state_init = self.initer.forward(context_backward[0])
        emb = self.emb_trg.forward(
            self.y_idx.flatten())  # size: (length, batch_size, dim_emb)
        self.decode = self.decoderGRU.forward(
            emb,
            self.y_idx.shape[0],
            self.context,
            self.state_init,
            self.y_parent_t,
            batch_size=self.y_idx.shape[1],
            mask=self.ymask,
            cmask=self.xmask)  # size: (length, batch_size, dim)

        energy = self.decode[1]
        self.attention = self.decode[2]
        self.softmax = tensor.nnet.softmax(energy)
        # compute costs and grads
        y_idx = tensor.arange(self.y_idx.flatten(
        ).shape[0]) * self.config['num_vocab_trg'] + self.y_idx.flatten()
        cost = self.softmax.flatten()[y_idx]
        cost = -tensor.log(cost)
        self.cost = cost.reshape(
            (self.y_idx.shape[0], self.y_idx.shape[1])) * self.ymask
        self.cost_per_sample = self.cost.sum(axis=0)
        if 'MRT' in config and config['MRT'] is True:
            self.cost_per_sample = self.cost.sum(axis=0)
            tmp = self.cost_per_sample
            tmp *= config['MRT_alpha']
            tmp -= tmp.min()
            tmp = tensor.exp(-tmp)
            tmp /= tmp.sum()
            tmp *= self.MRTLoss
            tmp = -tmp.sum()
            self.cost = tmp
        else:
            self.cost = self.cost.sum()

        # build sampling graph
        self.x_sample = tensor.matrix('x_sample', dtype='int64')
        self.n_samples = tensor.scalar('n_samples', dtype='int64')
        self.length_sample = tensor.scalar('length', dtype='int64')
        emb_sample = self.emb_src.forward(
            self.x_sample.flatten())  # (length, batch_size, dim_emb)
        back_emb_sample = self.emb_src.forward(self.x_sample[::-1].flatten())
        encode_forward_sample = self.encoderGRU.forward(
            emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        encode_backward_sample = self.encoderGRU_back.forward(
            back_emb_sample,
            self.x_sample.shape[0],
            batch_size=self.x_sample.shape[1])  # (length, batch_size, dim)
        context_sample = tensor.concatenate(
            (encode_forward_sample[0], encode_backward_sample[0][::-1]),
            axis=2)  # (length, batch_size, 2*dim)
        state_init_sample = self.initer.forward(
            encode_backward_sample[0][::-1][0])
        self.state_init_sample = state_init_sample
        self.context_sample = context_sample
        #self.samples, self.probs_sample, self.updates_sample = self.decode_sample(state_init_sample, context_sample,
        #											self.length_sample, self.n_samples)

        # parameter for decoding
        self.y_decode = tensor.vector('y_decode', dtype='int64')
        self.context_decode = tensor.tensor3('context_decode', dtype='float32')
        self.c_decode = tensor.matrix('c_decode', dtype='float32')
        self.state_decode = tensor.matrix('state_decode', dtype='float32')
        self.par_state_decode = tensor.matrix('par_state_decode',
                                              dtype='float32')
        self.emb_decode = tensor.matrix('emb_decode', dtype='float32')

    def encode(self, x):
        '''
        Encode one source sentence into its context vectors.

        x is reshaped into a single-column matrix of shape (x.shape[0], 1)
        and paired with an all-ones float32 mask of the same shape. The
        compiled function (built on first use and cached on self.encoder)
        takes self.x and self.xmask and outputs [self.context].
        '''
        if hasattr(self, "encoder"):
            run_encoder = self.encoder
        else:
            run_encoder = theano.function(inputs=[self.x, self.xmask],
                                          outputs=[self.context])
            self.encoder = run_encoder
        src_len = x.shape[0]
        column = numpy.reshape(x, (src_len, 1))
        # a single unpadded sentence: every position is valid
        all_valid = numpy.ones(column.shape, dtype='float32')
        return run_encoder(column, all_valid)

    def get_trg_embedding(self, y):
        '''
        Get the embedding of target words.

        y is fed to self.y_decode; the compiled function (cached on
        self.get_trg_embeddinger) outputs [self.emb_trg.forward(y_decode)].
        '''
        if hasattr(self, "get_trg_embeddinger"):
            embedder = self.get_trg_embeddinger
        else:
            embedded = self.emb_trg.forward(self.y_decode)
            embedder = theano.function(inputs=[self.y_decode],
                                       outputs=[embedded])
            self.get_trg_embeddinger = embedder
        return embedder(y)

    def get_init(self, c):
        '''
        Get the initial decoder hidden state from context vectors.

        c is fed in place of self.context. The compiled function is built
        on first use and cached on self.get_initer; it returns a
        one-element list holding the output of self.initer.forward.
        '''
        if not hasattr(self, "get_initer"):
            # `context_backward` is a local of build() and is not in scope
            # here, so the backward-encoder slice is recovered from
            # self.context itself. build() concatenates (forward, backward)
            # on axis 2 and feeds context_backward[0] to the initer, whose
            # input width is config['dim_rec_enc'].
            # NOTE(review): assumes the backward encoder output is exactly
            # dim_rec_enc wide (the trailing columns of the context).
            dim_enc = self.config['dim_rec_enc']
            backward_first = self.context[0][:, -dim_enc:]
            self.get_initer = theano.function(
                inputs=[self.context],
                outputs=[self.initer.forward(backward_first)])
        return self.get_initer(c)

    def get_context_and_init(self, x):
        '''
        Encode one source sentence and get the initial decoder state.

        x is reshaped to (x.shape[0], 1) and paired with an all-ones
        float32 mask. The compiled function (cached on
        self.get_context_and_initer) outputs
        [self.context, self.state_init].
        '''
        if hasattr(self, "get_context_and_initer"):
            compiled = self.get_context_and_initer
        else:
            compiled = theano.function(
                inputs=[self.x, self.xmask],
                outputs=[self.context, self.state_init])
            self.get_context_and_initer = compiled
        src_len = x.shape[0]
        column = numpy.reshape(x, (src_len, 1))
        # a single unpadded sentence: every position is valid
        all_valid = numpy.ones(column.shape, dtype='float32')
        return compiled(column, all_valid)

    def get_probs(self, c, state, emb):
        '''
        Get the probability of the next target word.

        c, state and emb are fed to self.context_decode, self.state_decode
        and self.emb_decode; the output is whatever
        self.decoderGRU.decode_probs builds from those placeholders. The
        compiled function is cached on self.get_probser.
        '''
        if hasattr(self, "get_probser"):
            compiled = self.get_probser
        else:
            ctx_var = self.context_decode
            state_var = self.state_decode
            emb_var = self.emb_decode
            probs_graph = self.decoderGRU.decode_probs(ctx_var, state_var,
                                                       emb_var)
            compiled = theano.function(inputs=[ctx_var, state_var, emb_var],
                                       outputs=probs_graph)
            self.get_probser = compiled
        return compiled(c, state, emb)

    def get_next(self, c, state, par_state, emb):
        '''
        Get the next hidden state.

        c, state, par_state and emb are fed to self.c_decode,
        self.state_decode, self.par_state_decode and self.emb_decode; the
        output is whatever self.decoderGRU.decode_next builds from those
        placeholders. The compiled function is cached on self.get_nexter.
        '''
        if hasattr(self, "get_nexter"):
            compiled = self.get_nexter
        else:
            placeholders = [
                self.c_decode, self.state_decode, self.par_state_decode,
                self.emb_decode
            ]
            step_graph = self.decoderGRU.decode_next(*placeholders)
            compiled = theano.function(inputs=placeholders,
                                       outputs=step_graph)
            self.get_nexter = compiled
        return compiled(c, state, par_state, emb)

    def get_cost(self, x, xmask, y, ymask):
        '''
        Get the negative log-likelihood of parallel sentences.

        x, xmask, y and ymask are fed to self.x, self.xmask, self.y_idx and
        self.ymask. The compiled function (cached on self.get_coster)
        returns a one-element list holding self.cost.
        '''
        if not hasattr(self, "get_coster"):
            # build() names the target-index placeholder self.y_idx; there
            # is no self.y attribute, so the old input list raised
            # AttributeError on first use.
            # NOTE(review): build() also passes self.y_parent_t to the
            # decoder (and uses self.MRTLoss when MRT is enabled); if
            # self.cost depends on them Theano will report a missing input
            # here -- confirm and extend the input list if so.
            self.get_coster = theano.function(
                inputs=[self.x, self.xmask, self.y_idx, self.ymask],
                outputs=[self.cost])
        return self.get_coster(x, xmask, y, ymask)

    def get_sample(self, x, length, n_samples):
        '''
        Get sampling results.

        x, length and n_samples are fed to self.x_sample,
        self.length_sample and self.n_samples. The compiled function
        (cached on self.get_sampler) outputs
        [self.samples, self.probs_sample] and applies self.updates_sample.

        Raises AttributeError with an explicit message when the sampling
        graph attributes are missing: the only visible assignment of
        samples / probs_sample / updates_sample in build() is commented out.
        '''
        if not hasattr(self, "get_sampler"):
            required = ("samples", "probs_sample", "updates_sample")
            missing = [attr for attr in required if not hasattr(self, attr)]
            if missing:
                # fail early with a readable message instead of an opaque
                # attribute lookup failure in the middle of compilation
                raise AttributeError(
                    "sampling graph is not available; missing attribute(s): " +
                    ", ".join(missing))
            self.get_sampler = theano.function(
                inputs=[self.x_sample, self.length_sample, self.n_samples],
                outputs=[self.samples, self.probs_sample],
                updates=self.updates_sample)
        return self.get_sampler(x, length, n_samples)

    def get_attention(self, x, xmask, y, ymask):
        '''
        Get the attention weight of parallel sentences.

        x, xmask, y and ymask are fed to self.x, self.xmask, self.y_idx and
        self.ymask. The compiled function (cached on self.get_attentioner)
        returns a one-element list holding self.attention.
        '''
        if not hasattr(self, "get_attentioner"):
            # build() names the target-index placeholder self.y_idx; there
            # is no self.y attribute, so the old input list raised
            # AttributeError on first use.
            # NOTE(review): build() also passes self.y_parent_t to the
            # decoder; if self.attention depends on it Theano will report a
            # missing input here -- confirm and extend the input list if so.
            self.get_attentioner = theano.function(
                inputs=[self.x, self.xmask, self.y_idx, self.ymask],
                outputs=[self.attention])
        return self.get_attentioner(x, xmask, y, ymask)

    def get_layer(self, x, xmask, y, ymask):
        '''
        Get the hidden states essential for visualization.

        x, xmask, y and ymask are fed to self.x, self.xmask, self.y_idx and
        self.ymask. The compiled function (cached on self.get_layerer)
        outputs, in order, self.encode_forward, self.encode_backward, the
        items of self.decode[0] and then self.decode[1:].

        Returns a dict that maps a name to each output by position: first
        the 'enc_for_*' names, then 'enc_back_*', then 'dec_*'. The shape of
        every output is printed as it is collected. An IndexError is raised
        if the function yields more outputs than there are names.
        '''
        if not hasattr(self, "get_layerer"):
            # build() names the target-index placeholder self.y_idx; there
            # is no self.y attribute, so the old input list raised
            # AttributeError on first use.
            # NOTE(review): build() also passes self.y_parent_t to the
            # decoder; if these outputs depend on it Theano will report a
            # missing input here -- confirm and extend the input list if so.
            self.get_layerer = theano.function(
                inputs=[self.x, self.xmask, self.y_idx, self.ymask],
                outputs=self.encode_forward + self.encode_backward +
                tuple(self.decode[0]) + tuple(self.decode[1:]))

        layers = self.get_layerer(x, xmask, y, ymask)
        enc_names = [
            'h', 'gate', 'reset', 'state', 'reseted', 'state_in', 'gate_in',
            'reset_in'
        ]
        dec_names = [
            'h', 'c', 'att', 'gate_cin', 'gate_preactive', 'gate', 'reset_cin',
            'reset_preactive', 'reset', 'state_cin', 'reseted',
            'state_preactive', 'state'
        ]
        dec_names += [
            'outenergy', 'state_in', 'gate_in', 'reset_in', 'state_in_prev',
            'readout', 'maxout', 'outenergy_1', 'outenergy_2'
        ]
        value_name = ['enc_for_' + name for name in enc_names]
        value_name += ['enc_back_' + name for name in enc_names]
        value_name += ['dec_' + name for name in dec_names]
        result = {}
        for i, layer in enumerate(layers):
            # parenthesised so the line parses under both Python 2 and 3
            print(layer.shape)
            # every name carries a non-empty prefix, so no empty-name check
            result[value_name[i]] = layer
        return result