Beispiel #1
0
    def getMLELoss(self, target, decoder_outputs, max_decoder_len):
        '''MLE loss for the decoder, both reward-weighted and plain.

        Args:
            target: int target token ids, shape [batch, max_decoder_len].
            decoder_outputs: decoder logits,
                shape [batch, max_decoder_len, vocab].
            max_decoder_len: maximum decoder length for this batch.

        Returns:
            (weighted_loss, loss): the sequence loss weighted by the clipped
            exponentiated rewards, and the plain mask-weighted MLE loss.
        '''
        # 1.0 for real timesteps (index < self.dec_len), 0.0 for padding.
        masks = tf.sequence_mask(lengths=self.dec_len,
                                 maxlen=max_decoder_len,
                                 dtype=tf.float32,
                                 name='masks')
        # Per-timestep rewards, trimmed to the decoder length.
        weights = tf.slice(self.rewards,
                           begin=[0, 0],
                           size=[-1, max_decoder_len])
        weights = tf.exp(weights)
        # NOTE(review): the upper clip bound is reduce_min(weights) * 3;
        # reduce_max may have been intended — confirm with the author.
        weights = tf.clip_by_value(weights,
                                   clip_value_min=1,
                                   clip_value_max=tf.reduce_min(weights) * 3)
        #        weights=weights/tf.reduce_max(weights)
        weights = weights * masks

        # Per-token weighted cross-entropy loss.
        weighted_loss = seq2seq.sequence_loss(logits=decoder_outputs,
                                              targets=target,
                                              weights=weights,
                                              average_across_timesteps=True,
                                              average_across_batch=True)
        # Plain MLE loss (mask-only weights).
        loss = seq2seq.sequence_loss(logits=decoder_outputs,
                                     targets=target,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)
        return weighted_loss, loss
Beispiel #2
0
    def training(self):
        """Build and return the training op.

        Sums one sequence loss per output head (masked when seq_length is
        available), adds the ACT ponder cost when the cell is an ACTWrapper,
        and minimizes the total with self.optimizer.
        """
        softmax_loss_per_output = []
        for i in range(len(self.logits)):
            if self.seq_length is not None:
                # Lengths known: weight padded timesteps with the mask.
                softmax_loss = s2s.sequence_loss(self.logits[i],
                                                 self.target[:, :, i],
                                                 self._numerical_mask)
            else:
                # No lengths: weight every timestep equally.
                softmax_loss = s2s.sequence_loss(
                    self.logits[i], self.target[:, :, i],
                    tf.ones_like(self.target[:, :, i], self.logits[i].dtype))
            softmax_loss_per_output.append(softmax_loss)

        if len(softmax_loss_per_output) == 1:
            self._softmax_loss = softmax_loss_per_output[0]
        else:
            self._softmax_loss = tf.add_n(softmax_loss_per_output)

        # Adaptive Computation Time: penalize extra ponder steps.
        if isinstance(self.cell, ACTWrapper):
            self._ponder_loss = self.time_penalty * self.cell.get_ponder_cost(
                self.seq_length)
            self._ponder_steps = self.cell.get_ponder_steps(self.seq_length)
            total_loss = self._softmax_loss + self._ponder_loss
        else:
            total_loss = self._softmax_loss

        return self.optimizer.minimize(total_loss)
Beispiel #3
0
    def compute_loss(self, logits):
        """Masked cross-entropy sequence loss over the iterator's targets,
        with a 'seq2seq-loss' scalar summary attached."""
        targets = self.iterator.target_output
        if self.time_major:
            targets = tf.transpose(targets)

        # Zero-weight the positions beyond each target's true length.
        mask = tf.sequence_mask(lengths=self.iterator.target_length,
                                maxlen=self.get_max_time(targets))
        if self.time_major:
            mask = tf.transpose(mask)
        mask = tf.cast(mask, tf.float32)

        # crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_output, logits=logits)
        # _loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(self.hps.batch_size)
        loss = seq2seq.sequence_loss(logits=logits,
                                     targets=targets,
                                     weights=mask)

        tf.summary.scalar(name='seq2seq-loss', tensor=loss)
        return loss
Beispiel #4
0
  def add_loss_op(self, output):
    """Adds loss ops to the computational graph.

    Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss.

    Args:
      output: A tensor of shape (None, self.vocab)
        Actually according to the docs this should be of the shape below
        ([batch_size x sequence_length x logits] tensor)
        https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/sequence_loss
        But based on our result of the  projection operation we should also try
        sequence_length x batch_size x logits
    Returns:
      loss: A 0-d tensor (scalar)
    """
    ### YOUR CODE HERE
    print("calculating loss")
    with tf.variable_scope("loss_op") as scope:
        # NOTE(review): labels are reshaped to [num_steps, batch_size] while
        # sequence_loss documents [batch_size, sequence_length] — confirm the
        # placeholder layout matches.
        labels = tf.reshape(self.labels_placeholder, [self.config.num_steps, self.config.batch_size])
        # Uniform per-token weights (no padding mask available here).
        weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="weights")
        loss = sequence_loss(logits=output, targets=labels, weights=weights, name="sequence_loss")
        tf.summary.scalar("loss", loss)
        # NOTE(review): reuse_variables() at the end of the scope has no
        # effect on ops already created — presumably leftover.
        scope.reuse_variables()
    ### END YOUR CODE
    return loss
Beispiel #5
0
    def __init__(self, args, training=True):
        """Build a bidirectional RNN language model.

        Args:
            args: config namespace providing batch_size, seq_length,
                rnn_size, vocab_size, model, grad_clip and the dropout
                keep probabilities.
            training: when False, batch_size is forced to 1 (sampling mode)
                and dropout is disabled.
        """
        self.args = args
        if not training:
            args.batch_size = 1

        # Dropout only during training and only if any keep prob < 1.
        use_dropout = training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0)

        self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])

        embedding = self.create_var('input', 'embedding', [args.vocab_size, args.rnn_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        if use_dropout:
            # NOTE(review): the input embeddings are dropped with
            # output_keep_prob — input_keep_prob may have been intended.
            inputs = tf.nn.dropout(inputs, args.output_keep_prob)

        cell_fn = self.select_cell_fn(args.model)
        cells_fw = self.create_cell_stack('hidden_fw', cell_fn, args, use_dropout=use_dropout)
        cells_bw = self.create_cell_stack('hidden_bw', cell_fn, args, use_dropout=use_dropout)

        self.cell_fw = rnn.MultiRNNCell(cells_fw, state_is_tuple=True)
        self.cell_bw = rnn.MultiRNNCell(cells_bw, state_is_tuple=True)

        self.initial_state = (self.cell_fw.zero_state(args.batch_size, tf.float32),
                              self.cell_bw.zero_state(args.batch_size, tf.float32))

        # All sequences in a batch share the full seq_length.
        sequence_length = [args.seq_length]*args.batch_size
        outputs, self.final_state = tf.nn.bidirectional_dynamic_rnn(self.cell_fw, self.cell_bw, inputs, sequence_length,
            initial_state_fw=self.initial_state[0], initial_state_bw=self.initial_state[1])

        # bidi dynamic rnn does not concatenate fw and bw cells by default
        output = tf.concat(outputs, 2, name="concat_outputs")

        softmax_w = self.create_var('rnlm', 'softmax_w', [2*args.rnn_size, args.vocab_size])
        softmax_b = self.create_var('rnlm', 'softmax_b', [args.vocab_size])

        # Reshape/matmul/reshape sequence
        self.logits = tf.einsum("ijk,kl->ijl", output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        # Per-sequence loss (not averaged over batch); averaged manually below.
        loss = seq2seq.sequence_loss(self.logits, self.targets, tf.ones([args.batch_size, args.seq_length]), average_across_batch=False)

        with tf.name_scope('cost'):
            self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

        # Learning rate is assigned externally (not trained).
        self.lr = tf.Variable(0.0, trainable=False)

        # apply clipping
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                args.grad_clip)

        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # instrument tensorboard
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)
Beispiel #6
0
    def train(self):
        """Build the seq2seq model, its masked sequence loss, and a
        gradient-clipped Adam training op; wires summaries."""
        self.global_step = tf.train.get_or_create_global_step()
        training_logit, inference_logit = self.seq2seq_model \
            (self.text_length, self.targets, self.summary_length, self.max_summary_length, self.vocab_size)

        # create tensor for train_logit and inference_logit
        self.training_logits = tf.identity(training_logit.rnn_output, 'logits')
        self.inference_logits = tf.identity(inference_logit.sample_id,
                                            'predictions')

        # create weights for sequence_loss: 1.0 inside each summary, 0.0 on
        # padding beyond summary_length.
        masks = tf.sequence_mask(self.summary_length,
                                 self.max_summary_length,
                                 dtype=tf.float32,
                                 name='masks')

        with tf.variable_scope('optimization'):
            self.cost = seq2seq.sequence_loss(self.training_logits,
                                              self.targets, masks)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            # Clip each gradient element to [-5, 5]; None grads (variables
            # not on the loss path) are skipped.
            gradients = optimizer.compute_gradients(self.cost)
            cliped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(
                cliped_gradients, global_step=self.global_step)

        tf.summary.scalar('loss', self.cost)
        self.summary = tf.summary.merge_all()
Beispiel #7
0
    def build_model(self):
        """Build generator/discriminator templates, their parameters and
        losses.

        In 'GAN' mode the adversarial D/G losses are wired; otherwise a
        supervised sequence loss for the generator and a cross-entropy
        discriminator loss are built.

        FIXES vs. original: consistent indentation (the original mixed tabs
        and spaces, a TabError in Python 3); tf.Variable(initializer=...)
        replaced with tf.get_variable (tf.Variable has no `initializer`
        kwarg); tf.concat given an explicit axis; undefined name `flag`
        replaced with self.flag; D/G trainable-variable scopes un-swapped.
        """
        get_sample = tf.make_template('gen', self.Generate)
        get_disc = tf.make_template('disc', self.Discriminate)

        self.weight = tf.get_variable(
            'weight_gen',
            initializer=tf.truncated_normal([args.rnn_size, args.vocab_size], stddev=0.1),
            dtype=tf.float32)
        self.bias = tf.get_variable(
            'bias_gen',
            initializer=tf.constant(0.1, shape=[args.vocab_size]),
            dtype=tf.float32,
            trainable=False)
        self.gen_lr = self.args.gen_learning_rate

        #self.target = tf.placeholder(tf.float32,[args.batch_size,1])
        # FIX: W2/b1/b2 used tf.Variable(initializer=...), which is not a
        # valid tf.Variable argument; create them like W1 instead.
        self.W1 = tf.get_variable(
            initializer=tf.random_normal([args.dis_seq_length, args.fc_hidden], stddev=0.35, dtype=tf.float64),
            name='disc_W1')
        self.W2 = tf.get_variable(
            initializer=tf.random_normal([args.fc_hidden, 1], stddev=0.2, dtype=tf.float64),
            name='disc_W2')
        self.b1 = tf.get_variable(
            initializer=tf.random_normal([args.batch_size, 1], stddev=0.2, dtype=tf.float64),
            name='disc_b1', trainable=False)
        self.b2 = tf.get_variable(
            initializer=tf.random_normal([args.batch_size, 1], stddev=0.2, dtype=tf.float64),
            name='disc_b2', trainable=False)
        self.dis_lr = self.args.disc_learning_rate

        if self.flag == 'GAN':
            self.fake_data = get_sample()
            self.D_fake, self.D_logit_fake = get_disc('fake')
            self.D_real, self.D_logit_real = get_disc('real')

            self.D_loss = -tf.reduce_mean(tf.log(self.D_real) + tf.log(1 - self.D_fake))
            self.G_loss = -tf.reduce_mean(tf.log(1 - self.D_fake))
            # FIX: the collections were swapped — discriminator variables
            # live under 'disc', generator variables under 'gen'.
            self.Dtvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='disc')
            self.Gtvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='gen')
        else:
            logits = get_sample(self.flag)
            # FIX: tf.concat requires an explicit axis; concatenate the
            # 16 ones and 34 zeros along the time dimension.
            weights = tf.concat([tf.ones([self.args.batch_size, 16]),
                                 tf.zeros([self.args.batch_size, 34])], axis=1)
            self.G_Loss = seq2seq.sequence_loss(logits, self.response, weights,
                                                average_across_timesteps=True,
                                                average_across_batch=True,
                                                name='Sequence_loss')
            # FIX: `flag` was an undefined name; use self.flag.
            pred, _ = get_disc(self.flag)
            self.D_Loss = -tf.reduce_mean(self.target * tf.log(pred)
                                          + (1. - self.target) * tf.log(1. - pred))
Beispiel #8
0
  def add_loss_op(self, output):
    """Adds loss ops to the computational graph.

    Reshapes the flat logits and labels back to (batch, steps[, vocab]) and
    applies sequence_loss with uniform (all-ones) weights.

    FIXES vs. original: Python-2 print statements converted to print()
    calls (the rest of the file is Python 3), and the unused list-wrapped
    `targets`/`weights` locals removed.

    Args:
      output: A tensor of shape (None, self.vocab)
    Returns:
      loss: A 0-d tensor (scalar)
    """
    ### YOUR CODE HERE
    b_size  = self.config.batch_size
    n_steps = self.config.num_steps
    print("\n\nLoss Op: ")
    print("logits ", len(output), " - ", output[0].shape)
    t = tf.reshape(self.labels_placeholder, [b_size, n_steps])
    print("labels ", t)
    # Uniform per-token weights (no padding mask available here).
    w = tf.ones([b_size, n_steps])
    print("weights ", w)
    f = tf.reshape(output, [b_size, n_steps, len(self.vocab)])
    print("reshaped ", f)
    s2s_loss = sequence_loss(logits=f, targets=t, weights=w)

    self.sMax = tf.nn.softmax(f)
    print("smax ", self.sMax)
    tf.add_to_collection('total_loss', s2s_loss)
    loss = s2s_loss
    print(loss)
    ### END YOUR CODE
    return loss
Beispiel #9
0
    def _init_optimizer(self):
        """Build the sequence loss and two training paths: a plain Adam
        minimize op (self.train_op) and a gradient-clipped apply_gradients
        op (self.updates); also creates the Saver.

        FIX vs. original: the batch-major transposes were computed twice
        (once into locals, once into the attributes); compute them once and
        reuse the attributes.
        """
        # Swap the first two axes (presumably time-major -> batch-major,
        # as sequence_loss expects [batch, time, ...]).
        self.logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
        self.targets = tf.transpose(self.decoder_train_targets, [1, 0])

        self.loss = seq2seq.sequence_loss(logits=self.logits,
                                          targets=self.targets,
                                          weights=self.loss_weights)

        opt = tf.train.AdamOptimizer()
        self.train_op = opt.minimize(self.loss)

        # Alternative update path with global-norm gradient clipping.
        params = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []

        gradients = tf.gradients(self.loss, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         self.max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())
 def add_loss_op(self, output):
     """Sequence loss against the 'tags' targets, accumulated into the
     'total loss' collection and returned as the collection's sum."""
     seq_loss = sequence_loss(output,
                              self.inputs_placeholder_dict['tags'],
                              self.weight)
     tf.add_to_collection('total loss', seq_loss)
     return tf.add_n(tf.get_collection('total loss'))
 def build_model(self):
     """Build the encoder/decoder graph, CTC greedy decoding output, and a
     sampled-softmax sequence loss with an Adam training op.

     FIX vs. original: the decoder-output attribute was assigned
     `decoder_inputs` (the all-zero placeholder inputs) instead of the
     decoder's actual outputs; the misspelled attribute name is kept as an
     alias for backward compatibility.
     """
     questions=tf.nn.embedding_lookup(self.embedding,self.question_in)
     encoder_outputs,encoder_state=self.encoder(questions)
     print("encoder_outputs",encoder_outputs.get_shape())
     decoder_inputs=tf.zeros(shape=[tf.shape(encoder_outputs)[0],self.subject_len,self.hidden_dim])
     decoder_outputs,decoder_state,decoder_context=self.decoder(encoder_state=encoder_state,inputs=decoder_inputs)
     # FIX: store the real decoder outputs (was: decoder_inputs).
     self.decoder_outputs=decoder_outputs
     self.decoder_outptus=decoder_outputs  # legacy misspelled alias
     print("decoder_outputs:",decoder_outputs.get_shape())
     self.logits,_,_=self.decoder(encoder_state,inputs=encoder_outputs,is_train=False)
     # Greedy (best-path) CTC decoding; ctc_greedy_decoder wants time-major
     # logits, hence the transpose.
     self.output=tf.nn.ctc_greedy_decoder(tf.transpose(self.logits,perm=[1,0,2]),sequence_length=self.seq_len,merge_repeated=False)
     # 1.0 where the target id is > 0 (non-padding), else 0.0.
     weights=tf.cast(tf.greater(self.subject_in,0),tf.float32)

     def softmax_loss_function(inputs,labels):
         '''Sampled-softmax loss, invoked per step by seq2seq.sequence_loss.
         '''
         print("inputs:",inputs.get_shape())
         print("labels:",labels.get_shape())
         labels=tf.expand_dims(labels,1)
         loss=tf.nn.sampled_softmax_loss(self.dec_embedding,self.dec_bias,labels=labels,inputs=inputs,num_sampled=100,num_classes=self.vocab_size)
         return loss
     '''
     sequence_loss arguments:
     logits: 3-D, shape=[batch_size,sequence_length,num_decoder_symbols];
         the last dim may be a class count, or an embedding dim when a
         sampled softmax_loss_function is supplied (as here).
     targets: 2-D, shape=[batch_size,sequence_length], int target indices.
     weights: 2-D, shape=[batch_size,sequence_length], float.
     softmax_loss_function: Function(inputs-batch, labels-batch).
     '''
     self.loss=seq2seq.sequence_loss(logits=decoder_outputs,targets=self.subject_in,weights=weights,softmax_loss_function=softmax_loss_function)
     self.opt=tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
Beispiel #12
0
 def _build_loss(self, logits, target, target_length):
     """Masked cross-entropy sequence loss, built under the 'loss'
     variable scope."""
     with tf.variable_scope('loss'):
         # Zero-out padded timesteps beyond each target's true length.
         pad_mask = tf.sequence_mask(lengths=target_length,
                                     maxlen=self.max_len,
                                     dtype=tf.float32)
         return seq2seq.sequence_loss(logits, target, pad_mask)
 def recon_loss(self):
     """Cross-entropy reconstruction loss over the training targets."""
     loss = seq2seq.sequence_loss(
         logits=self.logits,
         targets=self.train_targets,  # * max sequence length?
         weights=self.loss_weights,
         name="reconstruction_loss",
     )
     return loss
Beispiel #14
0
 def _init_optimizer(self):
     """Sequence loss over axis-swapped decoder tensors plus an Adam op."""
     # Swap the first two axes (presumably time-major -> batch-major,
     # which sequence_loss expects).
     batch_major_logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
     batch_major_targets = tf.transpose(self.decoder_train_targets, [1, 0])
     self.loss = seq2seq.sequence_loss(logits=batch_major_logits,
                                       targets=batch_major_targets,
                                       weights=self.loss_weights)
     self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
Beispiel #15
0
 def build_training_model(self):
     """Encode every input stream, decode the targets against all encoder
     presents, and return the masked sequence-loss cost."""
     self.def_placeholder_and_components()
     emb_out = []
     enc_h_out = []
     past_for_decoder = []
     for i in range(0, self.input_num):
         past_length = 0
         # Token embedding plus position embedding for stream i.
         h = tf.gather(self.wte, self.inputs[i]) + tf.gather(
             self.wpe, positions_for(self.inputs[i], past_length))
         emb_out.append(h)
         presents, h_enc = self.encoder.encode(h, self.input_lens[i])
         enc_h_out.append(h_enc)
         past_for_decoder.append(presents)
     all_logits = self.decoder.decode_all(tokens=self.target_in,
                                          past_list=past_for_decoder,
                                          enc_h_list=enc_h_out)['logits']
     with tf.name_scope('loss'):
         batch_max_seq_len = tf.shape(self.target_in)[1]
         # Mask out padding beyond each target's true length.
         target_mask = tf.sequence_mask(self.target_len,
                                        maxlen=batch_max_seq_len,
                                        dtype=tf.float32)
     # NOTE(review): the loss op is built outside the 'loss' name scope —
     # likely meant to be indented under the `with` above (cosmetic only).
     cost = sequence_loss(logits=all_logits,
                          targets=self.target_out,
                          weights=target_mask)
     return cost
Beispiel #16
0
    def _init_optimizer(self):
        """Sequence loss over batch-major decoder tensors and an Adam
        training op; gradient clipping is computed but currently unused."""

        # Swap the first two axes (presumably time-major -> batch-major).
        logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
        targets = tf.transpose(self.decoder_train_targets, [1, 0])

        # Loss function
        self.loss = seq2seq.sequence_loss(logits=logits,
                                          targets=targets,
                                          weights=self.loss_weights)

        # Gradient clipping setup
        self.lr = tf.Variable(0.0, trainable=False, name='lr')

        # All variables marked as trainable
        tvars = tf.trainable_variables()

        # Clip the gradients of all trainable variables by global norm
        # contrib.keras.backend.gradients
        # gradients gradients of variables

        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          config.max_grad_norm)

        # NOTE(review): the clipped grads and self.lr are not used —
        # train_op below is a plain Adam minimize (see commented lines).
        #optimizer = tf.train.AdamOptimizer(self.lr)
        #self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
 def _build_loss(self, config):
     """Build the loss and Adam training op.

     Models whose name ends in "flat" get a single softmax cross-entropy
     over y_flat; all others get a per-element (unaveraged) sequence loss
     masked by y_seq_length, then reduced to a mean.
     """
     # cost/evaluate/train
     if config.model_name.endswith("flat"):
         self.prob = tf.nn.softmax(self.logits)
         self.losses = tf.nn.softmax_cross_entropy_with_logits(
             logits=self.logits,
             labels=self.y_flat)  # TODO self.y at multi-labels input
         self.loss = tf.reduce_mean(self.losses)
     else:
         # Boolean mask over real (non-padded) timesteps.
         self.weights = tf.sequence_mask(self.y_seq_length,
                                         dtype=tf.float32)
         # epsilon = tf.constant(value=0.00001)
         self.check = self.logits
         # self.logits = tf.clip_by_value(self.logits, -1.0, 1.0)
         #softmax = tf.nn.softmax(logits)
         #cross_entropy = -tf.reduce_sum(labels * tf.log(softmax), reduction_indices=[1])
         # averaging disabled here so the mean below covers both axes at once
         self.losses = sequence_loss(logits=self.logits,
                                     targets=self.y_seq,
                                     weights=self.weights,
                                     average_across_timesteps=False,
                                     average_across_batch=False)
         self.loss = tf.reduce_mean(self.losses)
     tf.summary.scalar(self.loss.op.name, self.loss)
     # TODO process compute_gradients() and apply_gradients() separetely
     self.train_op = tf.train.AdamOptimizer(
         learning_rate=config.learning_rate).minimize(
             self.loss, global_step=self.global_step)
Beispiel #18
0
 def _build_loss(self):
     """Sequence loss over axis-swapped train logits/targets; stored as
     both self.unreg_loss and self.loss."""
     self.train_logits_seq = tf.transpose(self.train_logits, [1, 0, 2])
     self.train_targets_seq = tf.transpose(self.train_targets, [1, 0])
     loss = seq2seq.sequence_loss(logits=self.train_logits_seq,
                                  targets=self.train_targets_seq,
                                  weights=self.loss_weights)
     self.unreg_loss = loss
     self.loss = loss
Beispiel #19
0
    def build_model(self, int_to_vocab, rnn_size, rnn_layer_count,
                    summary_output_dir):
        """Build the training graph: RNN LM, uniform-weight sequence loss,
        value-clipped Adam updates, and TensorBoard summary plumbing.

        FIX vs. original: gradients that are None (variables not on the
        loss path) are skipped before tf.clip_by_value, which raises on
        None — matching the filtering used elsewhere in this file.
        """
        self.train_graph = tf.Graph()
        with self.train_graph.as_default():
            vocab_size = len(int_to_vocab)
            self.input_text, self.targets, self.lr = self.get_inputs()
            input_data_shape = tf.shape(self.input_text)
            cell, self.initial_state = self.get_init_cell(
                input_data_shape[0], rnn_size, layer_count=rnn_layer_count)
            logits, self.final_state = self.build_nn(cell, rnn_size,
                                                     self.input_text,
                                                     vocab_size)

            # Probabilities for generating words (kept alive in the graph
            # via its name 'probs').
            probs = tf.nn.softmax(logits, name='probs')

            # Loss function: uniform weights over batch x time.
            self.cost = seq2seq.sequence_loss(
                logits, self.targets,
                tf.ones([input_data_shape[0], input_data_shape[1]]))
            tf.summary.scalar('train_loss', self.cost)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(self.lr)

            # Gradient Clipping
            gradients = optimizer.compute_gradients(self.cost)
            capped_gradients = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                                for grad, var in gradients
                                if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gradients)

            self.merged_summaries = tf.summary.merge_all()
            self.train_writer = tf.summary.FileWriter(summary_output_dir,
                                                      graph=self.train_graph)
Beispiel #20
0
    def build_decoder(self):
        """Build the Gumbel-softmax decoder, its autoencoding loss against
        the encoder inputs, and the optimizer."""
        print("building decoder and attention..")
        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

            # Projection layer is currently unused (see commented-out
            # output_layer in BasicDecoder below).
            output_layer = Dense(self.num_symbols, name='output_projection')
            start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            helper = GumbelSoftmaxEmbeddingHelper(embedding=self.embeddings, start_tokens=start_tokens,end_token= end_token, tau=self.tau)

            # Decode for at most the longest encoder input.
            max_decoder_length = tf.reduce_max(self.encoder_inputs_length)
            decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cell, helper=helper, initial_state=self.decoder_initial_state)#, output_layer=output_layer)
            (self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_decoder_length,impute_finished=True))
            self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)#IMPORTANT
            # NOTE(review): argmax over sample_id's last axis — assumes the
            # Gumbel helper emits a distribution per step; confirm.
            self.decoder_pred_decode = tf.argmax(self.decoder_outputs_train.sample_id, axis=-1, output_type=tf.int32)#IMPORTANT

            #newintput = data_utils.insertSequence(self.decoder_pred_decode.eval(), self.encoder_inputs.eval(),1, self.total_num)
            '''
            _loss = 0
            for i in range(self.detector.batch_size):
                source, source_len = data_utils.prepare_batch(newintput[i:i*self.detector.batch_size], self.detector.stride, self.detector.maxlen, self.detector.batch_size)
                _, logits = self.detector.predict(self.sess, source, source_len)
                _loss += logits[0] - logits[1]
            '''
            # Token-level reconstruction accuracy vs. the encoder inputs.
            self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.encoder_inputs, self.decoder_pred_decode), self.dtype))
            masks = tf.sequence_mask(lengths=self.encoder_inputs_length, maxlen=max_decoder_length, dtype=self.dtype, name='masks')
            # Autoencoding loss: decode back to the encoder inputs.
            self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.encoder_inputs, weights=masks)
            #self.loss = _loss + np.sum(self.decoder_pred_decode**masks**2)/np.sum(masks)/2
            tf.summary.scalar('loss', self.loss)
            self.init_optimizer()
Beispiel #21
0
    def _define_loss(self, sampled_softmax):
        """VAE objective: sampled-softmax reconstruction loss plus an
        annealed KL divergence term, with summaries and an Adam op."""
        # Reconstruction term: sequence loss averaged over time and batch.
        self.loss_reconstruct = seq2seq.sequence_loss(
            logits=self.decoder_outputs_train,
            targets=self.decoder_targets,
            weights=self.decoder_weights,
            softmax_loss_function=sampled_softmax,
            average_across_timesteps=True,
            average_across_batch=True)

        # KL(q(z|x) || N(0, I)) for a diagonal-Gaussian posterior.
        kl_per_example = -0.5 * tf.reduce_sum(
            1 + self.encoder_state_logsigma
            - tf.pow(self.encoder_state_mu, 2)
            - tf.exp(self.encoder_state_logsigma), axis=1)
        self.KL = tf.reduce_mean(kl_per_example)

        self.loss = tf.add(self.annealing_term * self.KL, self.loss_reconstruct)

        # Keep track of all three cost terms.
        tf.summary.scalar('loss_reconstruct', self.loss_reconstruct)
        tf.summary.scalar('KL', self.KL)
        tf.summary.scalar('loss', self.loss)

        adam = tf.train.AdamOptimizer(learning_rate=self.args.learning_rate,
                                      beta1=0.9,
                                      beta2=0.999,
                                      epsilon=1e-08)
        self.opt_op = adam.minimize(self.loss)
Beispiel #22
0
    def build_training_graph(self, input, input_len, target, target_mask=None):
        """Run the LM over `input` and return the masked sequence loss
        against `target`.

        Args:
            input: token ids, [batch, time].
            input_len: true lengths, used to build target_mask if absent.
            target: target token ids, [batch, time].
            target_mask: optional float weights; derived from input_len
                when None.
        """
        batch_max_seq_len = tf.shape(input)[1]

        def step(hparams, tokens, past=None):
            # One forward pass of the LM; returns logits and KV presents.
            lm_output = model.model(hparams=hparams,
                                    X=tokens,
                                    past=past,
                                    reuse=tf.AUTO_REUSE)
            logits = lm_output['logits']
            presents = lm_output['present']
            presents.set_shape(
                model.past_shape(hparams=hparams, batch_size=None))
            return {
                'logits': logits,
                'presents': presents,
            }

        with tf.name_scope('sample_sequence'):
            all_logits = step(hparams=self.hparams, tokens=input)['logits']
        with tf.name_scope('loss'):
            if target_mask is None:
                # Mask padding beyond each input's true length.
                target_mask = tf.sequence_mask(input_len,
                                               maxlen=batch_max_seq_len,
                                               dtype=tf.float32)
        # NOTE(review): the loss op is built outside the 'loss' name scope —
        # likely meant to be indented under the `with` above (cosmetic only).
        cost = sequence_loss(logits=all_logits,
                             targets=target,
                             weights=target_mask)
        return cost
Beispiel #23
0
    def add_loss_op(self, output):
        """Adds loss ops to the computational graph.

        Wraps the 2-D logits, the flattened labels, and the uniform weights
        in a leading singleton batch dimension so they match
        sequence_loss's expected [batch, time, ...] layout.

        FIX vs. original: removed the unused `logits` local and the large
        block of dead commented-out experiments.

        Args:
          output: A tensor of shape (None, self.vocab)
        Returns:
          loss: A 0-d tensor (scalar)
        """
        ### YOUR CODE HERE
        # Singleton batch dimension for the logits.
        e = tf.expand_dims(input=output, axis=0)
        print(e)
        targets = self.labels_placeholder
        f = tf.expand_dims(input=tf.reshape(targets, [-1]), axis=0)
        print(targets)
        # Uniform per-token weights (no padding mask available here).
        weights = tf.ones((self.config.batch_size * self.config.num_steps))
        print(weights)
        g = tf.expand_dims(input=weights, axis=0)
        loss = sequence_loss(e, f, g)
        ### END YOUR CODE
        return loss
Beispiel #24
0
def seq_net(name, inputs, targets, sl, n_items, n_cates, cate_list, u_emb, rank, is_training, reuse):
  """Build a training decoder over `inputs` and return its masked sequence
  loss against `targets` (averaged over time, per-example over batch),
  plus the output and target shapes for debugging.

  Args:
    name: variable-scope prefix.
    inputs: decoder input embeddings, [batch, time, dim].
    targets: target ids, [batch, time].
    sl: per-example sequence lengths.
    n_items, n_cates, cate_list: sizing/config for the output projection.
    u_emb, rank: passed through to build_decoder_cell.
    is_training, reuse: currently unused here — TODO confirm with caller.
  """
  with tf.variable_scope(name+'-rnn'):
    # NOTE(review): this Dense takes extra positional args (n_cates+1,
    # cate_list) — presumably a project-local Dense, not tf.layers.Dense.
    output_layer = Dense(n_items+2, n_cates+1, cate_list, activation=None, name='output_projection')
    training_helper = seq2seq.TrainingHelper(
        inputs=inputs,
        sequence_length=sl,
        time_major=False)

    cell, initial_state = build_decoder_cell(rank, u_emb, tf.shape(inputs)[0])
    training_decoder = seq2seq.BasicDecoder(
        cell=cell,
        helper=training_helper,
        initial_state=initial_state,
        output_layer=output_layer)

    # Decode for at most the longest sequence in the batch.
    max_decoder_length = tf.reduce_max(sl)
    output, _, _ = seq2seq.dynamic_decode(
        decoder=training_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_decoder_length)

    output = tf.identity(output.rnn_output)
    # Zero-weight padded timesteps beyond each sequence's length.
    mask = tf.sequence_mask(
        lengths=sl,
        maxlen=max_decoder_length,
        dtype=tf.float32)
    # Per-example loss vector (batch axis kept).
    loss = seq2seq.sequence_loss(
        logits=output,
        targets=targets,
        weights=mask,
        average_across_timesteps=True,
        average_across_batch=False)
  return loss, tf.shape(output), tf.shape(targets)
Beispiel #25
0
    def _build_loss(self):
        """Sequence loss with a sampled-softmax loss function (vocab_size/10
        negatives) to avoid the full-vocabulary softmax."""
        config = self.config

        def sampled_loss(labels, inputs):
            # Per-step sampled softmax, invoked by sequence_loss.
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats to
            # avoid numerical instabilities.
            local_w_t = tf.cast(tf.transpose(self.output_projection[0]),
                                tf.float32)
            local_b = tf.cast(self.output_projection[1], tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=local_w_t,
                                           biases=local_b,
                                           labels=labels,
                                           inputs=local_inputs,
                                           num_sampled=self.vocab_size // 10,
                                           num_classes=self.vocab_size),
                tf.float32)

        # Weights mask padding beyond each example's length in self.x.
        self.loss = seq2seq.sequence_loss(logits=self.decoder_logits,
                                          targets=self.decoder_train_targets,
                                          weights=tf.sequence_mask(
                                              self.x_length,
                                              tf.shape(self.x)[1],
                                              dtype=tf.float32,
                                              name='masks'),
                                          softmax_loss_function=sampled_loss,
                                          name='loss')
        tf.summary.scalar(self.loss.op.name, self.loss)
        tf.add_to_collection('ema/scalar', self.loss)
Beispiel #26
0
def _compute_loss(logits, target_output, target_weights, batch_size):
    """Cross-entropy sequence loss for the decoder.

    Args:
        logits: predictions, shape [batch_size, num_steps, num_classes].
        target_output: target token ids; the leading (<GO>) column is dropped.
        target_weights: per-timestep weighting mask.
        batch_size: unused here; kept for caller compatibility.
    """
    # Original note (translated): idea — tf.logical_and(target_weights, new_weight)
    shifted_targets = target_output[:, 1:]
    return sequence_loss(logits=logits,
                         targets=shifted_targets,
                         weights=target_weights)
Beispiel #27
0
    def build_train_decoder(self):
        """Build the training decoder, its masked sequence loss, the summary
        ops, and finally the optimizer.

        The decoder inputs are the targets shifted right by one step with a
        <GO> token prepended, embedded through ``self.embedding``.
        """
        print('Building train decoder...')

        # Shift targets right and prepend <GO>: the decoder input at step t
        # is the target token from step t-1.
        ending = tf.strided_slice(self.decoder_targets, [0, 0], [self.batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([self.batch_size, 1], self.word_to_id['<GO>']), ending], 1)
        decoder_inputs_embedded = tf.nn.embedding_lookup(self.embedding, decoder_input)

        if self.teacher_forcing:
            # Scheduled sampling: with probability `sampling_probability` the
            # helper feeds back the model's own sampled token instead of the
            # ground-truth one.
            # NOTE(review): `sampling_probability` is the chance of *not*
            # teacher forcing, so a larger `teacher_forcing_probability`
            # actually means less teacher forcing — confirm this is intended.
            training_helper = ScheduledEmbeddingTrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_targets_length,
                embedding=self.embedding,
                sampling_probability=self.teacher_forcing_probability,
                time_major=False,
                name='teacher_forcing_training_helper'
            )
        else:
            # Plain teacher forcing: always feed the ground-truth token.
            training_helper = TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_targets_length,
                time_major=False,
                name='training_helper'
            )

        training_decoder = BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.output_layer
        )

        decoder_outputs, _, _ = dynamic_decode(
            decoder=training_decoder,
            impute_finished=True,
            maximum_iterations=self.max_target_sequence_length
        )

        self.decoder_logits_train = tf.identity(decoder_outputs.rnn_output)

        # Weighted cross-entropy loss for a sequence of logits:
        #   logits:  [batch_size, sequence_length, num_decoder_symbols],
        #            the prediction across all classes at each timestep.
        #   targets: [batch_size, sequence_length], true class per timestep.
        #   weights: [batch_size, sequence_length], weighting of each
        #            prediction in the sequence (here: self.mask).
        # NOTE(review): dynamic_decode may emit fewer timesteps than
        # self.decoder_targets has — confirm the shapes line up (or pad the
        # logits) before relying on this loss.
        self.loss = sequence_loss(
            logits=self.decoder_logits_train,
            targets=self.decoder_targets,
            weights=self.mask
        )

         # summary
        tf.summary.scalar('loss', self.loss)  # single scalar Summary proto
        self.summary_op = tf.summary.merge_all()  # merge all graph summaries

        self.build_optimizer()
Beispiel #28
0
 def losses(self):
     """Masked cross-entropy loss over the decoder outputs.

     Also logs the loss as a scalar summary under scope "loss".
     """
     with tf.variable_scope("loss"):
         max_steps = tf.to_int32(tf.shape(self.decoder_input_data)[1])
         mask = tf.sequence_mask(tf.to_int32(self.output_seq_len), max_steps)
         seq_loss = seq2seq.sequence_loss(
             self.logits,
             self.decoder_input_label,
             tf.to_float(mask))
         tf.summary.scalar("loss", seq_loss)
     return seq_loss
Beispiel #29
0
def createModel(int_to_vocab):
    """Build the RNN language-model training graph and train it end-to-end,
    saving the model and its parameters afterwards.

    NOTE(review): relies on module-level globals (rnn_size, embed_dim,
    int_text, batch_size, seq_length, num_epochs, learning_rate,
    show_every_n_batches, save_dir, helper) — confirm they are defined
    before calling.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Total vocabulary size.
        vocab_size = len(int_to_vocab)
        # Input, target and learning-rate placeholders.
        input_text, targets, lr = get_inputs()
        # Shape of the input batch.
        input_data_shape = tf.shape(input_text)
        # Build the RNN cell (LSTM + dropout) and its initial-state node;
        # rnn_size is the number of units in each LSTM cell.
        cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
        # Nodes producing the logits and the final RNN state.
        logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size,
                                       embed_dim)
        # Softmax over the logits gives the prediction probabilities.
        probs = tf.nn.softmax(logits, name='probs')
        # Sequence cross-entropy loss, uniformly weighted over all positions.
        cost = seq2seq.sequence_loss(
            logits, targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))
        # Adam optimizer.
        optimizer = tf.train.AdamOptimizer(lr)
        # Clip every gradient into [-1, 1] before applying.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
        #return train_op, train_graph, initial_state, input_text, targets
        # All training batches.
        batches = get_batches(int_text, batch_size, seq_length)
        # Open a session on the graph built above and train.
        with tf.Session(graph=train_graph) as sess:
            sess.run(tf.global_variables_initializer())
            for epoch_i in range(num_epochs):
                state = sess.run(initial_state, {input_text: batches[0][0]})
                for batch_i, (x, y) in enumerate(batches):
                    feed = {
                        input_text: x,
                        targets: y,
                        initial_state: state,
                        lr: learning_rate
                    }
                    train_loss, state, _ = sess.run(
                        [cost, final_state, train_op], feed)
                    # Periodically print training progress.
                    if (epoch_i * len(batches) +
                            batch_i) % show_every_n_batches == 0:
                        print(
                            'Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.
                            format(epoch_i, batch_i, len(batches), train_loss))
            # Save the trained model and the parameters needed to reload it.
            saver = tf.train.Saver()
            saver.save(sess, save_dir)
            print('Model Trained and Saved')
            helper.save_params((seq_length, save_dir))
Beispiel #30
0
    def _init_optimiser(self):
        """Build the batch loss (masked cross-entropy plus optional L2
        regularisation) and the gradient-clipped training op for the
        configured optimiser ('Adam', 'AdamW' or 'Momentum')."""

        self.current_lr = self._hparams.learning_rate

        # Per-timestep mask from the label lengths; positions past a
        # sequence's end contribute zero loss.
        self._loss_weights = tf.sequence_mask(lengths=self._labels_length,
                                              dtype=self._hparams.dtype)

        self.batch_loss = seq2seq.sequence_loss(
            logits=self.decoder_train_outputs,
            targets=self._labels,
            weights=self._loss_weights,
            softmax_loss_function=None,
            average_across_batch=True,
            average_across_timesteps=True)

        self.reg_loss = 0

        # Optional L2 regularisation over the recurrent weights only.
        if self._hparams.recurrent_l2_regularisation is not None:
            regularisable_vars = _get_trainable_vars(self._hparams.cell_type)
            reg = tf.contrib.layers.l2_regularizer(
                scale=self._hparams.recurrent_l2_regularisation)
            self.reg_loss = tf.contrib.layers.apply_regularization(
                reg, regularisable_vars)

        self.batch_loss = self.batch_loss + self.reg_loss

        # Optimiser selection; a larger epsilon is used for non-float32
        # dtypes (e.g. float16) for numerical stability.
        if self._hparams.optimiser == 'Adam':
            optimiser = tf.train.AdamOptimizer(
                learning_rate=self.current_lr,
                epsilon=1e-8 if self._hparams.dtype == tf.float32 else 1e-4,
            )
        elif self._hparams.optimiser == 'AdamW':
            from tensorflow.contrib.opt import AdamWOptimizer
            optimiser = AdamWOptimizer(
                learning_rate=self.current_lr,
                weight_decay=self._hparams.weight_decay,
                epsilon=1e-8 if self._hparams.dtype == tf.float32 else 1e-4,
            )
        elif self._hparams.optimiser == 'Momentum':
            optimiser = tf.train.MomentumOptimizer(
                learning_rate=self.current_lr,
                momentum=0.9,
                use_nesterov=False)
        else:
            raise Exception('Unsupported optimiser, try Adam')

        # Optionally clip the global gradient norm before applying updates.
        variables = tf.trainable_variables()
        gradients = tf.gradients(self.batch_loss, variables)

        if self._hparams.clip_gradients is True:
            gradients, _ = tf.clip_by_global_norm(
                gradients, self._hparams.max_gradient_norm)

        self.train_op = optimiser.apply_gradients(
            grads_and_vars=zip(gradients, variables),
            global_step=tf.train.get_global_step())
  def _init_optimizer(self):
    """Build the sequence loss and a gradient-clipped Adam train op.

    Fixes vs. the previous revision:
      * The logits are passed to ``seq2seq.sequence_loss`` raw — the old
        code applied ``tf.nn.softmax`` first, but ``sequence_loss`` computes
        softmax cross-entropy internally, so normalising twice produced a
        wrong (flattened) loss and gradients.
      * ``print`` statements use the function form (Python 3 compatible).
    """
    # Both tensors arrive time-major; transpose to [batch, time(, vocab)].
    self.targets = tf.transpose(self.decoder_train_targets, [1, 0])
    self.logits = tf.transpose(self.logits, [1, 0, 2])
    print('targets:', self.targets)

    print('logits:', self.logits)

    # sequence_loss applies softmax cross-entropy itself; feed raw logits.
    self.loss = seq2seq.sequence_loss(logits=self.logits, targets=self.targets,
                                          weights=self.loss_weights)
    print('self.loss:', self.loss)
    # define train op: clip the global gradient norm to 10, then Adam.
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), 10)

    optimizer = tf.train.AdamOptimizer(1e-3)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Beispiel #32
0
    def __init__(self, int_to_vocab, config):
        """
        Construct Recurrent Neural Network.

        Args:
            int_to_vocab: Mapping from integer token id to vocab word; only
                its length (the vocabulary size) is used here.
            config: Dictionary of configuration parameters
                ('lstm_layers', 'rnn_size', 'embed_dim', 'dropout_keep_prob').
        """

        lstm_layers = config['lstm_layers']
        rnn_size = config['rnn_size']
        embed_dim = config['embed_dim']
        dropout_keep_prob = config['dropout_keep_prob']

        self.train_graph = tf.Graph()

        with self.train_graph.as_default():

            # Fixed seed for reproducible weight initialisation.
            tf.set_random_seed(1234)

            vocab_size = len(int_to_vocab)

            self.input_text, self.targets, self.lr = get_inputs()

            input_data_shape = tf.shape(self.input_text)

            # Unpack the shape of what the RNN outputs (array) to the shape that the RNN expects in its next training
            #  step (tuples). Layout: [layer, (c, h), batch, rnn_size].
            self.init_state = tf.placeholder(tf.float32,
                                             [lstm_layers, 2, None, rnn_size], name='initial_state')

            state_per_layer_list = tf.unstack(self.init_state, axis=0)

            # Rebuild the per-layer LSTMStateTuple(c, h) structure the cell
            # expects from the flat placeholder.
            rnn_tuple_state = tuple(
                [tf.nn.rnn_cell.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
                 for idx in range(lstm_layers)]
            )

            cell = get_init_cell(rnn_size, dropout_keep_prob, lstm_layers)

            logits, self.final_state = build_nn(cell,
                                                rnn_tuple_state,
                                                self.input_text,
                                                vocab_size,
                                                embed_dim)

            # Probabilities for generating words.
            # Not used locally but referred to by tensor name ('probs')
            # during text generation.
            probs = tf.nn.softmax(logits, name='probs')

            # Loss function: uniformly-weighted sequence cross-entropy.
            self.cost = seq2seq.sequence_loss(
                logits,
                self.targets,
                tf.ones([input_data_shape[0], input_data_shape[1]]))

            # Optimizer
            optimizer = tf.train.AdamOptimizer(self.lr)

            # Gradient Clipping: clamp each gradient into [-1, 1].
            gradients = optimizer.compute_gradients(self.cost)
            capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                                for grad, var in gradients if grad is not None]

            self.train_op = optimizer.apply_gradients(capped_gradients)
Beispiel #33
0
    def __init__(self, vocab_size, hidden_size, dropout,
                 num_layers, max_gradient_norm, batch_size, learning_rate,
                 lr_decay_factor, max_target_length,
                 max_source_length, decoder_mode=False):
        '''
        vocab_size: number of vocab tokens
        hidden_size: dimension of hidden layers
        dropout: input keep probability for the LSTM dropout wrappers
        num_layers: number of hidden layers
        max_gradient_norm: maximum gradient magnitude
        batch_size: number of training examples fed to network at once
        learning_rate: starting learning rate of network
        lr_decay_factor: amount by which to decay learning rate
        max_target_length: maximum length of a target sequence
        max_source_length: maximum length of a source sequence
        decoder_mode: if True, build a beam-search inference decoder
            instead of the training decoder (no loss / backward-pass nodes)

        Fixes vs. the previous revision:
          * ``GOD_ID`` typo (NameError when decoder_mode=True) -> ``GO_ID``.
          * ``seq2seq.BeamSearchDecoder`` was called without its required
            ``cell`` argument.
          * ``rnn.MultiRNNCell([cell] * num_layers)`` reused one cell object
            for every layer (and both RNN directions), which TF >= 1.0
            rejects / silently shares weights; every layer and direction now
            gets its own cell instance.
        '''
        GO_ID = config.GO_ID
        EOS_ID = config.EOS_ID
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = learning_rate
        self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
        self.source_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='source_lengths')

        self.decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
        self.target_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name="target_lengths")

        def _lstm_stack():
            # Fresh cells on every call — never share one cell object across
            # layers or across the forward/backward directions.
            cells = []
            for _ in range(num_layers):
                cell = rnn.LSTMCell(hidden_size)
                cells.append(rnn.DropoutWrapper(cell, input_keep_prob=dropout))
            return rnn.MultiRNNCell(cells)

        with tf.variable_scope('embeddings') as scope:
            embeddings = tf.Variable(tf.random_uniform([vocab_size, hidden_size], -1.0, 1.0), dtype=tf.float32)
            encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_inputs)
            targets_embedding = tf.nn.embedding_lookup(embeddings, self.decoder_targets)

        with tf.variable_scope('encoder') as scope:
            _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=_lstm_stack(),
                                                               cell_bw=_lstm_stack(),
                                                               sequence_length=self.source_lengths,
                                                               inputs=encoder_inputs_embedded,
                                                               dtype=tf.float32,
                                                               time_major=False)

        with tf.variable_scope('decoder') as scope:
            decoder_cell = _lstm_stack()

            #TODO add attention
            #seq2seq.BahdanauAttention(num_units=,memory=encoder_output)

            #decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
            #                                        attention_mechanism=)

        if decoder_mode:
            # NOTE(review): encoder_state is a (fw, bw) pair of multi-layer
            # states; using encoder_state[0] (the forward stack) assumes it
            # matches decoder_cell's state shape — confirm, and tile the
            # initial state by beam_width if the decoder requires it.
            decoder = seq2seq.BeamSearchDecoder(cell=decoder_cell,
                                                embedding=embeddings,
                                                start_tokens=tf.tile([GO_ID], [batch_size]),
                                                end_token=EOS_ID,
                                                initial_state=encoder_state[0],
                                                beam_width=2)
        else:
            helper = seq2seq.TrainingHelper(inputs=targets_embedding,
                                            sequence_length=self.target_lengths)

            decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                           helper=helper,
                                           initial_state=encoder_state[-1],
                                           output_layer=Dense(vocab_size))

        final_outputs, final_state, final_sequence_lengths =\
                            seq2seq.dynamic_decode(decoder=decoder)

        self.logits = final_outputs.rnn_output

        if not decoder_mode:
            with tf.variable_scope("loss") as scope:
                # dynamic_decode stops at the longest finished sequence, so
                # the logits must be padded out to max_target_length to be
                # shape-consistent with the (fixed-size) targets.
                pad_size = self.max_target_length - tf.reduce_max(final_sequence_lengths)
                self.logits = tf.pad(self.logits, [[0, 0], [0, pad_size], [0, 0]])

                weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                           maxlen=self.max_target_length,
                                           dtype=tf.float32,
                                           name='weights')

                x_entropy_loss = seq2seq.sequence_loss(logits=self.logits,
                                                       targets=self.decoder_targets,
                                                       weights=weights)

                self.loss = tf.reduce_mean(x_entropy_loss)

            optimizer = tf.train.AdamOptimizer()
            gradients = optimizer.compute_gradients(x_entropy_loss)
            # Clamp each gradient value into [-max_gradient_norm, max_gradient_norm].
            capped_grads = [(tf.clip_by_value(grad, -max_gradient_norm, max_gradient_norm), var) for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(capped_grads,
                                                      global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables())
Beispiel #34
0
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    """Build an attention-based seq2seq network.

    Returns:
        (placeholders_dict, loss) when training (is_generating=False), or
        (placeholders_dict, predicted_ids) for beam-search generation.

    Fix vs. the previous revision: the encoder-output projections reshaped
    the bidirectional LSTM outputs with `embedding_dim`-based sizes, but
    those tensors have last dimension `encoder_size` per direction — the
    graph only built when embedding_dim == encoder_size. They now use
    `encoder_size * 2` and `encoder_size`.
    """
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm
    # (each direction emits encoder_size units, so the concat is twice that)
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H from the backward RNN's first step
    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            backward_first, shape=[-1, encoder_size]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm (C zeroed, H bootstrapped)
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell; the attention memories are tiled by
    # beam_size when generating so every beam sees the encoder outputs
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec
        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else
        seq2seq.tile_batch(encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else
        seq2seq.tile_batch(src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=training_helper,
            initial_state=initial_state,
            output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(
            decoder_logits_train, axis=-1, name='decoder_pred_train')
        # zero-weight the padding positions in the loss
        masks = tf.sequence_mask(
            lengths=trg_sequence_length,
            maxlen=max_decoder_length,
            dtype=tf.float32,
            name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(
            logits=decoder_logits_train,
            targets=lbl_word_idx,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
                               tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            #impute_finished=True,# error occurs
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids