def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id, max_target_sequence_length, vocab_size, output_layer, batch_size, keep_prob):
    """Create a decoding layer for inference (greedy unrolling).

    :param encoder_state: Encoder state used to initialize the decoder
    :param dec_cell: Decoder RNN cell
    :param dec_embeddings: Decoder embedding matrix
    :param start_of_sequence_id: GO id fed at the first timestep
    :param end_of_sequence_id: EOS id that terminates decoding
    :param max_target_sequence_length: Maximum number of decoding steps
    :param vocab_size: Size of decoder/target vocabulary (unused here)
    :param output_layer: Projection layer applied to decoder outputs
    :param batch_size: Batch size
    :param keep_prob: Dropout keep probability (unused here)
    :return: BasicDecoderOutput containing inference logits and sample_id
    """
    # One GO token per batch element.
    go_tokens = tf.tile(
        tf.constant([start_of_sequence_id], dtype=tf.int32),
        multiples=[batch_size])
    # At each step the argmax prediction is embedded and fed back.
    greedy_helper = seq2seq.GreedyEmbeddingHelper(
        dec_embeddings,
        start_tokens=go_tokens,
        end_token=end_of_sequence_id)
    inference_decoder = seq2seq.BasicDecoder(
        dec_cell,
        helper=greedy_helper,
        initial_state=encoder_state,
        output_layer=output_layer)
    # dynamic_decode returns (outputs, final_state, lengths); only the
    # outputs are needed here.
    outputs, _, _ = seq2seq.dynamic_decode(
        inference_decoder,
        impute_finished=True,
        maximum_iterations=max_target_sequence_length)
    return outputs
def get_DecoderHelper(embedding_lookup, seq_lengths, token_dim,
                      gt_tokens=None, unroll_type='teacher_forcing',
                      sample_prob=1.0, batch_size=None):
    """Build a seq2seq Helper for the requested unrolling mode.

    Bug fixed: the original module-level function read ``self.sample_prob``
    and ``self.batch_size`` although there is no ``self`` in scope, raising
    NameError whenever the 'scheduled_sampling' or 'greedy' branch was
    taken. Both values are now explicit keyword parameters (backward
    compatible: existing teacher-forcing callers are unaffected).

    Args:
        embedding_lookup: callable mapping token ids to embeddings.
        seq_lengths: int tensor of sequence lengths (teacher forcing and
            scheduled sampling).
        token_dim: vocabulary size; greedy mode derives its start token
            (== token_dim) and end token (== token_dim - 1) from it.
        gt_tokens: ground-truth token ids; required unless greedy.
        unroll_type: 'teacher_forcing', 'scheduled_sampling' or 'greedy'.
        sample_prob: probability of sampling from the ground truth under
            scheduled sampling (1.0 = always ground truth, 0.0 = always
            the model's own prediction).
        batch_size: batch size, required for greedy unrolling.

    Returns:
        A tf.contrib.seq2seq Helper instance.

    Raises:
        ValueError: on a missing required argument or unknown unroll_type.
    """
    if unroll_type == 'teacher_forcing':
        if gt_tokens is None:
            raise ValueError('teacher_forcing requires gt_tokens')
        embedding = embedding_lookup(gt_tokens)
        helper = seq2seq.TrainingHelper(embedding, seq_lengths)
    elif unroll_type == 'scheduled_sampling':
        if gt_tokens is None:
            raise ValueError('scheduled_sampling requires gt_tokens')
        embedding = embedding_lookup(gt_tokens)
        # The helper's sampling_probability is the chance of feeding back
        # the model's own prediction, hence 1 - sample_prob.
        helper = seq2seq.ScheduledEmbeddingTrainingHelper(
            embedding, seq_lengths, embedding_lookup,
            1.0 - sample_prob, seed=None, scheduling_seed=None)
    elif unroll_type == 'greedy':
        if batch_size is None:
            raise ValueError('greedy unrolling requires batch_size')
        # During evaluation, we perform greedy unrolling.
        start_token = tf.zeros([batch_size], dtype=tf.int32) + token_dim
        end_token = token_dim - 1
        helper = seq2seq.GreedyEmbeddingHelper(
            embedding_lookup, start_token, end_token)
    else:
        raise ValueError('Unknown unroll type')
    return helper
def get_helper(encoder_output, input_emb, input_len, embedding, mode, params):
    """Return the decoding helper: teacher forcing in TRAIN mode, greedy otherwise.

    Args:
        encoder_output: object exposing `.output` (used for batch size) and
            `.state`; exact shapes are not visible here — assumed
            [batch, time, hidden] and a per-batch state. TODO confirm.
        input_emb: embedded decoder inputs (TRAIN mode only).
        input_len: decoder input lengths, including the final token.
        embedding: embedding matrix for greedy inference decoding.
        mode: tf.estimator ModeKeys value.
        params: dict with keys 'conditional', 'start_token', 'end_token'.
    """
    batch_size = tf.shape(encoder_output.output)[0]
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['conditional']:
            # conditional train helper with encoder output state as direct input
            # Reshape encoder state as auxiliary input:
            # batch_size * hidden -> batch_size * max_len * hidden
            decoder_length = tf.shape(input_emb)[1]
            state_shape = tf.shape(encoder_output.state)
            # NOTE(review): tf.reshape reorders elements, it does NOT swap
            # axes; this is only equivalent to a transpose when
            # state_shape[0] == 1 — confirm the state layout.
            encoder_state = tf.tile(
                tf.reshape(encoder_output.state,
                           [state_shape[1], state_shape[0], state_shape[2]]),
                [1, decoder_length, 1])
            input_emb = tf.concat([encoder_state, input_emb], axis=-1)
        helper = seq2seq.TrainingHelper(
            inputs=input_emb,  # batch_size * max_len-1 * emb_size
            sequence_length=input_len - 1,  # exclude last token
            time_major=False,
            name='training_helper')
    else:
        # Inference: argmax prediction fed back through the embedding.
        # `embedding_func` is defined elsewhere in this module.
        helper = seq2seq.GreedyEmbeddingHelper(
            embedding=embedding_func(embedding),
            start_tokens=tf.fill([batch_size], params['start_token']),
            end_token=params['end_token'])
    return helper
def language_decoder(inputs, embed_seq, seq_len, embedding_lookup, dim, start_tokens, end_token, max_seq_len, unroll_type='teacher_forcing', output_layer=None, is_train=True, scope='language_decoder', reuse=tf.AUTO_REUSE):
    """LSTM language decoder with teacher-forced or greedy unrolling.

    Args:
        inputs: conditioning features projected into the initial LSTM state
        embed_seq: pre-embedded token sequence for teacher forcing
        seq_len: sequence lengths for teacher forcing
        embedding_lookup: embedding lookup function for greedy unrolling
        start_tokens: tensor of start tokens, [<s>] * batch_size
        end_token: integer end-of-sequence token id <e>
    Returns:
        (rnn_output, sample_id, decoded_lengths)
    """
    with tf.variable_scope(scope, reuse=reuse) as scope:
        # Project the conditioning input into both halves of the LSTM state.
        cell_state = fc_layer(
            inputs, dim, use_bias=True, use_bn=False, activation_fn=None,
            is_training=is_train, scope='Linear_c', reuse=reuse)
        hidden_state = fc_layer(
            inputs, dim, use_bias=True, use_bn=False, activation_fn=None,
            is_training=is_train, scope='Linear_h', reuse=reuse)
        initial_state = rnn.LSTMStateTuple(cell_state, hidden_state)
        log.warning(scope.name)

        if unroll_type == 'teacher_forcing':
            helper = seq2seq.TrainingHelper(embed_seq, seq_len)
        elif unroll_type == 'greedy':
            helper = seq2seq.GreedyEmbeddingHelper(
                embedding_lookup, start_tokens, end_token)
        else:
            raise ValueError('Unknown unroll_type')

        lstm = rnn.BasicLSTMCell(num_units=dim, state_is_tuple=True)
        decoder = seq2seq.BasicDecoder(
            lstm, helper, initial_state, output_layer=output_layer)
        outputs, _, decoded_lengths = seq2seq.dynamic_decode(
            decoder, maximum_iterations=max_seq_len, scope='dynamic_decoder')
        return outputs.rnn_output, outputs.sample_id, decoded_lengths
def _build_decoder_test_greedy(self):
    r"""Build the greedy test decoder.

    At each timestep the most likely decoded symbol is embedded and fed
    back as the input for the next timestep, until EOS or the maximum
    label length is reached.
    """
    self._helper_greedy = seq2seq.GreedyEmbeddingHelper(
        embedding=self._embedding_matrix,
        start_tokens=tf.tile([self._GO_ID], [self._batch_size]),
        end_token=self._EOS_ID)

    # Optionally wrap the decoder cells with attention.
    if self._hparams.enable_attention is True:
        cells, initial_state = self._add_attention(
            decoder_cells=self._decoder_cells, beam_search=False)
    else:
        cells = self._decoder_cells
        initial_state = self._decoder_initial_state

    self._decoder_inference = seq2seq.BasicDecoder(
        cell=cells,
        helper=self._helper_greedy,
        initial_state=initial_state,
        output_layer=self._dense_layer)

    outputs, states, _ = seq2seq.dynamic_decode(
        self._decoder_inference,
        impute_finished=True,
        swap_memory=False,
        maximum_iterations=self._hparams.max_label_length)

    self.inference_outputs = outputs.rnn_output
    self.inference_predicted_ids = outputs.sample_id
    if self._hparams.write_attention_alignment is True:
        self.attention_summary = self._create_attention_alignments_summary(states)
def _response_generator(self):
    """Decode a response, using a caller-supplied helper factory when
    available and greedy embedding decoding otherwise."""
    with tf.name_scope('response_generator'):
        batch_size, _ = tf.unstack(tf.shape(self.inputs))
        logits_projection = Dense(self._word_embeddings_shape[0],
                                  name='logits_projection')
        decoder_cell, decoder_initial_state = self._decoder_cell()

        if self._decoder_helper_initializer is None:
            # Default greedy decoding; GO id is 0 and EOS id is 1.
            helper = seq2seq.GreedyEmbeddingHelper(
                embedding=self._word_embeddings,
                start_tokens=tf.tile([0], [batch_size]),
                end_token=1)
        else:
            helper = self._decoder_helper_initializer(self._word_embeddings)

        decoder = seq2seq.BasicDecoder(decoder_cell,
                                       helper=helper,
                                       initial_state=decoder_initial_state,
                                       output_layer=logits_projection)
        decoder_outputs, _, _ = seq2seq.dynamic_decode(
            decoder=decoder, impute_finished=True)

        self._decoder_logits = decoder_outputs.rnn_output
        self.decoder_token_ids = tf.argmax(self._decoder_logits, -1,
                                           output_type=tf.int32)
def get_DecoderHelper(embedding_lookup, seq_lengths, token_dim,
                      gt_tokens=None, sequence_type='program',
                      unroll_type='teacher_forcing',
                      sample_prob=1.0, batch_size=None, vocab=None):
    """Build a seq2seq Helper for the requested unrolling mode.

    Bug fixed: the original module-level function read ``self.sample_prob``,
    ``self.batch_size`` and ``self.vocab`` although there is no ``self`` in
    scope, raising NameError whenever the 'scheduled_sampling' or 'greedy'
    branch was taken. They are now explicit keyword parameters (backward
    compatible: teacher-forcing callers are unaffected).

    Args:
        embedding_lookup: callable mapping token ids to embeddings.
        seq_lengths: int tensor of sequence lengths.
        token_dim: vocabulary size; greedy mode derives its start token
            from it (and the end token for 'action' sequences).
        gt_tokens: ground-truth token ids; required unless greedy.
        sequence_type: 'program', 'action', or anything else (perceptions).
        unroll_type: 'teacher_forcing', 'scheduled_sampling' or 'greedy'.
        sample_prob: probability of sampling from the ground truth under
            scheduled sampling (1.0 = always ground truth).
        batch_size: batch size, required for greedy unrolling.
        vocab: vocabulary object with `token2int`, required for greedy
            unrolling of 'program' sequences.

    Returns:
        A tf.contrib.seq2seq Helper instance.

    Raises:
        ValueError: on a missing required argument or unknown unroll_type.
    """
    if unroll_type == 'teacher_forcing':
        if gt_tokens is None:
            raise ValueError('teacher_forcing requires gt_tokens')
        embedding = embedding_lookup(gt_tokens)
        helper = seq2seq.TrainingHelper(embedding, seq_lengths)
    elif unroll_type == 'scheduled_sampling':
        if gt_tokens is None:
            raise ValueError('scheduled_sampling requires gt_tokens')
        embedding = embedding_lookup(gt_tokens)
        # The helper's sampling_probability is the chance of feeding back
        # the model's own prediction, hence 1 - sample_prob.
        helper = seq2seq.ScheduledEmbeddingTrainingHelper(
            embedding, seq_lengths, embedding_lookup,
            1.0 - sample_prob, seed=None, scheduling_seed=None)
    elif unroll_type == 'greedy':
        if batch_size is None:
            raise ValueError('greedy unrolling requires batch_size')
        # During evaluation, we perform greedy unrolling.
        start_token = tf.zeros([batch_size], dtype=tf.int32) + token_dim
        if sequence_type == 'program':
            if vocab is None:
                raise ValueError("greedy 'program' unrolling requires vocab")
            end_token = vocab.token2int['m)']
        elif sequence_type == 'action':
            end_token = token_dim - 1
        else:
            # Hack to have no end token, greater than number of perceptions
            end_token = 11
        helper = seq2seq.GreedyEmbeddingHelper(
            embedding_lookup, start_token, end_token)
    else:
        raise ValueError('Unknown unroll type')
    return helper
def _init_decoder(self):
    """Build the decoder graph: a Luong-attention LSTM with a teacher-forced
    training path and a greedy inference path sharing the same variables.

    Populates:
        self.decoder_cell, self.decoder_outputs_train,
        self.decoder_logits_train, self.decoder_prediction_train,
        self.decoder_outputs_inference, self.decoder_prediction_inference.
    """
    lstm_decoder = tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.LSTMCell(self.rnn_size),
        output_keep_prob=self.keep_prob)
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        self.rnn_size, self.encoder_outputs, name='LuongAttention')
    self.decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        lstm_decoder,
        attention_mechanism,
        attention_layer_size=self.rnn_size,
        name="AttentionWrapper")
    batch_size = tf.shape(self.encoder_inputs)[0]
    # Start from the attention wrapper's zero state, but seed the inner
    # cell state with the encoder's final state.
    attn_zero = self.decoder_cell.zero_state(batch_size=batch_size,
                                             dtype=tf.float32)
    init_state = attn_zero.clone(cell_state=self.encoder_final_state)
    with tf.variable_scope(
            "decoder"
    ) as scope:  # Need to understand why we aren't using the dynamic_rnn method here
        output_layer = layers_core.Dense(units=self.effective_vocab_size,
                                         activation=None)
        # Train decoding: ground-truth inputs are fed at every step.
        train_helper = seq2seq.TrainingHelper(
            inputs=self.decoder_train_inputs_embedded,
            sequence_length=self.decoder_train_length,
            time_major=False)
        # NOTE: the training decoder has no output_layer; the projection is
        # applied afterwards via output_layer.apply (the inference decoder
        # instead projects inside the decoder).
        train_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                             helper=train_helper,
                                             initial_state=init_state)
        self.decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=train_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_decoder_seq_length,
            scope=scope)
        self.decoder_logits_train = output_layer.apply(
            self.decoder_outputs_train.rnn_output)
        self.decoder_prediction_train = tf.argmax(
            self.decoder_logits_train, 2)
        # Greedy decoding, reusing the training variables.
        scope.reuse_variables()
        greedy_helper = seq2seq.GreedyEmbeddingHelper(
            embedding=self.embedding_matrix,
            start_tokens=self.decoder_start_tokens,
            end_token=self.eos)
        greedy_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                              helper=greedy_helper,
                                              initial_state=init_state,
                                              output_layer=output_layer)
        self.decoder_outputs_inference, _, _ = seq2seq.dynamic_decode(
            decoder=greedy_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_decoder_seq_length,  # Figure out a better way of setting this
            scope=scope)
        self.decoder_prediction_inference = tf.argmax(
            self.decoder_outputs_inference.rnn_output, 2)
def greedy_eval_decoder(agenda, embeddings, extended_base_words, oov, start_token_id, stop_token_id, base_sent_hiddens, insert_word_embeds, delete_word_embeds, base_length, iw_length, dw_length, vocab_size, attn_dim, hidden_dim, num_layer, max_sentence_length, swap_memory, enable_dropout=False, dropout_keep=1., no_insert_delete_attn=False):
    """Greedy decoder for evaluation; reuses the training decoder variables.

    Returns:
        (outputs, final_state, lengths) from dynamic_decode.
    """
    with tf.variable_scope(OPS_NAME, 'decoder', reuse=True):
        batch_size = tf.shape(base_sent_hiddens)[0]

        # Helper expects int32 token ids.
        go_id = tf.cast(start_token_id, tf.int32)
        eos_id = tf.cast(stop_token_id, tf.int32)
        helper = seq2seq.GreedyEmbeddingHelper(
            create_embedding_fn(vocab_size),
            tf.fill([batch_size], go_id),
            eos_id)

        cell, zero_states = create_decoder_cell(
            agenda, extended_base_words, oov,
            base_sent_hiddens, insert_word_embeds, delete_word_embeds,
            base_length, iw_length, dw_length,
            vocab_size, attn_dim, hidden_dim, num_layer,
            enable_dropout=enable_dropout,
            dropout_keep=dropout_keep,
            no_insert_delete_attn=no_insert_delete_attn)

        decoder = seq2seq.BasicDecoder(cell, helper, zero_states)
        # Returns (outputs, state, lengths) directly.
        return seq2seq.dynamic_decode(
            decoder,
            maximum_iterations=max_sentence_length,
            swap_memory=swap_memory)
def build_predict_decoder(self):
    """Build the inference decoder ('sample' or 'greedy' mode).

    Beam search is not implemented and raises NotImplementedError.
    """
    # start_tokens: [batch_size,] vector filled with the start symbol id.
    start_tokens = tf.ones([
        self.batch_size,
    ], tf.int32) * self.start_token
    end_token = self.end_token

    if self.use_beamsearch_decode:
        raise NotImplementedError(
            'Beamsearch decode is not yet implemented.')

    # Both helpers look predictions back up through the embedding table.
    embed_fn = lambda inputs: tf.nn.embedding_lookup(self.embedding, inputs)
    if self.predict_mode == 'sample':
        print('Building sample decoder...')
        decoding_helper = seq2seq.SampleEmbeddingHelper(
            start_tokens=start_tokens,
            end_token=end_token,
            embedding=embed_fn)
    elif self.predict_mode == 'greedy':
        print('Building greedy decoder...')
        decoding_helper = seq2seq.GreedyEmbeddingHelper(
            start_tokens=start_tokens,
            end_token=end_token,
            embedding=embed_fn)
    else:
        raise NotImplementedError(
            'Predict mode: {} is not yet implemented'.format(
                self.predict_mode))

    inference_decoder = seq2seq.BasicDecoder(
        cell=self.decoder_cell,
        helper=decoding_helper,
        initial_state=self.decoder_initial_state,
        output_layer=self.output_layer)

    # rnn_output: [batch_size, decoder_targets_length, vocab_size] holds
    # per-step token probabilities (usable for a loss); sample_id holds
    # the decoded token ids.
    (self.decoder_outputs_decode,
     self.decoder_last_state_decode,
     self.decoder_outputs_length_decode) = seq2seq.dynamic_decode(
        decoder=inference_decoder,
        output_time_major=False,
        maximum_iterations=self.max_decode_step)

    # Append a trailing unit dimension: [batch_size, max_len, 1].
    self.decoder_pred_decode = tf.expand_dims(
        self.decoder_outputs_decode.sample_id, -1)
def _helper(self, train_test_predict, embeded_inputs, sequences_lengths, start_tokens, end_token):
    """Return the decoding helper for the requested phase.

    'train'/'test' feed the ground-truth inputs (teacher forcing);
    'predict' feeds back the argmax prediction at every step.
    """
    if train_test_predict in ('train', 'test'):
        return seq2seq.TrainingHelper(embeded_inputs, sequences_lengths)
    if train_test_predict == 'predict':
        return seq2seq.GreedyEmbeddingHelper(self.embedding_vector,
                                             start_tokens, end_token)
    raise TypeError(
        'train_test_predict should equals train, test, or predict')
def __init__(self, config, w2i_target):
    """Build the full graph: bidirectional GRU encoder, greedy GRU decoder,
    masked sequence loss, and a gradient-clipped Adam train op.

    Args:
        config: hyper-parameter object (source_vocab_size, target_vocab_size,
            embedding_dim, hidden_dim, learning_rate).
        w2i_target: target-side word -> id mapping; must contain '_GO'
            and '_EOS'.
    """
    self.build_inputs()
    with tf.variable_scope('encoder'):
        encoder_embedding = tf.Variable(
            tf.random_uniform([config.source_vocab_size, config.embedding_dim]),
            dtype=tf.float32, name='encoder_embedding')
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            encoder_embedding, self.seq_inputs)
        with tf.variable_scope("gru_cell"):
            encoder_cell = tf.nn.rnn_cell.GRUCell(config.hidden_dim)
            # NOTE(review): the same cell object serves both directions, so
            # forward and backward share weights — confirm this is intended.
            ((encoder_fw_outputs, encoder_bw_outputs),
             (encoder_fw_final_state, encoder_bw_final_state)) = \
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=encoder_cell,
                    cell_bw=encoder_cell,
                    inputs=encoder_inputs_embedded,
                    sequence_length=self.seq_inputs_length,
                    dtype=tf.float32,
                    time_major=False)
        # Fuse directions by summation (not concatenation), so the decoder
        # state size stays hidden_dim.
        encoder_state = tf.add(encoder_fw_final_state, encoder_bw_final_state)
        encoder_outputs = tf.add(encoder_fw_outputs, encoder_bw_outputs)
    with tf.variable_scope('decoder'):
        decoder_embedding = tf.Variable(
            tf.random_normal([config.target_vocab_size, config.embedding_dim]),
            dtype=tf.float32, name='decoder_embedding')
        token_go = tf.ones([self.batch_size], dtype=tf.int32,
                           name='token_go') * w2i_target['_GO']
        # Helper object: greedy decoding, feeding the argmax prediction
        # back as the next input.
        helper = seq2seq_contrib.GreedyEmbeddingHelper(
            decoder_embedding, token_go, w2i_target["_EOS"])
        with tf.variable_scope('gru_cell'):
            decoder_cell = tf.nn.rnn_cell.GRUCell(config.hidden_dim)
            decoder_initial_state = encoder_state
        # Build the decoder.
        decoder = seq2seq_contrib.BasicDecoder(
            decoder_cell, helper, decoder_initial_state,
            output_layer=tf.layers.Dense(config.target_vocab_size))
        decoder_outputs, decoder_state, final_sequence_lengths = \
            seq2seq_contrib.dynamic_decode(
                decoder,
                maximum_iterations=tf.reduce_max(self.seq_targets_length))
    self.decoder_logits = decoder_outputs.rnn_output
    self.out = tf.argmax(self.decoder_logits, 2)
    # Mask out the zero padding so it does not contribute to the loss.
    sequence_mask = tf.sequence_mask(self.seq_targets_length, dtype=tf.float32)
    self.loss = seq2seq_contrib.sequence_loss(logits=self.decoder_logits,
                                              targets=self.seq_targets,
                                              weights=sequence_mask)
    # Clip gradients to [-1, 1] to guard against vanishing/exploding gradients.
    opt = tf.train.AdamOptimizer(config.learning_rate)
    gradients = opt.compute_gradients(self.loss)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                        for grad, var in gradients if grad is not None]
    self.train_op = opt.apply_gradients(capped_gradients)
def build_predict_decoder(self):
    """Build the inference decoder ('sample' with a random seed, or 'greedy').

    Beam search is not implemented and raises NotImplementedError.
    """
    # start_tokens: [batch_size,] vector filled with the start symbol id.
    start_tokens = tf.ones([
        self.batch_size,
    ], tf.int32) * self.start_token
    end_token = self.end_token

    if self.use_beamsearch_decode:
        raise NotImplementedError(
            'Beamsearch decode is not yet implemented.')

    # Both helpers look predictions back up through the embedding table.
    embed_fn = lambda inputs: tf.nn.embedding_lookup(self.embedding, inputs)
    if self.predict_mode == 'sample':
        print('Building sample decoder...')
        decoding_helper = seq2seq.SampleEmbeddingHelper(
            start_tokens=start_tokens,
            end_token=end_token,
            embedding=embed_fn,
            seed=random.randint(0, 1000))
    elif self.predict_mode == 'greedy':
        print('Building greedy decoder...')
        decoding_helper = seq2seq.GreedyEmbeddingHelper(
            start_tokens=start_tokens,
            end_token=end_token,
            embedding=embed_fn)
    else:
        raise NotImplementedError(
            'Predict mode: {} is not yet implemented'.format(
                self.predict_mode))

    inference_decoder = seq2seq.BasicDecoder(
        cell=self.decoder_cell,
        helper=decoding_helper,
        initial_state=self.decoder_initial_state,
        output_layer=self.output_layer)

    (self.decoder_outputs_decode,
     self.decoder_last_state_decode,
     self.decoder_outputs_length_decode) = seq2seq.dynamic_decode(
        decoder=inference_decoder,
        output_time_major=False,
        maximum_iterations=self.max_decode_step)

    # Append a trailing unit dimension: [batch_size, max_len, 1].
    self.decoder_pred_decode = tf.expand_dims(
        self.decoder_outputs_decode.sample_id, -1)
def inference_decoding_layer(self, embeddings, start_token, end_token, dec_cell, initial_state, output_layer, max_summary_len):
    """Greedy inference decoder: feeds each argmax prediction back as the
    next input until `end_token` or `max_summary_len` steps.

    Returns the BasicDecoderOutput from dynamic_decode (rnn_output holds
    the logits).
    """
    go_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32),
                        [self.batch_size],
                        name='start_token')
    helper = seq2seq.GreedyEmbeddingHelper(embeddings, go_tokens, end_token)
    decoder = seq2seq.BasicDecoder(dec_cell, helper, initial_state,
                                   output_layer)
    decoded, _, _ = seq2seq.dynamic_decode(
        decoder,
        impute_finished=True,
        maximum_iterations=max_summary_len)
    return decoded
def _init_decoder(self):
    """Build a shared-variable LSTM decoder with a teacher-forced training
    path and a greedy inference path.

    The training decoder carries no output layer; its logits are produced
    by applying the projection afterwards. The inference decoder projects
    inside the decoder so greedy feedback sees vocabulary logits.
    """
    self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.rnn_size)
    # Need to understand why we aren't using the dynamic_rnn method here
    with tf.variable_scope("decoder") as scope:
        projection = layers_core.Dense(units=self.effective_vocab_size,
                                       activation=None)

        # --- training path (teacher forcing) ---
        helper_train = seq2seq.TrainingHelper(
            inputs=self.decoder_train_inputs_embedded,
            sequence_length=self.decoder_train_length,
            time_major=False)
        decoder_train = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=helper_train,
            initial_state=self.encoder_final_state)
        self.decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=decoder_train,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_decoder_seq_length,
            scope=scope)
        self.decoder_logits_train = projection.apply(
            self.decoder_outputs_train.rnn_output)
        self.decoder_prediction_train = tf.argmax(
            self.decoder_logits_train, 2)

        # --- inference path (greedy), reusing the same variables ---
        scope.reuse_variables()
        helper_infer = seq2seq.GreedyEmbeddingHelper(
            embedding=self.embedding_matrix,
            start_tokens=self.decoder_start_tokens,
            end_token=self.eos)
        decoder_infer = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=helper_infer,
            initial_state=self.encoder_final_state,
            output_layer=projection)
        self.decoder_outputs_inference, _, _ = seq2seq.dynamic_decode(
            decoder=decoder_infer,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_decoder_seq_length,  # Figure out a better way of setting this
            scope=scope)
        self.decoder_prediction_inference = tf.argmax(
            self.decoder_outputs_inference.rnn_output, 2)
def _build_decoder_greedy(self):
    """Build the greedy decoder, optionally wrapping the cells with attention.

    Populates:
        self._helper_greedy, self._decoder_inference,
        self.inference_outputs (rnn_output logits),
        self.inference_predicted_ids (sample ids),
        and self.attention_summary when alignment writing is enabled.
    """
    batch_size, _ = tf.unstack(tf.shape(self._labels))
    self._helper_greedy = seq2seq.GreedyEmbeddingHelper(
        embedding=self._embedding_matrix,
        start_tokens=tf.tile([self._GO_ID], [batch_size]),
        end_token=self._EOS_ID)
    if self._hparams.enable_attention is True:
        attention_mechanisms, layer_sizes = self._create_attention_mechanisms()
        attention_cells = seq2seq.AttentionWrapper(
            cell=self._decoder_cells,
            attention_mechanism=attention_mechanisms,
            attention_layer_size=layer_sizes,
            initial_cell_state=self._decoder_initial_state,
            # alignment_history is recorded only when we intend to write
            # attention summaries.
            alignment_history=self._hparams.write_attention_alignment,
            output_attention=self._output_attention
        )
        attn_zero = attention_cells.zero_state(
            dtype=self._hparams.dtype, batch_size=batch_size
        )
        # Start from the wrapper's zero state but seed the inner cell state
        # with the encoder-derived initial state.
        initial_state = attn_zero.clone(
            cell_state=self._decoder_initial_state
        )
        cells = attention_cells
    else:
        cells = self._decoder_cells
        initial_state = self._decoder_initial_state
    self._decoder_inference = seq2seq.BasicDecoder(
        cell=cells,
        helper=self._helper_greedy,
        initial_state=initial_state,
        output_layer=self._dense_layer)
    outputs, states, lengths = seq2seq.dynamic_decode(
        self._decoder_inference,
        impute_finished=True,
        swap_memory=False,
        maximum_iterations=self._hparams.max_label_length)
    # self._result = outputs, states, lengths
    self.inference_outputs = outputs.rnn_output
    self.inference_predicted_ids = outputs.sample_id
    if self._hparams.write_attention_alignment is True:
        self.attention_summary = self._create_attention_alignments_summary(states, )
def inference_decoding_layer(self, embeddings, decoder_cell, initial_state, output_layer):
    """Greedy inference decoder bounded by `self.out_max_length` steps.

    Bug fixed: `seq2seq.dynamic_decode` returns a 3-tuple
    (outputs, final_state, final_sequence_lengths); the original unpacked
    it into two names, which raises ValueError at graph-build time. The
    positional `False, True` arguments are also replaced with explicit
    keywords (output_time_major / impute_finished) for clarity.

    Args:
        embeddings: embedding matrix for looking predictions back up.
        decoder_cell: decoder RNN cell.
        initial_state: initial decoder state.
        output_layer: projection layer applied to decoder outputs.

    Returns:
        BasicDecoderOutput whose rnn_output holds the logits.
    """
    start_token = self.vocab_dictionary['<GO>']
    end_token = self.vocab_dictionary['<EOS>']
    # One GO token per batch element.
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32),
                           [self.batch_size], name='start_tokens')
    helper = seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens,
                                           end_token)
    decoder = seq2seq.BasicDecoder(decoder_cell, helper, initial_state,
                                   output_layer)
    logits, _, _ = seq2seq.dynamic_decode(
        decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=self.out_max_length)
    return logits
def _model(self, embed):
    """Build the seq2seq graph: frozen embeddings, shared encoder, a
    teacher-forced training decoder and a greedy prediction decoder.

    Args:
        embed: pre-trained embedding matrix shared by source and target
            sides (kept non-trainable).

    Returns:
        (graph, loss, train_op, predicting_logits)
    """
    graph = tf.Graph()
    with graph.as_default():
        # Word embeddings (frozen).
        embedding = tf.Variable(embed, trainable=False, name='embedding')
        lr = tf.placeholder(tf.float32, [], name='learning_rate')
        # Source-side inputs.
        x_input = tf.placeholder(tf.int32, [None, None], name='x_input')
        # Length of each source sequence.
        x_sequence_length = tf.placeholder(tf.int32, [None], name='x_length')
        # Convert one-hot/int ids to dense vectors.
        x_embedding = tf.nn.embedding_lookup(embedding, x_input)
        # Target-side inputs.
        y_input = tf.placeholder(tf.int32, [None, None], name='y_input')
        # Length of each target sequence.
        y_sequence_length = tf.placeholder(tf.int32, [None], name='y_length')
        y_embedding = tf.nn.embedding_lookup(embedding, y_input)
        batch_size = tf.placeholder(tf.int32, [], name='batch_size')
        keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
        encoder_output, encoder_state = self._encoder(
            keep_prob, x_embedding, x_sequence_length, batch_size)
        # Teacher forcing for training; greedy feedback for prediction.
        training_helper = seq2seq.TrainingHelper(
            inputs=y_embedding, sequence_length=y_sequence_length)
        predict_helper = seq2seq.GreedyEmbeddingHelper(
            embedding,
            tf.fill([batch_size], self.word2index['GO']),
            self.word2index['EOS'])
        train_output = self._decoder(keep_prob, encoder_output, encoder_state,
                                     batch_size, 'decode', training_helper)
        predict_output = self._decoder(keep_prob, encoder_output, encoder_state,
                                       batch_size, 'decode', predict_helper,
                                       True)
        # Loss function.
        training_logits = tf.identity(train_output.rnn_output,
                                      name='training_logits')
        predicting_logits = tf.identity(predict_output.rnn_output,
                                        name='predicting')
        # target = tf.slice(y_input, [0, 1], [-1, -1])
        # target = tf.concat([tf.fill([batch_size, 1], self.word2index['GO']), y_input], 1)
        # NOTE(review): the target is the raw y_input with no GO-shift (the
        # alternatives above are commented out) — confirm the decoder
        # input/target alignment is intended.
        target = y_input
        masks = tf.sequence_mask(y_sequence_length, dtype=tf.float32,
                                 name='mask')
        loss = seq2seq.sequence_loss(training_logits, target, masks)
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(loss)
        # Clip gradients to [-5, 5] to guard against exploding gradients.
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
        # predicting_logits = tf.nn.softmax(predicting_logits, axis=1)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('learning rate', lr)
        # tf.summary.tensor_summary('learning rate', lr)
    return graph, loss, train_op, predicting_logits
def decode_L(inputs, dim, embed_map, start_token, unroll_type='teacher_forcing', seq=None, seq_len=None, end_token=None, max_seq_len=None, output_layer=None, is_train=True, scope='decode_L', reuse=tf.AUTO_REUSE):
    """LSTM decoder whose initial state is projected from `inputs`.

    Supports teacher forcing (requires `seq` and `seq_len`, with the start
    token prepended and the last token dropped) and greedy unrolling
    (requires `end_token`).

    Returns:
        (rnn_output, sample_id, decoded_lengths)
    """
    with tf.variable_scope(scope, reuse=reuse) as scope:
        # Project the conditioning input into both halves of the LSTM state.
        c0 = fc_layer(inputs, dim, use_bias=True, use_bn=False,
                      activation_fn=tf.nn.tanh, is_training=is_train,
                      scope='Linear_c', reuse=reuse)
        h0 = fc_layer(inputs, dim, use_bias=True, use_bn=False,
                      activation_fn=tf.nn.tanh, is_training=is_train,
                      scope='Linear_h', reuse=reuse)
        initial_state = rnn.LSTMStateTuple(c0, h0)
        log.warning(scope.name)

        # One start token per batch element.
        start_tokens = start_token + tf.zeros([tf.shape(inputs)[0]],
                                              dtype=tf.int32)
        if unroll_type == 'teacher_forcing':
            if seq is None:
                raise ValueError('seq is None')
            if seq_len is None:
                raise ValueError('seq_len is None')
            # Shift right: prepend the start token, drop the final token.
            shifted = tf.concat(
                [tf.expand_dims(start_tokens, axis=1), seq[:, :-1]], axis=1)
            helper = seq2seq.TrainingHelper(
                tf.nn.embedding_lookup(embed_map, shifted), seq_len)
        elif unroll_type == 'greedy':
            if end_token is None:
                raise ValueError('end_token is None')
            helper = seq2seq.GreedyEmbeddingHelper(
                lambda e: tf.nn.embedding_lookup(embed_map, e),
                start_tokens, end_token)
        else:
            raise ValueError('Unknown unroll_type')

        lstm = rnn.BasicLSTMCell(num_units=dim, state_is_tuple=True)
        decoder = seq2seq.BasicDecoder(lstm, helper, initial_state,
                                       output_layer=output_layer)
        outputs, _, decoded_lengths = seq2seq.dynamic_decode(
            decoder, maximum_iterations=max_seq_len, scope='dynamic_decoder')
        return outputs.rnn_output, outputs.sample_id, decoded_lengths
def decoding_layer_inference(self, num_units, max_time, batch_size, char2numY, output_embedding, encoder_output, last_state, bidirectional):
    """Greedy inference decoder with Bahdanau attention over encoder outputs.

    The decoder cell is doubled in width when the encoder was
    bidirectional, so its state matches `last_state`.
    """
    cell_units = 2 * num_units if bidirectional else num_units
    decoder_cell = rnn.LSTMCell(cell_units)

    # Notice that different between data_output_embed
    infer_helper = seq2seq.GreedyEmbeddingHelper(
        output_embedding,
        tf.fill([batch_size], char2numY['<GO>']),
        char2numY['<EOS>'])

    attention_mechanism = seq2seq.BahdanauAttention(
        num_units=num_units,
        memory=encoder_output,
        memory_sequence_length=[max_time] * batch_size)
    attention_cell = seq2seq.AttentionWrapper(
        cell=decoder_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=num_units,
    )
    # Zero attention state, with the inner cell state seeded from the encoder.
    zero_state = attention_cell.zero_state(batch_size=batch_size,
                                           dtype=tf.float32)
    initial_state = zero_state.clone(cell_state=last_state)

    # len(char2numY) - 2 output units — presumably excludes the two special
    # tokens; confirm against the label vocabulary.
    projection = tf.layers.Dense(
        len(char2numY) - 2,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))
    decoder = seq2seq.BasicDecoder(cell=attention_cell,
                                   helper=infer_helper,
                                   initial_state=initial_state,
                                   output_layer=projection)
    infer_outputs, _, _ = seq2seq.dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        maximum_iterations=max_time)
    return infer_outputs
saver.save(session, "model", epoch) batch = next(test_data()) batch = collapse_documents(batch) answers = session.run( answer_tags, { document_tokens: batch["document_tokens"], document_lengths: batch["document_lengths"], }) answers = np.argmax(answers, 2) batch = expand_answers(batch, answers) helper = seq2seq.GreedyEmbeddingHelper(embedding, tf.fill([batch["size"]], START_TOKEN), END_TOKEN) decoder = seq2seq.BasicDecoder(decoder_cell, helper, encoder_state, output_layer=projection) decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=16) decoder_outputs = decoder_outputs.rnn_output questions = session.run( decoder_outputs, { document_tokens: batch["document_tokens"], document_lengths: batch["document_lengths"], answer_labels: batch["answer_labels"], encoder_input_mask: batch["answer_masks"], encoder_lengths: batch["answer_lengths"],
def build_decoder(self):
    """Build the decoder: a teacher-forced training graph with sequence loss
    in 'train' mode, or a greedy / beam-search inference graph in 'decode'
    mode.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
        )
        # Initialize decoder embeddings to have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3,
                                                    dtype=self.dtype)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.embedding_size],
            initializer=initializer,
            dtype=self.dtype)
        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        input_layer = Dense(self.hidden_units, dtype=self.dtype,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            # Embedded inputs having gone through input projection layer
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)
            # Helper to feed inputs for training: read inputs from dense ground truth vectors
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)
                #output_layer=None)
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)
            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output:
            #     [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
            #     [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=max_decoder_length))
            # More efficient to do the projection on the batch-time-concatenated tensor
            # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=self.dtype,
                name='masks')
            # Computes per word average cross-entropy over a batch
            # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=masks,
                average_across_timesteps=True,
                average_across_batch=True, )
            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)
            # Contruct graphs for minimizing loss
            self.init_optimizer()
        elif self.mode == 'decode':
            # Start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            def embed_and_input_proj(inputs):
                # Embed then project, mirroring the training input path.
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer, )
            # For GreedyDecoder, decoder_outputs_decode is a
            # BasicDecoderOutput namedtuple(rnn_outputs, sample_id):
            #     rnn_output: [batch_size, max_time_step, num_decoder_symbols] (output_time_major=False)
            #     sample_id:  [batch_size, max_time_step], tf.int32
            # For BeamSearchDecoder, it is a FinalBeamSearchDecoderOutput
            # namedtuple(predicted_ids, beam_search_decoder_output):
            #     predicted_ids: [batch_size, max_time_step, beam_width]
            #     beam_search_decoder_output: namedtuple(scores, predicted_ids, parent_ids)
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    #impute_finished=True,	# error occurs
                    maximum_iterations=self.max_decode_step))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                #                                      axis=-1, name='decoder_pred_decode')
                # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
            else:
                # Use beam search to approximately find the most likely translation
                # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder sub-graph.

    Creates the decoder cell and initial state, the decoder embeddings and
    the output projection, then wires either the teacher-forced training
    decoder (``self.mode == 'train'``) or the greedy / beam-search inference
    decoder (``self.mode == 'decode'``).

    :param encoder_outputs: encoder output tensor(s), consumed by
        ``build_decoder_cell`` (e.g. for attention).
    :param encoder_state: final encoder state used to initialize the decoder.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Create the decoder cell and its initial state.
        (self.decoder_cell, self.decoder_initial_state) \
            = self.build_decoder_cell(encoder_outputs, encoder_state)
        # Decoder embeddings; placed on CPU or GPU depending on vocab size.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            # Either share the encoder embeddings, load pretrained ones,
            # or initialize fresh trainable embeddings.
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            # Pretrained embeddings, fed later through a placeholder + assign.
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,  # still fine-tunable after loading
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        # Output projection: one logit per target-vocabulary symbol.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train  # placeholder created at init
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Helper that feeds the ground-truth inputs (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # The output_layer is deliberately NOT applied here: projecting
            # at every time step is slow. The projection is applied once on
            # the whole output below; for that trick to work the
            # dynamic_decode call must receive the `scope` argument.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
            )
            # Longest decoder sequence in the current batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (outputs, self.final_state, _) \
                = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    # True: time (seq_length) is the leading dim;
                    # False: batch_size is the leading dim.
                    output_time_major=self.time_major,
                    # Zero the outputs of steps past a finished sequence.
                    impute_finished=True,
                    # Upper bound on the number of generated steps.
                    maximum_iterations=max_decoder_length,
                    # while_loop parallelism.
                    parallel_iterations=self.parallel_iterations,
                    # Allow swapping tensors to host memory on OOM.
                    swap_memory=True,
                    scope=decoder_scope)
            # Project all time steps through the dense layer in one shot
            # (more efficient than per-step projection).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output
            )
            # masks: 1.0 for valid time steps, 0.0 for padding, e.g.
            # tf.sequence_mask([1, 2], 4) -> [[T F F F], [T T F F]]
            # Shape: [batch_size, max_time_step + 1].
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            # Training-time predictions: argmax over the vocab dimension.
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')
            # The variables below support custom (e.g. reward-weighted)
            # training objectives; rewards rescale the masks.
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,   # targets
                    logits=decoder_logits_train)  # predictions
            self.masks_rewards = self.masks * self.rewards
            # seq2seq sequence loss: per-step cross entropy, weighted by the
            # mask so padded positions contribute nothing.
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,  # reward-scaled weights
                average_across_timesteps=True,  # divide by total weight
                average_across_batch=True,      # divide by batch size
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (no training).
            # Tile the start token across the batch, see
            # https://blog.csdn.net/tsyccnh/article/details/82459859
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Map token ids to their decoder embeddings."""
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            # Without beam search: greedy helper + basic decoder.
            if not self.use_beamsearch_decode:
                # Greedy decoding: feed back the argmax of each output step.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,  # start token per batch row
                    end_token=end_token,        # stop token
                    embedding=embed_and_input_proj
                )
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # Beam search approximates the most likely output sequence.
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # Decode-step budget: explicit maximum if configured,
            # otherwise 4x the longest input in the batch.
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                max_decode_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,  # decoder outputs
                self.final_state,             # final RNN state
                _  # self.decoder_outputs_length_decode
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,  # greedy or beam-search decoder
                output_time_major=self.time_major,
                # impute_finished=True,  # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # Greedy path: decoder_outputs_decode is a BasicDecoderOutput
            # namedtuple (rnn_output, sample_id) where
            #   rnn_output: [batch_size, decoder_targets_length, vocab_size]
            #   sample_id:  [batch_size, time], tf.int32 — the final answer.
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id  # the final answer
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            # Beam-search path: FinalBeamSearchDecoderOutput namedtuple
            # (predicted_ids, beam_search_decoder_output) where
            #   predicted_ids: [batch_size, time, beam_size]
            #   beam_search_decoder_output: (scores, predicted_ids,
            #   parent_ids). See
            #   https://blog.csdn.net/liuchonge/article/details/79021938
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def __init__(self, mode, vocab_size, target_vocab_size, emb_dim,
             encoder_num_units, encoder_num_layers, decoder_num_units,
             decoder_num_layers, dropout_emb, dropout_hidden, tgt_sos_id,
             tgt_eos_id, learning_rate, clip_norm, attention_option,
             beam_size, optimizer, maximum_iterations):
    """Build a bidirectional-LSTM encoder / LSTM decoder seq2seq graph.

    :param mode: 'train' or 'infer'; controls dropout, the helper type and
        whether the loss/train op are built.
    :param vocab_size: size of the shared input embedding vocabulary.
    :param target_vocab_size: width of the output projection.
    :param emb_dim: embedding dimensionality.
    :param encoder_num_units: hidden size of each encoder direction; must
        equal ``decoder_num_units``.
    :param dropout_emb: dropout rate applied to embeddings (train only).
    :param dropout_hidden: dropout rate on cell outputs (train only).
    :param tgt_sos_id: start-of-sequence token id for greedy inference.
    :param tgt_eos_id: end-of-sequence token id for greedy inference.
    :param learning_rate: initial Adam learning rate.
    :param clip_norm: global gradient-norm clipping threshold.
    :param maximum_iterations: decode-step cap during inference.

    NOTE(review): ``encoder_num_layers``, ``decoder_num_layers``,
    ``attention_option``, ``beam_size`` and ``optimizer`` are accepted but
    unused by this implementation (single-layer cells, no attention, greedy
    decoding, Adam hard-coded); kept for interface compatibility.
    """
    assert mode in ["train", "infer"], "invalid mode!"
    assert encoder_num_units == decoder_num_units, \
        "encoder num_units **must** match decoder num_units"
    self.target_vocab_size = target_vocab_size

    # Inputs: batched, padded id matrices plus their true lengths.
    self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
                                         name='encoder_inputs')
    self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
                                         name='decoder_inputs')
    self.decoder_outputs = tf.placeholder(tf.int32, shape=[None, None],
                                          name='decoder_outputs')
    self.encoder_lengths = tf.placeholder(tf.int32, shape=[None],
                                          name='encoder_lengths')
    self.decoder_lengths = tf.placeholder(tf.int32, shape=[None],
                                          name='decoder_lengths')

    # LSTM cell factory; output dropout is applied only while training.
    def cell(num_units):
        cell = rnn.BasicLSTMCell(num_units=num_units)
        if mode == 'train':
            cell = rnn.DropoutWrapper(cell=cell,
                                      output_keep_prob=1 - dropout_hidden)
        return cell

    # Token embeddings shared by encoder and decoder.
    self.embeddings = tf.get_variable('embeddings',
                                      shape=[vocab_size, emb_dim],
                                      dtype=tf.float32)

    # Encoder
    with tf.variable_scope('encoder'):
        # embeddings (+ dropout during training)
        encoder_inputs_emb = tf.nn.embedding_lookup(
            self.embeddings, self.encoder_inputs)
        if mode == 'train':
            encoder_inputs_emb = tf.nn.dropout(encoder_inputs_emb,
                                               1 - dropout_emb)
        # bidirectional LSTM encoder
        fw_encoder_cell = cell(encoder_num_units)
        bw_encoder_cell = cell(encoder_num_units)
        (encoder_outputs_fw, encoder_outputs_bw), \
            (encoder_state_fw, encoder_state_bw) = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_encoder_cell,
                cell_bw=bw_encoder_cell,
                inputs=encoder_inputs_emb,
                sequence_length=self.encoder_lengths,
                dtype=tf.float32)
        # Concatenated FW/BW outputs (would feed an attention mechanism;
        # currently unused by the decoder below).
        encoder_outputs = tf.concat(
            [encoder_outputs_fw, encoder_outputs_bw], 2)
        # Linear layers reduce the encoder's final FW and BW states into a
        # single initial state — needed because the encoder is
        # bidirectional but the decoder is not.
        encoder_states_c = tf.layers.dense(
            inputs=tf.concat([encoder_state_fw.c, encoder_state_bw.c],
                             axis=-1),
            units=encoder_num_units,
            activation=None,
            use_bias=False)
        encoder_states_h = tf.layers.dense(
            inputs=tf.concat([encoder_state_fw.h, encoder_state_bw.h],
                             axis=-1),
            units=encoder_num_units,
            activation=None,
            use_bias=False)
        encoder_states = rnn.LSTMStateTuple(encoder_states_c,
                                            encoder_states_h)

    # Decoder
    with tf.variable_scope('decoder'):
        decoder_inputs_emb = tf.nn.embedding_lookup(
            self.embeddings, self.decoder_inputs)
        if mode == 'train':
            decoder_inputs_emb = tf.nn.dropout(decoder_inputs_emb,
                                               1 - dropout_emb)
        decoder_cell = cell(decoder_num_units)
        batch_size = tf.shape(self.encoder_inputs)[0]
        # The decoder starts from the reduced encoder state. (A dead
        # zero_state initialisation that was immediately overwritten here
        # has been removed.)
        decoder_initial_state = encoder_states
        projection_layer = layers_core.Dense(units=target_vocab_size,
                                             use_bias=False)
        # train/infer
        if mode == 'train':
            # Teacher-forcing helper reads ground-truth inputs.
            helper = seq2seq.TrainingHelper(
                inputs=decoder_inputs_emb,
                sequence_length=self.decoder_lengths)
            decoder = seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=decoder_initial_state,
                output_layer=projection_layer)
            self.final_outputs, self.final_state, \
                self.final_sequence_lengths = seq2seq.dynamic_decode(
                    decoder=decoder, swap_memory=True)
        else:
            # Greedy decoding from <sos> until <eos> or the iteration cap.
            start_tokens = tf.fill([batch_size], tgt_sos_id)
            end_token = tgt_eos_id
            helper = seq2seq.GreedyEmbeddingHelper(
                embedding=self.embeddings,
                start_tokens=start_tokens,
                end_token=end_token)
            decoder = seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=decoder_initial_state,
                output_layer=projection_layer)
            self.final_outputs, self.final_state, \
                self.final_sequence_lengths = seq2seq.dynamic_decode(
                    decoder=decoder,
                    maximum_iterations=maximum_iterations,
                    swap_memory=True)
    self.logits = self.final_outputs.rnn_output
    self.sample_id = self.final_outputs.sample_id

    if mode == 'train':
        # Masked cross-entropy loss, normalized by batch size.
        with tf.variable_scope('loss'):
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_outputs, logits=self.logits)
            masks = tf.sequence_mask(lengths=self.decoder_lengths,
                                     dtype=tf.float32)
            self.loss = tf.reduce_sum(
                cross_entropy * masks) / tf.to_float(batch_size)
            tf.summary.scalar('loss', self.loss)
        # summaries
        self.merged = tf.summary.merge_all()
        # train_op: Adam with global gradient-norm clipping.
        self.learning_rate = tf.Variable(learning_rate, trainable=False)
        self.global_step = tf.Variable(0, dtype=tf.int32)
        tvars = tf.trainable_variables()
        clipped_gradients, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), clip_norm=clip_norm)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = optimizer.apply_gradients(
            zip(clipped_gradients, tvars), global_step=self.global_step)
def build_decoder(self):
    """Build the decoder (plus output/input projections) for train/decode.

    In 'train' mode: teacher-forced decoding with an optional
    class-weighted sequence loss. In 'decode' mode: greedy or beam-search
    inference.

    Fix: the Python-2 print statement `print 'using weighted loss!'` is
    replaced with the function-call form already used elsewhere in this
    method, so the module parses under Python 3 as well.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = \
            self.build_decoder_cell()
        # Initialize decoder embeddings to have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3,
                                                    dtype=self.dtype)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.embedding_size],
            initializer=initializer,
            dtype=self.dtype)
        # Input projection layer to feed embedded inputs to the cell.
        # ** Essential when use_residual=True to match input/output dims.
        input_layer = Dense(self.hidden_units, dtype=self.dtype,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits.
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded:
            # [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            # Embedded inputs having gone through input projection layer.
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)
            # Helper to feed inputs for training: read inputs from dense
            # ground-truth vectors (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)  # output_layer=None)
            # Maximum decoder time_steps in current batch.
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)
            # decoder_outputs_train: BasicDecoderOutput
            #   namedtuple(rnn_outputs, sample_id)
            # rnn_output: [batch, max_time_step + 1, num_decoder_symbols]
            #   (output_time_major=False)
            # sample_id: [batch_size], tf.int32
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                 decoder=training_decoder,
                 output_time_major=False,
                 impute_finished=True,
                 maximum_iterations=max_decoder_length))
            # The projection was already applied by the decoder's
            # output_layer; tf.identity just names the tensor.
            # logits_train: [batch, max_time_step + 1, num_decoder_symbols]
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit.
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # masks: 1.0 for valid and 0.0 for padded time steps,
            # [batch_size, max_time_step + 1]
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=self.dtype,
                name='masks')

            def class_weighted_loss(labels, logits):
                """Per-token cross entropy scaled by a fixed class weight.

                The weight table is indexed by label id; presumably it was
                precomputed from inverse class frequencies — TODO confirm.
                """
                class_weights = tf.constant([
                    0.00017234778799135608, 0.00017234778799135608,
                    0.00017234778799135608, 1.6821366229319637e-05,
                    4.898869308918329e-05, 7.106575604186823e-05,
                    7.126891354944498e-05, 7.514392550863835e-05,
                    7.719102618435312e-05, 8.89973910758995e-05,
                    0.00010430076292140834, 0.00010567508046918493,
                    0.00011254233356378444, 0.00013745981039146453,
                    0.00015365550520395147, 0.00016343173716428013,
                    0.00016623641703291143, 0.00018462654135821253,
                    0.0001873476479039208, 0.00018800477750021655,
                    0.00020981274294876723, 0.00021602805964389768,
                    0.00024354484846033354, 0.00024936107032012903,
                    0.0002495739348066665, 0.000319111899575184,
                    0.00033594586064125193, 0.0003818581956683335,
                    0.0003838636576651593, 0.0005417806138677063,
                    0.0006711205600832021, 0.0006750650134170244,
                    0.0006953534538202605, 0.0007032603813511271,
                    0.0007207552048226591, 0.0007264535179396215,
                    0.0007633538390502503, 0.000891602363160162,
                    0.0009813883808113227, 0.0010641991144668115,
                    0.0011028839931134101, 0.0012656472742694626,
                    0.0013067898106130453, 0.0013988733031399323,
                    0.0016671901108961662, 0.0017748398034871436,
                    0.0022286969673726295, 0.0022647955802244397,
                    0.0022727983914619817, 0.002481488984505173,
                    0.002566647824356508, 0.0026578592759658715,
                    0.002682243306020604, 0.002818588715090889,
                    0.002964064261676225, 0.0029888566207422903,
                    0.0030339714376591553, 0.0032127969269917125,
                    0.0032616731479905726, 0.0033361096721148385,
                    0.00424275689171333, 0.004594299605598149,
                    0.004750383639466329, 0.005306946739139776,
                    0.005497452519519153, 0.005911782580732912,
                    0.007162605175765489, 0.007194652626216341,
                    0.007496526162980663, 0.007960420108709664,
                    0.007960420108709664, 0.008691918172753256,
                    0.009110509132914177, 0.011323977901122198,
                    0.011652209144632988, 0.012711500885054168,
                    0.013180367720978298, 0.015169857188295775,
                    0.016242473353124773, 0.022971498027990745,
                    0.024000072566557496, 0.024549692548997745,
                    0.029504676366226647, 0.035733441376874495,
                    0.03828583004665124, 0.03874710510745427,
                    0.058472904071249165, 0.0630590141944844,
                    0.08040024309796762, 0.3573344137687449
                ])
                weights = tf.gather(class_weights, labels)
                unweighted_losses = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=labels, logits=logits)
                return unweighted_losses * weights

            # Computes per-word average cross-entropy over a batch.
            # Internally calls
            # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default.
            if self.loss_type == 'weighted':
                # FIX: was a Python-2 print statement
                # (`print 'using weighted loss!'`).
                print('using weighted loss!')
                self.loss = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                    softmax_loss_function=class_weighted_loss,
                )
            else:
                self.loss = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )
            # Training summary for the current batch_loss.
            tf.summary.scalar('loss', self.loss)
            # Construct graphs for minimizing loss.
            self.init_optimizer()
        elif self.mode == 'decode':
            # Start_tokens: [batch_size,] `int32` vector.
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            def embed_and_input_proj(inputs):
                """Embed ids then apply the input projection."""
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step.
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely
                # translation.
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,
                )
            # For GreedyDecoder: decoder_outputs_decode is a
            # BasicDecoderOutput namedtuple (rnn_outputs, sample_id) with
            #   rnn_output: [batch, max_time, num_decoder_symbols]
            #   sample_id:  [batch, max_time], tf.int32
            # For BeamSearchDecoder: a FinalBeamSearchDecoderOutput
            # namedtuple (predicted_ids, beam_search_decoder_output) with
            #   predicted_ids: [batch, max_time, beam_width]
            #   beam_search_decoder_output: BeamSearchDecoderOutput
            #   namedtuple (scores, predicted_ids, parent_ids)
            # (shapes above assume output_time_major=False)
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                 decoder=inference_decoder,
                 output_time_major=False,
                 # impute_finished=True,  # error occurs
                 maximum_iterations=self.max_decode_step))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(
                #     self.decoder_outputs_decode.rnn_output,
                #     axis=-1, name='decoder_pred_decode')
                # expand_dims keeps the result shape-compatible with the
                # beamsearch decoder: [batch, max_time_step, 1]
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
            else:
                # Beam search result:
                # [batch, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
def call(self, inputs, training=None, mask=None):
    """Run the attention decoder.

    Inference (`self.is_infer`): `inputs` is
    (enc_outputs, enc_state, enc_seq_len); returns predicted token ids.
    Training: `inputs` is
    (dec_inputs, dec_seq_len, enc_outputs, enc_state, enc_seq_len);
    returns the decoder logits (rnn_output).

    Fix: with `is_infer` and `beam_size <= 1` the decoder is a
    BasicDecoder whose dynamic_decode output is a BasicDecoderOutput —
    it has `sample_id`, not `predicted_ids`, so the old unconditional
    `dec_outputs.predicted_ids[:, :, 0]` raised AttributeError. Greedy
    inference now returns `sample_id`; beam search still returns the top
    beam, so both paths yield [batch, time] ids.
    """
    dec_emb_fn = lambda ids: self.embed(ids)
    if self.is_infer:
        enc_outputs, enc_state, enc_seq_len = inputs
        batch_size = tf.shape(enc_outputs)[0]
        # Greedy helper; only used below when beam_size <= 1.
        helper = seq2seq.GreedyEmbeddingHelper(
            embedding=dec_emb_fn,
            start_tokens=tf.fill([batch_size], self.dec_start_id),
            end_token=self.dec_end_id)
    else:
        dec_inputs, dec_seq_len, enc_outputs, enc_state, \
            enc_seq_len = inputs
        batch_size = tf.shape(enc_outputs)[0]
        dec_inputs = self.embed(dec_inputs)
        # Teacher-forcing helper fed with ground-truth inputs.
        helper = seq2seq.TrainingHelper(inputs=dec_inputs,
                                        sequence_length=dec_seq_len)
    if self.is_infer and self.beam_size > 1:
        # Beam search: tile encoder tensors across the beam dimension;
        # the greedy helper built above is intentionally unused here.
        tiled_enc_outputs = seq2seq.tile_batch(enc_outputs,
                                               multiplier=self.beam_size)
        tiled_seq_len = seq2seq.tile_batch(enc_seq_len,
                                           multiplier=self.beam_size)
        attn_mech = self._build_attention(enc_outputs=tiled_enc_outputs,
                                          enc_seq_len=tiled_seq_len)
        dec_cell = seq2seq.AttentionWrapper(self.cell, attn_mech)
        tiled_enc_last_state = seq2seq.tile_batch(
            enc_state, multiplier=self.beam_size)
        tiled_dec_init_state = dec_cell.zero_state(
            batch_size=batch_size * self.beam_size, dtype=tf.float32)
        if self.initial_decode_state:
            # Seed the attention-wrapped state with the encoder state.
            tiled_dec_init_state = tiled_dec_init_state.clone(
                cell_state=tiled_enc_last_state)
        dec = seq2seq.BeamSearchDecoder(
            cell=dec_cell,
            embedding=dec_emb_fn,
            start_tokens=tf.tile([self.dec_start_id], [batch_size]),
            end_token=self.dec_end_id,
            initial_state=tiled_dec_init_state,
            beam_width=self.beam_size,
            output_layer=tf.layers.Dense(self.vocab_size),
            length_penalty_weight=self.length_penalty)
    else:
        # Training, or greedy inference (beam_size <= 1).
        attn_mech = self._build_attention(enc_outputs=enc_outputs,
                                          enc_seq_len=enc_seq_len)
        dec_cell = seq2seq.AttentionWrapper(cell=self.cell,
                                            attention_mechanism=attn_mech)
        dec_init_state = dec_cell.zero_state(batch_size=batch_size,
                                             dtype=tf.float32)
        if self.initial_decode_state:
            dec_init_state = dec_init_state.clone(cell_state=enc_state)
        dec = seq2seq.BasicDecoder(
            cell=dec_cell,
            helper=helper,
            initial_state=dec_init_state,
            output_layer=tf.layers.Dense(self.vocab_size))
    if self.is_infer:
        dec_outputs, _, _ = \
            seq2seq.dynamic_decode(decoder=dec,
                                   maximum_iterations=self.max_dec_len,
                                   swap_memory=self.swap_memory,
                                   output_time_major=self.time_major)
        if self.beam_size > 1:
            # FinalBeamSearchDecoderOutput: keep only the best beam.
            return dec_outputs.predicted_ids[:, :, 0]
        # BUGFIX: BasicDecoderOutput exposes sample_id, not predicted_ids.
        return dec_outputs.sample_id
    else:
        dec_outputs, _, _ = \
            seq2seq.dynamic_decode(
                decoder=dec,
                maximum_iterations=tf.reduce_max(dec_seq_len),
                swap_memory=self.swap_memory,
                output_time_major=self.time_major)
        return dec_outputs.rnn_output
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder sub-graph for training or decoding.

    Fixes relative to the previous revision:
    - `seq2seq.dynamic_decode` returns a 3-tuple; it was unpacked into
      only two names (ValueError at graph-build time).
    - The greedy-decode branch read `self.decoder_pred_decode` before it
      was ever assigned (and wrote to `decoder_pred_train`); it now sets
      `decoder_pred_decode` from `sample_id` like the beam branch.
    - `self.add_loss = self.loss + self.add_loss` clobbered `add_loss`;
      the combined loss is now stored in `self.loss_add`, matching the
      sibling implementation of this method elsewhere in the file.
    - `dod` for `beam_prob` now aliases `decoder_outputs_decode` (the
      prediction tensor has no `beam_search_decoder_output` attribute).
    - Typo `decoder_inputs_embdedded` -> `decoder_inputs_embedded`.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Decoder cell and its initial state.
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_outputs, encoder_state)
        # Decoder embeddings: shared, pretrained, or freshly initialized.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(0.0,
                                shape=(self.target_vocab_size,
                                       self.embedding_size)),
                    trainable=True,
                    name='embeddings'
                )
                self.decoder_embeddings_placeholder = \
                    tf.placeholder(tf.float32,
                                   (self.target_vocab_size,
                                    self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        # Output projection: one logit per target-vocabulary symbol.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Teacher-forcing helper fed with ground-truth inputs.
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # Output projection is applied once on the full output below,
            # not per step, hence no output_layer here.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state
            )
            # Longest decoder sequence in the current batch.
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length
            )
            (
                outputs,
                self.final_state,
                _
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # Project all time steps through the dense layer in one shot.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output
            )
            # 1.0 for valid steps, 0.0 for padding.
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            # Training-time predictions: argmax over the vocab dimension.
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train'
            )
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            # Reward-scaled mask for custom objectives.
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True
            )
            # FIX: store the combined loss in loss_add instead of
            # overwriting self.add_loss.
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Map token ids to their decoder embeddings."""
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            if not self.use_beamsearch_decode:
                # Greedy decoding: feed back the argmax of each step.
                decoder_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj
                )
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoder_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # Beam search approximates the most likely output sequence.
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection
                )
            # Decode-step budget: explicit cap, else 4x the longest input.
            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                max_decoder_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length
                ) * 4)
            # FIX: dynamic_decode returns (outputs, state, lengths); the
            # previous 2-name unpacking raised a ValueError.
            (
                self.decoder_outputs_decode,
                self.final_state,
                _
            ) = (seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            ))
            if not self.use_beamsearch_decode:
                # BasicDecoderOutput: sample_id holds the predicted ids.
                # FIX: decoder_pred_decode was read before assignment and
                # the result was stored under decoder_pred_train.
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0)
                    )
            else:
                # FinalBeamSearchDecoderOutput:
                # predicted_ids: [batch, time, beam_width].
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2)
                    )
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1]
                )
                # FIX: scores live on decoder_outputs_decode, not on the
                # transposed prediction tensor.
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder sub-graph.

    Creates the decoder cell and initial state, the target-side
    embedding matrix and the vocabulary projection, then wires up
    exactly one of two branches depending on ``self.mode``:

    * ``'train'``  — teacher-forced decoding via ``TrainingHelper``,
      producing ``decoder_logits_train`` plus the plain and
      reward-weighted ``sequence_loss`` tensors.
    * ``'decode'`` — greedy or beam-search inference, producing
      ``decoder_pred_decode`` (and ``beam_prob`` for beam search).

    :param encoder_outputs: encoder outputs, forwarded to
        ``build_decoder_cell`` (presumably consumed there for
        attention — confirm in ``build_decoder_cell``)
    :param encoder_state: final encoder state, forwarded to
        ``build_decoder_cell``
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
            encoder_outputs, encoder_state)

        # Decoder embedding: either shared with the encoder, loaded
        # from a pretrained matrix, or trained from scratch.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                # Placeholder + assign op: the pretrained matrix is fed
                # at session time instead of being serialized into the
                # graph definition.
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # Projection from cell output to target-vocabulary logits.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size, dtype=tf.float32,
            use_bias=False, name='decoder_output_projection')

        if self.mode == 'train':
            # decoder_inputs_embedded:
            #     [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)

            # Helper to feed inputs for training:
            # read inputs from dense ground truth vectors.
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))

            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')

            # output_layer is deliberately NOT applied here: applying
            # it inside the decoder would run the projection at every
            # time step, which is slow.  NOTE: for this trick to work,
            # dynamic_decode below must be given the `scope` argument.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.decoder_output_projection
            )

            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length)

            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output:
            #     if output_time_major=False:
            #         [batch_size, max_time_step + 1, num_decoder_symbols]
            #     if output_time_major=True:
            #         [max_time_step + 1, batch_size, num_decoder_symbols]
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (
                outputs,
                self.final_state,  # contains attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            # More efficient to do the projection on the
            # batch-time-concatenated tensor: all time steps are
            # projected through output_layer in one shot.  The official
            # NMT library reports a 10~20% speedup from this.
            # logits_train:
            #     [batch_size, max_time_step + 1, num_decoder_symbols]
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)

            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32, name='masks')

            # Computes per word average cross-entropy over a batch.
            # Internally calls
            # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')

            # The tensors below support reward-weighted training:
            # custom rewards are applied by scaling the masks.
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)

            self.masks_rewards = self.masks * self.rewards

            # Reward-weighted sequence loss.
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )

            # Standard (mask-weighted) sequence loss.
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )

            # Loss augmented with an externally supplied extra term.
            self.loss_add = self.loss + self.add_loss

        elif self.mode == 'decode':
            # Inference mode (not training).
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Embedding-lookup wrapper for the input projection."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step.
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beamsearch is used to approximately find the most
                # likely translation.
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )

            # For GreedyDecoder, decoder_outputs_decode is a
            # BasicDecoderOutput namedtuple(rnn_outputs, sample_id):
            #     rnn_output: [batch, time, vocab] (time-major swaps
            #     the first two axes); sample_id: [batch, time] int32.
            # For BeamSearchDecoder it is a FinalBeamSearchDecoderOutput
            # namedtuple(predicted_ids, beam_search_decoder_output):
            #     predicted_ids: [batch, time, beam_width]; the
            #     beam_search_decoder_output carries scores/parent_ids.

            # The official tutorial suggests capping the output length
            # at 2x the max source length
            # (https://www.tensorflow.org/tutorials/seq2seq);
            # here 4x is used instead.
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,  # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))

            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id:
                #     [batch_size, max_time_step]
                # (an argmax over rnn_output, or an expand_dims for
                # beam-search shape compatibility, would also work —
                # intentionally left as plain sample_id here).
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id

                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))

            else:
                # Use beam search to approximately find the most
                # likely translation.
                # decoder_pred_decode:
                #     [batch_size, max_time_step, beam_width]
                #     (when output_major=False)
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids

                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))

                # Reorder to [batch_size, beam_width, max_time_step].
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                # Per-beam scores from the beam-search decoder.
                self.beam_prob = dod.beam_search_decoder_output.scores
def _init_decoder(self):
    """Assemble the decoding half of the graph.

    Wraps the decoder RNN in Bahdanau attention, then builds two
    decoders that share weights via the ``"decoding"`` variable scope:
    a teacher-forced one for training (``self.decoder_train``) and a
    greedy one for inference (``self.decoder_inference``).
    """
    # Shift/prepare the target ids for teacher forcing.
    shifted_targets = process_decoding_input(
        self.data_y, self.vocab_to_int_y, self.batch_size)

    # Target-side embedding matrix, initialized uniformly in [-1, 1).
    self.dec_embeddings = tf.Variable(
        tf.random_uniform([self.vocab_size_y, self.embedding_size],
                          -1.0, 1.0),
        dtype=tf.float32)
    embedded_targets = tf.nn.embedding_lookup(
        self.dec_embeddings, shifted_targets)

    with tf.variable_scope("decoder"):
        cell = rnn_cell(self.cell_size, self.dec_num_layers,
                        self.dec_keep_prob)
        # Projection from cell output to target-vocabulary logits.
        projection = Dense(
            self.vocab_size_y,
            kernel_initializer=tf.truncated_normal_initializer(
                mean=0.0, stddev=0.1))
        # Bahdanau (additive) attention over the encoder outputs.
        attention = seq2seq.BahdanauAttention(
            self.cell_size, self.enc_outputs, self.x_length,
            normalize=False)
        cell = seq2seq.DynamicAttentionWrapper(
            cell, attention, attention_size=self.cell_size)
        initial_state = seq2seq.DynamicAttentionWrapperState(
            cell_state=self.enc_states[0],
            attention=_zero_state_tensors(
                self.cell_size, self.batch_size, tf.float32))

    # Training branch: feed the ground-truth token at every step.
    with tf.variable_scope("decoding"):
        teacher_helper = seq2seq.TrainingHelper(
            embedded_targets, sequence_length=self.y_length,
            time_major=False)
        teacher_decoder = seq2seq.BasicDecoder(
            cell, teacher_helper, initial_state, projection)
        teacher_out, _ = seq2seq.dynamic_decode(
            teacher_decoder, output_time_major=False,
            impute_finished=True, maximum_iterations=self.max_length,
            swap_memory=True)
        self.decoder_train = teacher_out.rnn_output

    # Inference branch: reuse the training weights, unroll greedily
    # from START until STOP (or max_length).
    with tf.variable_scope("decoding", reuse=True):
        go_tokens = tf.tile(
            tf.constant([self.vocab_to_int_y[START]], dtype=tf.int32),
            [self.batch_size])
        greedy_helper = seq2seq.GreedyEmbeddingHelper(
            embedding=self.dec_embeddings,
            start_tokens=go_tokens,
            end_token=self.vocab_to_int_y[STOP])
        greedy_decoder = seq2seq.BasicDecoder(
            cell, greedy_helper, initial_state, projection)
        greedy_out, _ = seq2seq.dynamic_decode(
            greedy_decoder, output_time_major=False,
            impute_finished=True, maximum_iterations=self.max_length)
        self.decoder_inference = greedy_out.sample_id

    # Named graph endpoints so the tensors can be fetched by name.
    tf.identity(self.decoder_train, 'decoder_train')
    tf.identity(self.decoder_inference, 'decoder_inference')
def __init__(self, lstm_size, lstm_layers, source_vocab_size,
             enc_embedding_size, tgt_word_to_int, dec_embedding_size,
             tgt_max_length):
    """Build an encoder + greedy-decoder inference graph.

    :param lstm_size:          units per LSTM cell
    :param lstm_layers:        number of stacked LSTM layers
    :param source_vocab_size:  size of the source vocabulary
    :param enc_embedding_size: encoder embedding dimension
    :param tgt_word_to_int:    target word -> id mapping; must contain
                               '<s>' (start) and '</s>' (end) entries
    :param dec_embedding_size: decoder embedding dimension
    :param tgt_max_length:     maximum number of decoding steps
    """
    # Placeholders -------------------------------------------------
    self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    self.targets = tf.placeholder(tf.int32, [None, None],
                                  name='targets')
    self.batch_size = tf.placeholder(tf.int32, [], name='batch_size')
    self.tgt_seq_length = tf.placeholder(tf.int32, [None],
                                         name='tgt_seq_length')
    self.src_seq_length = tf.placeholder(tf.int32, [None],
                                         name='src_seq_length')

    # Encoder ------------------------------------------------------
    with tf.variable_scope('encoder'):
        with tf.variable_scope('embedding'):
            source_embedded = tf.contrib.layers.embed_sequence(
                self.inputs, source_vocab_size, enc_embedding_size)
        with tf.variable_scope('rnn'):
            encoder_cell = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size)
                 for _ in range(lstm_layers)])
            self.initial_state = encoder_cell.zero_state(
                self.batch_size, tf.float32)
            # Only the final state is kept; it seeds the decoder.
            _, self.enc_state = tf.nn.dynamic_rnn(
                encoder_cell, source_embedded,
                sequence_length=self.src_seq_length,
                initial_state=self.initial_state)

    # Decoder ------------------------------------------------------
    tgt_vocab_size = len(tgt_word_to_int)
    with tf.variable_scope('decoder'):
        # Embedding
        with tf.variable_scope('embedding'):
            self.dec_embed = tf.Variable(
                tf.random_uniform([tgt_vocab_size,
                                   dec_embedding_size]))

        # Final classifier: cell output -> vocabulary logits.
        # NOTE(review): classifier_scope is bound but never used.
        with tf.variable_scope('classifier') as classifier_scope:
            self.output_layer = Dense(
                tgt_vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    mean=0.0, stddev=0.1))

        # RNN
        with tf.variable_scope('rnn'):
            self.dec_cell = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size)
                 for _ in range(lstm_layers)])

        # Inference decoder: greedy unrolling from '<s>' to '</s>'.
        with tf.variable_scope('decoder'):
            go_tokens = tf.tile([tgt_word_to_int['<s>']],
                                [self.batch_size])
            greedy_helper = seq2seq.GreedyEmbeddingHelper(
                self.dec_embed, go_tokens, tgt_word_to_int['</s>'])
            greedy_decoder = seq2seq.BasicDecoder(
                self.dec_cell, greedy_helper, self.enc_state,
                self.output_layer)
            decoded, _, _ = seq2seq.dynamic_decode(
                greedy_decoder, impute_finished=True,
                maximum_iterations=tgt_max_length)
            # Named endpoint for fetching predictions by tensor name.
            self.outputs = tf.identity(decoded.sample_id,
                                       'predictions')