def build_decoder(self, encoder_outputs, encoder_final_state):
    """Build the full decoder.

    In ``'train'`` mode: teacher-forced decoding via TrainingHelper and a
    masked sequence loss stored on ``self.loss``.  Otherwise: beam-search
    inference whose predicted ids are stored on ``self.decoder_pred_decode``.

    :param encoder_outputs: encoder outputs passed to the decoder-cell builder
    :param encoder_final_state: final encoder state used to initialize decoding
    :return: None -- results are attached to ``self``
    """
    with tf.variable_scope("decode"):
        decoder_cell, decoder_initial_state = self.build_decoder_cell(
            encoder_outputs, encoder_final_state, self.hidden_size,
            self.cell_type, self.layer_size)
        # Output projection: decoder hidden state -> vocabulary logits.
        decoder_output_projection = layers.Dense(
            self.decoder_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1),
            name='decoder_output_projection')
        if self.mode == 'train':
            # Training mode: feed ground-truth inputs (teacher forcing).
            decoder_inputs_embdedded = tf.nn.embedding_lookup(
                self.decoder_embeddings, self.decoder_inputs_train)
            training_helper = TrainingHelper(
                inputs=decoder_inputs_embdedded,
                sequence_length=self.decoder_inputs_length,
                name='training_helper')
            training_decoder = BasicDecoder(decoder_cell, training_helper,
                                            decoder_initial_state,
                                            decoder_output_projection)
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                training_decoder, maximum_iterations=max_decoder_length)
            # Mask out padded time steps so they do not contribute to loss.
            self.masks = tf.sequence_mask(self.decoder_inputs_length,
                                          maxlen=max_decoder_length,
                                          dtype=tf.float32,
                                          name='masks')
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=training_decoder_output.rnn_output,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)
        else:
            # Prediction mode: beam-search decoding from START to END tokens.
            start_token = [DataUnit.START_INDEX] * self.batch_size
            end_token = DataUnit.END_INDEX
            inference_decoder = BeamSearchDecoder(
                cell=decoder_cell,
                embedding=lambda x: tf.nn.embedding_lookup(
                    self.decoder_embeddings, x),
                start_tokens=start_token,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=self.beam_width,
                output_layer=decoder_output_projection)
            inference_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                inference_decoder, maximum_iterations=self.max_decode_step)
            self.decoder_pred_decode = inference_decoder_output.predicted_ids
            # [batch, time, beam] -> [batch, beam, time]
            self.decoder_pred_decode = tf.transpose(
                self.decoder_pred_decode, perm=[0, 2, 1])
def decoder(x, decoder_inputs, keep_prob, sequence_length, memory,
            memory_length, first_attention):
    """Attention decoder building both a teacher-forced training decoder
    and a beam-search inference decoder that share one LSTM cell and one
    label-embedding table.

    NOTE(review): relies on module-level globals (n_classes,
    embedding_size, n_hidden, train_batch_size, test_batch_size,
    beam_width, test_len, GO, EOS) -- confirm they are defined in the
    enclosing module.
    """
    with tf.variable_scope("Decoder") as scope:
        label_embeddings = tf.get_variable(name="embeddings",
                                           shape=[n_classes, embedding_size],
                                           dtype=tf.float32)
        train_inputs_embedded = tf.nn.embedding_lookup(label_embeddings,
                                                       decoder_inputs)
        lstm = rnn.LayerNormBasicLSTMCell(n_hidden,
                                          dropout_keep_prob=keep_prob)
        output_l = layers_core.Dense(n_classes, use_bias=True)
        # The encoder vector x is used as both c and h of the LSTM state.
        encoder_state = rnn.LSTMStateTuple(x, x)
        attention_mechanism = BahdanauAttention(
            embedding_size, memory=memory,
            memory_sequence_length=memory_length)
        cell = AttentionWrapper(lstm, attention_mechanism,
                                output_attention=False)
        cell_state = cell.zero_state(dtype=tf.float32,
                                     batch_size=train_batch_size)
        cell_state = cell_state.clone(cell_state=encoder_state,
                                      attention=first_attention)
        train_helper = TrainingHelper(train_inputs_embedded, sequence_length)
        train_decoder = BasicDecoder(cell, train_helper, cell_state,
                                     output_layer=output_l)
        decoder_outputs_train, decoder_state_train, decoder_seq_train = dynamic_decode(
            train_decoder, impute_finished=True)
        # Inference path: tile memory / lengths / initial attention per beam.
        tiled_inputs = tile_batch(memory, multiplier=beam_width)
        tiled_sequence_length = tile_batch(memory_length,
                                           multiplier=beam_width)
        tiled_first_attention = tile_batch(first_attention,
                                           multiplier=beam_width)
        attention_mechanism = BahdanauAttention(
            embedding_size, memory=tiled_inputs,
            memory_sequence_length=tiled_sequence_length)
        x2 = tile_batch(x, beam_width)
        encoder_state2 = rnn.LSTMStateTuple(x2, x2)
        # Same lstm cell as training, wrapped with the tiled attention.
        cell = AttentionWrapper(lstm, attention_mechanism,
                                output_attention=False)
        cell_state = cell.zero_state(
            dtype=tf.float32, batch_size=test_batch_size * beam_width)
        cell_state = cell_state.clone(cell_state=encoder_state2,
                                      attention=tiled_first_attention)
        infer_decoder = BeamSearchDecoder(cell,
                                          embedding=label_embeddings,
                                          start_tokens=[GO] * test_len,
                                          end_token=EOS,
                                          initial_state=cell_state,
                                          beam_width=beam_width,
                                          output_layer=output_l)
        # Decoding capped at 4 steps -- presumably a short label sequence;
        # TODO confirm this limit against the task definition.
        decoder_outputs_infer, decoder_state_infer, decoder_seq_infer = dynamic_decode(
            infer_decoder, maximum_iterations=4)
        return decoder_outputs_train, decoder_outputs_infer, decoder_state_infer
def build_validation_graph(self, validation_data, beam_width=1):
    """Build the same graph as the training graph on validation data so
    metrics can be computed on unseen examples.

    :param validation_data: tuple of (subject, len_subject, content,
        len_content, target_input, target_output, len_target)
    :param beam_width: beam width parameter
    :return: (avg_score, greedy_score, extreme_score, val_seq_len,
        validation_outputs)
    """
    # Unpack subject, content and answers and corresponding length
    subject, len_subject, content, len_content, target_input, target_output, len_target = validation_data
    # Choose best answer per question
    target_output = target_output[:, 0, :]
    len_target = tf.reshape(tf.to_int32(len_target[:, 0]), [-1])
    # Concat subject and content to feed it into encoder
    sub_cont_concat_op = tf.map_fn(
        self.concat_seqs, [subject, len_subject, content, len_content])[0]
    len_both = len_subject + len_content
    # Build initial graph with concatted subject and content and sequence
    # length. Get back decoder cell and attention zero state.
    # reuse=True: shares the encoder variables built by the train graph.
    decoder_cell, attn_zero_state = self.build_initial_graph(
        sub_cont_concat_op, len_both, reuse=True, beam_width=beam_width)
    # Build beam search decoder
    decoder = BeamSearchDecoder(decoder_cell, self.embeddings_english,
                                self.start_token, self.end_token,
                                attn_zero_state, beam_width,
                                output_layer=self.projection_layer)
    # Run dynamic decode inside the train decoder's variable scope,
    # reusing its variables (the scope already exists in the train graph).
    with tf.variable_scope("train_decoder", reuse=True):
        outputs, _, val_seq_len = dynamic_decode(
            decoder, output_time_major=False,
            maximum_iterations=self.max_seq_len)
    # Take only first (best) beam of the search output.
    validation_outputs = tf.transpose(outputs.predicted_ids, [2, 0, 1])
    validation_outputs = tf.reshape(validation_outputs[0, :, :],
                                    [self.batch_size, -1])
    val_seq_len = tf.transpose(val_seq_len)
    val_seq_len = tf.reshape(val_seq_len[0, :], [-1])
    # Calculate metric scores
    avg_score, greedy_score, extreme_score = self.metrics_module(
        validation_outputs, val_seq_len, target_output, len_target)
    return avg_score, greedy_score, extreme_score, val_seq_len, validation_outputs
def build_infer_graph(self, beam_width=1, reuse=False):
    """Build the inference graph for decoding unseen data.

    :param beam_width: beam width used by the beam-search decoder
    :param reuse: whether to reuse variables from the training graph
    :return: list of the four input placeholders and the beam output ids
    """
    # Encoder input placeholders.
    subject_ph = tf.placeholder(shape=(self.batch_size, None),
                                dtype=tf.int32, name='subject')
    content_ph = tf.placeholder(shape=(self.batch_size, None),
                                dtype=tf.int32, name='content')
    len_subject_ph = tf.placeholder(shape=(None, ), dtype=tf.int32,
                                    name='sub_len')
    len_content_ph = tf.placeholder(shape=(None, ), dtype=tf.int32,
                                    name='cont_len')
    # Subject and content are concatenated into one encoder input;
    # their lengths simply add up.
    concatenated = tf.map_fn(
        self.concat_seqs,
        [subject_ph, len_subject_ph, content_ph, len_content_ph])[0]
    combined_len = len_subject_ph + len_content_ph
    # Initial graph yields the decoder cell and attention zero state.
    cell, zero_state = self.build_initial_graph(concatenated,
                                                combined_len,
                                                beam_width=beam_width,
                                                reuse=reuse)
    searcher = BeamSearchDecoder(cell, self.embeddings_english,
                                 self.start_token, self.end_token,
                                 zero_state, beam_width,
                                 output_layer=self.projection_layer)
    # Decode under the training decoder's scope so weights are shared.
    with tf.variable_scope("train_decoder", reuse=reuse):
        outputs, _, _ = dynamic_decode(searcher,
                                       output_time_major=False,
                                       maximum_iterations=self.max_seq_len)
    # [batch, time, beam] -> [beam, batch, time] for readable output.
    beam_outputs = tf.transpose(outputs.predicted_ids, [2, 0, 1])
    return [subject_ph, content_ph, len_subject_ph, len_content_ph], beam_outputs
def setup_decoder(self):
    """Create the beam-search decoder plus its (tiled) zero initial state."""
    tiled_batch = self.batch_size * self.beam_width
    self.dec_init_state = self.cell.zero_state(tiled_batch,
                                               dtype=tf.float32)
    # Start tokens are all id 0; end_token is -1.
    # NOTE(review): -1 presumably never appears in outputs, so decoding
    # only stops at the iteration cap -- confirm these token ids.
    starts = tf.tile([0], [self.batch_size])
    projection = tf.layers.Dense(self.vocab_size)
    self.decoder = BeamSearchDecoder(
        cell=self.cell,
        embedding=self.embedding,
        start_tokens=starts,
        end_token=-1,
        initial_state=self.dec_init_state,
        beam_width=self.beam_width,
        output_layer=projection)
def decoder(self, encoder_outputs, encoder_states):
    """Build the decoder: teacher-forced training branch with a masked
    sequence loss, or beam-search inference branch, selected by
    ``self.mode``.  Results are stored on ``self`` (``self.loss`` /
    ``self.decoder_output``).
    """
    decoder_cell, decoder_init_state = self.add_decoder_cell(
        encoder_outputs, encoder_states, self.hidden_size, self.cell_type,
        self.num_layers)
    # Projection from decoder hidden state to target-vocabulary logits.
    output_proj = tf.layers.Dense(
        self.tgt_vcb_size,
        dtype=tf.float32,
        use_bias=False,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
        name='output_proj')
    if self.mode == 'train':
        target_embedding = tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  self.decoder_input_train)
        training_helper = TrainingHelper(target_embedding,
                                         self.target_len,
                                         name='training_helper')
        training_decoder = BasicDecoder(decoder_cell, training_helper,
                                        decoder_init_state, output_proj)
        max_dec_len = tf.reduce_max(self.target_len)
        output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            training_decoder, maximum_iterations=max_dec_len)
        # Mask padded positions out of the loss.
        self.d_masks = tf.sequence_mask(self.target_len, max_dec_len,
                                        dtype=tf.float32, name='d_masks')
        self.prob = output.rnn_output
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=self.prob,
            targets=self.target,
            weights=self.d_masks,
            average_across_timesteps=True,
            average_across_batch=True)
    else:
        start_token = [DataUnit.START_INDEX] * self.batch_size
        end_token = DataUnit.END_INDEX
        inference_decoder = BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda x: tf.nn.embedding_lookup(
                self.decoder_embeddings, x),
            start_tokens=start_token,
            end_token=end_token,
            initial_state=decoder_init_state,
            beam_width=self.beam_size,
            output_layer=output_proj)
        output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            inference_decoder, maximum_iterations=self.max_decode_step)
        output_pred_ = output.predicted_ids
        # [batch, time, beam] -> [batch, beam, time]
        self.decoder_output = tf.transpose(output_pred_, perm=[0, 2, 1])
def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    """Build a beam-search inference decoder over tiled encoder outputs.

    :param enc_outputs: encoder outputs used as the attention memory
    :param seq_len: encoder sequence lengths
    :param embeddings: embedding table (or callable) for decoder inputs
    :param out_dim: size of the final output projection
    :return: the decoder outputs produced by dynamic_decode
    """
    # Tile memory and lengths so each beam sees its own copy.
    tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs,
                                                      hp.beam_width)
    tiled_seq_len = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)
    beam_batch_size = tf.shape(tiled_enc_outputs)[0]
    # start tokens, end token (one start token per original batch item)
    start_tokens = tf.tile([hp.START_TOKEN],
                           [beam_batch_size // hp.beam_width])
    end_token = hp.END_TOKEN
    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=False,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(
        hp.embed_size, tiled_enc_outputs, normalize=True,
        memory_sequence_length=tiled_seq_len,
        probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs, attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    # Projected attention output followed by two residual GRU layers.
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ], state_is_tuple=True)
    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=beam_batch_size,
                                           dtype=tf.float32)
    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=end_token,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, t1, t2 = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=hp.max_len)
    return outputs
def _build_infer(self, config):
    """Build the beam-search inference decoder.

    Stores ``self.preds`` (predicted label ids) and ``self.scores``
    (beam log-probabilities).
    """
    # infer_decoder/beam_search
    # skip for flat_baseline
    # Tile memory, lengths and the initial attention once per beam.
    tiled_inputs = tile_batch(self.xx_context,
                              multiplier=config.beam_width)
    tiled_sequence_length = tile_batch(self.x_seq_length,
                                       multiplier=config.beam_width)
    tiled_first_attention = tile_batch(self.first_attention,
                                       multiplier=config.beam_width)
    attention_mechanism = BahdanauAttention(
        config.decode_size, memory=tiled_inputs,
        memory_sequence_length=tiled_sequence_length)
    tiled_xx_final = tile_batch(self.xx_final, config.beam_width)
    # The final encoder vector serves as both c and h of the LSTM state.
    encoder_state2 = rnn.LSTMStateTuple(tiled_xx_final, tiled_xx_final)
    cell = AttentionWrapper(self.lstm, attention_mechanism,
                            output_attention=False)
    cell_state = cell.zero_state(
        dtype=tf.float32,
        batch_size=config.test_batch_size * config.beam_width)
    cell_state = cell_state.clone(cell_state=encoder_state2,
                                  attention=tiled_first_attention)
    infer_decoder = BeamSearchDecoder(
        cell,
        embedding=self.label_embeddings,
        start_tokens=[config.GO] * config.test_batch_size,
        end_token=config.EOS,
        initial_state=cell_state,
        beam_width=config.beam_width,
        output_layer=self.output_l)
    decoder_outputs_infer, decoder_state_infer, decoder_seq_infer = dynamic_decode(
        infer_decoder, maximum_iterations=config.max_seq_length)
    self.preds = decoder_outputs_infer.predicted_ids
    self.scores = decoder_state_infer.log_probs
def decoder_decode(self, decoder_cell, decoder_initial_state, output_layer):
    """Build the prediction decoder: beam search when ``self.beam_search``
    is set, otherwise greedy decoding.

    :param decoder_cell: the decoder RNN cell
    :param decoder_initial_state: initial state for the decoder cell
    :param output_layer: projection layer producing vocabulary logits
    :return: predicted token ids, shape includes a trailing beam axis
    """
    # Every sequence starts with the <GO> marker.
    start_tokens = tf.ones([
        self.batch_size,
    ], tf.int32) * self.word_to_idx['<GO>']
    # Every sequence ends with the <EOS> marker.
    end_token = self.word_to_idx['<EOS>']
    # With beam search enabled, decode using a BeamSearchDecoder.
    if self.beam_search:
        inference_decoder = BeamSearchDecoder(
            cell=decoder_cell,
            embedding=self.embedding,
            start_tokens=start_tokens,
            end_token=end_token,
            initial_state=decoder_initial_state,
            beam_width=self.beam_size,
            output_layer=output_layer)
    else:
        # Without beam search, use the GreedyEmbeddingHelper...
        decoding_helper = GreedyEmbeddingHelper(embedding=self.embedding,
                                                start_tokens=start_tokens,
                                                end_token=end_token)
        # ...with a BasicDecoder.
        inference_decoder = BasicDecoder(
            cell=decoder_cell,
            helper=decoding_helper,
            initial_state=decoder_initial_state,
            output_layer=output_layer)
    # dynamic_decode arguments:
    #   decoder: a BasicDecoder, BeamSearchDecoder or custom decoder object
    #   output_time_major: as with RNNs -- True yields [step, batch, ...],
    #       False yields [batch, step, ...]
    #   impute_finished: when True, copies the last state and zeroes
    #       outputs past each sequence's end; gives correct final
    #       states/outputs and more stable backprop, but runs slower
    #   maximum_iterations: cap on decoding steps; usually the target
    #       length in training, any desired maximum at prediction time --
    #       decoding stops at <eos> or at this limit
    decoder_outputs, _, _ = dynamic_decode(decoder=inference_decoder,
                                           maximum_iterations=50)
    if self.beam_search:
        # Beam search emits predicted_ids directly.
        decoder_predict_decode = decoder_outputs.predicted_ids
    else:
        # Append a trailing axis so the greedy output's shape matches the
        # beam-search output (beam dimension of size 1).
        decoder_predict_decode = tf.expand_dims(decoder_outputs.sample_id,
                                                -1)
    return decoder_predict_decode
def build_predict_decoder(self):
    """Assemble the prediction decoder (beam search or greedy) and return
    the predicted token ids."""
    go_id = self.word_to_id['<GO>']
    eos_id = self.word_to_id['<EOS>']
    start_tokens = tf.ones([
        self.batch_size,
    ], tf.int32) * go_id
    cell, initial_state = self.build_decoder_cell()
    projection = tf.layers.Dense(
        self.vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))
    if self.beam_search:
        predict_decoder = BeamSearchDecoder(cell=cell,
                                            embedding=self.embedding,
                                            start_tokens=start_tokens,
                                            end_token=eos_id,
                                            initial_state=initial_state,
                                            beam_width=self.beam_size,
                                            output_layer=projection)
    else:
        # Greedy decoding feeds back the argmax token at each step.
        helper = GreedyEmbeddingHelper(embedding=self.embedding,
                                       start_tokens=start_tokens,
                                       end_token=eos_id)
        predict_decoder = BasicDecoder(cell=cell,
                                       helper=helper,
                                       initial_state=initial_state,
                                       output_layer=projection)
    outputs, _, _ = dynamic_decode(decoder=predict_decoder,
                                   maximum_iterations=50)
    if self.beam_search:
        # Beam search already returns ids with a beam dimension.
        return outputs.predicted_ids
    # Add a trailing axis so greedy output matches the beam shape.
    return tf.expand_dims(outputs.sample_id, -1)
def _decode():
    """Run beam-search decoding; return (outputs, final_state, lengths)."""
    beam_decoder = BeamSearchDecoder(
        cell=cell,
        embedding=embedding,
        start_tokens=start_tokens,
        end_token=end_token,
        initial_state=initial_state,
        beam_width=beam_width,
        output_layer=output_layer,
        length_penalty_weight=length_penalty_weight)
    # The step cap is controlled via `max_decoding_length`; forbid the
    # raw dynamic_decode knob so the two cannot silently conflict.
    if 'maximum_iterations' in kwargs:
        raise ValueError('Use `max_decoding_length` to set the maximum '
                         'allowed number of decoding steps.')
    outputs, final_state, _ = dynamic_decode(
        decoder=beam_decoder,
        output_time_major=output_time_major,
        maximum_iterations=max_decoding_length,
        **kwargs)
    return outputs, final_state, final_state.lengths
def build_predict_decoder(self):
    """Create the inference decoder and store its predicted ids on
    ``self.decoder_predict_decode``."""
    print('Building predict decoder...')
    go = self.word_to_id['<GO>']
    eos = self.word_to_id['<EOS>']
    start_tokens = tf.ones([self.batch_size, ], tf.int32) * go
    if self.beam_search:
        # Beam search keeps beam_width hypotheses per batch element.
        predictor = BeamSearchDecoder(
            cell=self.decoder_cell,
            embedding=self.embedding,
            start_tokens=start_tokens,
            end_token=eos,
            initial_state=self.decoder_initial_state,
            beam_width=self.beam_size,
            output_layer=self.output_layer
        )
    else:
        # Greedy: the argmax of the output logits is embedded and fed
        # back as the next decoder input; start_tokens is an int32
        # vector [batch_size] and end_token marks end of decoding.
        helper = GreedyEmbeddingHelper(
            embedding=self.embedding,
            start_tokens=start_tokens,
            end_token=eos
        )
        predictor = BasicDecoder(
            cell=self.decoder_cell,
            helper=helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.output_layer
        )
    outputs, _, _ = dynamic_decode(decoder=predictor,
                                   maximum_iterations=50)
    if self.beam_search:
        # predicted_ids: [batch_size, num_steps, beam_width], beams
        # ordered from best to worst.
        self.decoder_predict_decode = outputs.predicted_ids
    else:
        self.decoder_predict_decode = tf.expand_dims(outputs.sample_id, -1)
def interface_beamsearch(self, enc_outputs, enc_state, e_size):
    """Build and return a BeamSearchDecoder over the beam-tiled encoder
    outputs/state, using Bahdanau attention."""
    width = self.beam_width
    # Replicate memory, nested state and lengths once per beam.
    tiled_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs,
                                                  multiplier=width)
    tiled_state = nest.map_structure(
        lambda s: tf.contrib.seq2seq.tile_batch(s, width), enc_state)
    tiled_len = tf.contrib.seq2seq.tile_batch(e_size, multiplier=width)
    attention = tf.contrib.seq2seq.BahdanauAttention(
        self.embedding_dim, tiled_outputs,
        memory_sequence_length=tiled_len)
    wrapped_cell = tf.contrib.seq2seq.AttentionWrapper(
        self.dec_cell, attention,
        attention_layer_size=self.embedding_dim,
        cell_input_fn=self.cell_input_fn)
    # Zero state at beam batch size, seeded with the tiled encoder state.
    start_state = wrapped_cell.zero_state(
        self.dec_batch_size * width,
        tf.float32).clone(cell_state=tiled_state)
    return BeamSearchDecoder(wrapped_cell, self.decoder_embedding,
                             self.dec_batch_inputs, self.end, start_state,
                             width, self.dense_before_softmax)
def build_predict_decoder(self):
    """Build the inference decoder (beam search or greedy) and store the
    predicted ids on ``self.decoder_predict_decode``."""
    print('Building predict decoder...')
    start_ids = tf.ones([
        self.batch_size,
    ], tf.int32) * self.word_to_id['<GO>']
    stop_id = self.word_to_id['<EOS>']
    if not self.beam_search:
        # Greedy decoding: feed back the argmax token at each step.
        helper = GreedyEmbeddingHelper(embedding=self.embedding_t,
                                       start_tokens=start_ids,
                                       end_token=stop_id)
        searcher = BasicDecoder(cell=self.decoder_cell,
                                helper=helper,
                                initial_state=self.decoder_initial_state,
                                output_layer=self.output_layer)
    else:
        searcher = BeamSearchDecoder(cell=self.decoder_cell,
                                     embedding=self.embedding_t,
                                     start_tokens=start_ids,
                                     end_token=stop_id,
                                     initial_state=self.decoder_initial_state,
                                     beam_width=self.beam_size,
                                     output_layer=self.output_layer)
    results, _, _ = dynamic_decode(decoder=searcher,
                                   maximum_iterations=50)
    if self.beam_search:
        self.decoder_predict_decode = results.predicted_ids
    else:
        # Trailing axis makes the greedy shape match the beam output.
        self.decoder_predict_decode = tf.expand_dims(results.sample_id, -1)
def __init__(self, vocab_size, learning_rate, encoder_size, max_length,
             embedding_size, sos_token, eos_token, unk_token, beam_size=5):
    """Build the full seq2seq graph: placeholders and embeddings, query
    and reply encoders, an attention decoder, the training loss and
    optimizer, and both beam-search and greedy inference decoders.

    :param vocab_size: size of the shared vocabulary
    :param learning_rate: Adam learning rate
    :param encoder_size: GRU encoder hidden size
    :param max_length: maximum decoding length at inference time
    :param embedding_size: word-embedding dimensionality
    :param sos_token: start-of-sequence token id
    :param eos_token: end-of-sequence token id
    :param unk_token: unknown-word token id
    :param beam_size: beam width (also used to tile the training batch)
    """
    self.vocab_size = vocab_size
    self.lr = learning_rate
    self.encoder_size = encoder_size
    self.max_length = max_length
    self.embedding_size = embedding_size
    self.SOS_token = sos_token
    self.EOS_token = eos_token
    self.UNK_token = unk_token
    self.beam_search_size = beam_size
    with tf.variable_scope('placeholder_and_embedding'):
        self.query = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.reply = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.decoder_inputs = tf.placeholder(shape=(None, None),
                                             dtype=tf.int32)
        self.decoder_target = tf.placeholder(shape=(None, None),
                                             dtype=tf.int32)
        self.decoder_length = tf.placeholder(shape=(None, ),
                                             dtype=tf.int32)
        self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
        # Pretrained embeddings are fed through this placeholder and
        # copied into the variable by running self.init_embedding.
        self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                           shape=(self.vocab_size,
                                                  embedding_size),
                                           name='embedding_source_pl')
        word_embedding = tf.get_variable(name='word_embedding',
                                         shape=(self.vocab_size,
                                                embedding_size),
                                         dtype=tf.float32,
                                         trainable=True)
        self.init_embedding = word_embedding.assign(self.embedding_pl)
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')
        # Mask of valid (non-padding) target positions, used in the loss.
        self.mask = tf.sequence_mask(self.decoder_length,
                                     self.max_target_sequence_length,
                                     dtype=tf.float32, name='masks')
    with tf.variable_scope("query_encoder"):
        self.query_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        query_out, query_state = self.query_encoder(
            seq_index=self.query, seq_len=self.query_length)
    with tf.variable_scope("reply_encoder"):
        self.reply_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        reply_out, reply_state = self.reply_encoder(
            seq_index=self.reply, seq_len=self.reply_length)
    with tf.variable_scope("decoder"):
        # Concatenate both encoders' final states as the decoder init;
        # attention memory is the query encoder's outputs only.
        combined_encoder_state = tf.concat([query_state, reply_state],
                                           axis=1)
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            combined_encoder_state, multiplier=self.beam_search_size)
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
            query_out, multiplier=self.beam_search_size)
        tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
            self.query_length, multiplier=self.beam_search_size)
        decoder_cell = deep_components.AttentionGRUCell(
            memory=tiled_encoder_outputs,
            memory_size=self.encoder_size,
            attention_size=self.encoder_size,
            embedding_dims=self.embedding_size,
            rnn_units=self.encoder_size * 2)
        '''decoder_gru = GRUCell(self.encoder_size * 2)
        attention_mechanism = BahdanauAttention(
            num_units=self.encoder_size,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)
        attention_cell = AttentionWrapper(decoder_gru, attention_mechanism,
                                          attention_layer_size=self.encoder_size)
        decoder_initial_state_beam = attention_cell.zero_state(
            dtype=tf.float32,
            batch_size=tf.cast(self.batch_size * self.beam_search_size,dtype=tf.int32)).clone(
            cell_state=tiled_encoder_final_state)'''
        #############################
        #attention_cell=decoder_gru
        #decoder_initial_state_beam = tiled_encoder_final_state
        ##############################
        decode_out_layer = tf.layers.Dense(self.vocab_size,
                                           name='output_layer',
                                           _reuse=tf.AUTO_REUSE)
    with tf.variable_scope("seq2seq-train"):
        # train
        # Training inputs/targets are tiled so shapes line up with the
        # beam-tiled decoder cell built above.
        self.tiled_d_in = tile_batch(self.decoder_inputs,
                                     multiplier=self.beam_search_size)
        self.tiled_d_tgt = tile_batch(self.decoder_target,
                                      multiplier=self.beam_search_size)
        train_helper = TrainingHelper(
            tf.contrib.seq2seq.tile_batch(
                tf.nn.embedding_lookup(word_embedding,
                                       self.decoder_inputs),
                multiplier=self.beam_search_size),
            sequence_length=tile_batch(self.decoder_length,
                                       multiplier=self.beam_search_size),
            name="train_helper")
        train_decoder = BasicDecoder(
            decoder_cell, train_helper,
            initial_state=tiled_encoder_final_state,
            output_layer=decode_out_layer)
        self.dec_output, _, self.gen_len = dynamic_decode(
            train_decoder, impute_finished=True,
            maximum_iterations=self.max_target_sequence_length)
        #self.gen_max_len=tf.reduce_max(self.gen_len)
        #self.padding=tf.zeros(shape=(self.batch_size,self.max_length-self.gen_max_len,self.vocab_size),dtype=tf.float32)
        #self.padding=tile_batch(self.padding,multiplier=self.beam_search_size)
        self.dec_logits = tf.identity(self.dec_output.rnn_output)
        #self.dec_logits = tf.concat((self.dec_logits,self.padding),axis=1)
        self.decoder_target_mask = tile_batch(
            self.mask, multiplier=self.beam_search_size)
        self.cost = sequence_loss(
            self.dec_logits,
            tile_batch(self.decoder_target,
                       multiplier=self.beam_search_size),
            self.decoder_target_mask)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.lr).minimize(self.cost)
    with tf.variable_scope("seq2seq_beam_search_generate"):
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.SOS_token
        beam_infer_decoder = BeamSearchDecoder(
            decoder_cell,
            embedding=word_embedding,
            end_token=self.EOS_token,
            start_tokens=start_tokens,
            initial_state=tiled_encoder_final_state,
            beam_width=self.beam_search_size,
            output_layer=decode_out_layer)
        self.bs_outputs, _, _ = dynamic_decode(
            beam_infer_decoder, maximum_iterations=self.max_length)
    with tf.variable_scope("greedy_generate"):
        decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=word_embedding, start_tokens=start_tokens,
            end_token=self.EOS_token)
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=decoder_cell, helper=decoding_helper,
            initial_state=tiled_encoder_final_state,
            output_layer=decode_out_layer)
        self.greedy_outputs, _, _ = dynamic_decode(
            inference_decoder, maximum_iterations=self.max_length)
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    In ``'train'`` mode: teacher-forced decoding, masked sequence losses
    (including a reward-weighted variant).  In ``'decode'`` mode: greedy
    or beam-search inference depending on ``self.use_beamsearch_decode``.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
             encoder_outputs, encoder_state)
        # Decoder embedding: shared with the encoder, loaded from
        # pretrained weights via a placeholder, or freshly initialized.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')
            # The output_layer is deliberately NOT applied inside the
            # decoder during training: doing so would run the projection
            # at every time step, which is slow.  It is applied once on
            # the full output below.  For this trick to work, the `scope`
            # argument of dynamic_decode must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
            )
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (
                outputs,
                self.final_state,  # contain attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            # Project all time steps to vocabulary logits in one pass.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32, name='masks')
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # The variables below support special training schemes:
            # custom rewards are applied by re-weighting the masks.
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Prediction mode (no training).
            start_tokens = tf.tile([WordSequence.START], [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Embedding-lookup wrapper for the decoder input layer."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode at most 4x the input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,   # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # [batch, time, beam] -> [batch, beam, time]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_decoder(self, encoder_output, encoder_state):
    """
    Build the decoder graph (training and decoding modes).

    :param encoder_output: encoder outputs, consumed by build_decoder_cell
        (e.g. as attention memory).
    :param encoder_state: final encoder state, used to derive the decoder's
        initial state.
    :return: None. Side effects: sets self.decoder_cell,
        self.decoder_initial_state, self.decoder_embeddings and, depending on
        self.mode, the training losses (self.loss, self.loss_rewards,
        self.add_loss) or the decode outputs (self.decoder_pred_decode,
        self.beam_prob).
    """
    # Single variable scope so the decoder's parameters group together
    # (also makes the graph easier to inspect in TensorBoard).
    with tf.variable_scope('decoder') as decoder_scope:
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_output, encoder_state)

        # Decoder embedding matrix: either shared with the encoder, loaded
        # from a pretrained matrix via a placeholder/assign pair, or created
        # fresh with self.initializer.
        with tf.device(get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrain_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)
                    ),
                    trainable=True,
                    name='embeddings'
                )
                self.decoder_embeddings_placeholder = tf.placeholder(
                    dtype=tf.float32,
                    shape=(self.target_vocab_size, self.embedding_size),
                )
                # Run this op (feeding the placeholder) to load the
                # pretrained embedding weights at session time.
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )

        # Output projection: a bias-free fully connected layer mapping cell
        # outputs onto the target vocabulary.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )

        if self.mode == 'train':
            # Teacher forcing: feed the embedded ground-truth tokens.
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))

            # seq2seq helper that feeds the next ground-truth input at
            # every step.
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state
            )
            # Longest target sequence in the current batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (
                outputs,
                self.final_state,
                final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output
            )
            # masks distinguishes real tokens (1.0) from padding (0.0) so
            # padded steps do not contribute to sequence_loss.
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                # FIX: the original read tf.transpose(decoder_logits_train(1, 0, 2))
                # -- a missing comma made it *call* the tensor instead of
                # transposing it, raising a TypeError when time_major is set.
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            # NOTE: the attribute names below ('decoder_pre_train',
            # 'tran_entropy') and the op name 'deocder_pred_train' keep their
            # original (misspelled) forms for backward compatibility with
            # existing callers and checkpoints.
            self.decoder_pre_train = tf.argmax(
                decoder_logits_train,
                axis=-1,
                name='deocder_pred_train'
            )
            self.tran_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_inputs,
                logits=decoder_logits_train
            )
            # Reward-weighted masks for policy-gradient style training.
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True
            )
            self.add_loss = self.add_loss + self.loss
        elif self.mode == 'decode':
            # Inference: start every batch row from the START token.
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Embed predicted token ids for the next decode step."""
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            if not self.use_beamsearch_decode:
                # Greedy decoding: argmax of each step's output is embedded
                # and fed back as the next input.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj
                )
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # BeamSearchDecoder expects an initial_state already tiled
                # beam_width times (done in build_decoder_cell).
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection
                )

            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                # Default decode budget: 4x the longest input length.
                max_decoder_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4
                )
            (
                self.decoder_outputs_decode,
                self.final_state,
                final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                # FIX: maximum_iterations was mistakenly given
                # self.parallel_iterations, silently ignoring the computed
                # max_decoder_step. Wire both arguments correctly.
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2)
                    )
                # [batch, time, beam] -> [batch, beam, time]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1]
                )
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_decoder(self, encoder_outputs, encoder_final_state):
    """
    Assemble the complete decoder on top of the encoder results.

    In 'train' mode a teacher-forcing decoder is built and the masked
    sequence loss is stored in self.loss. In any other mode a beam-search
    decoder is built and its predictions are stored in
    self.decoder_pred_decode with shape [batch, beam_width, time].
    """
    with tf.variable_scope("decode"):
        cell, initial_state = self.build_decoder_cell(
            encoder_outputs, encoder_final_state, self.hidden_size,
            self.cell_type, self.layer_size)

        # Bias-free projection from cell outputs onto the target vocabulary.
        projection = layers.Dense(
            self.decoder_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1),
            name='decoder_output_projection')

        if self.mode == 'train':
            # Teacher forcing: embed the ground-truth target tokens and feed
            # them step by step via TrainingHelper (batch-major input of
            # shape [batch, time, embed]).
            embedded_targets = tf.nn.embedding_lookup(
                self.decoder_embeddings, self.decoder_inputs_train)
            helper = TrainingHelper(
                inputs=embedded_targets,
                sequence_length=self.decoder_inputs_length,
                name='training_helper')
            # BasicDecoder wires cell + helper + initial state + projection;
            # dynamic_decode then steps it like tf.nn.dynamic_rnn.
            train_decoder = BasicDecoder(cell, helper, initial_state,
                                         projection)

            longest = tf.reduce_max(self.decoder_inputs_length)
            train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                train_decoder, maximum_iterations=longest)

            # 1.0 on real tokens, 0.0 on padding, so padded steps are
            # excluded from the loss.
            self.masks = tf.sequence_mask(self.decoder_inputs_length,
                                          maxlen=longest,
                                          dtype=tf.float32,
                                          name='masks')
            # sequence_loss: logits [batch, time, vocab],
            # targets [batch, time] (not one-hot), weights = self.masks.
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=train_outputs.rnn_output,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)
        else:
            # Inference: beam search starting from START for every batch row.
            start_tokens = [DataUnit.START_INDEX] * self.batch_size
            stop_token = DataUnit.END_INDEX

            def lookup(ids):
                """Embed predicted ids for the next beam-search step."""
                return tf.nn.embedding_lookup(self.decoder_embeddings, ids)

            beam_decoder = BeamSearchDecoder(
                cell=cell,
                embedding=lookup,
                start_tokens=start_tokens,
                end_token=stop_token,
                initial_state=initial_state,
                beam_width=self.beam_width,
                output_layer=projection)
            beam_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                beam_decoder, maximum_iterations=self.max_decode_step)

            # predicted_ids is [batch, time, beam]; expose it as
            # [batch, beam, time] for downstream consumers.
            self.decoder_pred_decode = beam_outputs.predicted_ids
            self.decoder_pred_decode = tf.transpose(
                self.decoder_pred_decode, perm=[0, 2, 1])
def predict(self, encoder_output, encoder_state):
    """
    Build the inference (prediction) decoder.

    :param encoder_output: encoder outputs, consumed by build_decoder_cell.
    :param encoder_state: final encoder state, used to derive the decoder's
        initial state.
    :return: (pred_ids, final_sequence_lengths) for greedy decoding, or
        (pred_ids, beam_prob, final_sequence_lengths) when beam search is
        enabled; pred_ids is [batch, beam, time] in the beam-search case.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        ##### decoder cell and decoder initial state #####
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_output, encoder_state)

        # Decoder embedding matrix: shared with the encoder, loaded from a
        # pretrained matrix (via placeholder + assign), or freshly created.
        with tf.device(get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrain_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)
                    ),
                    trainable=True,
                    name='embeddings'
                )
                self.decoder_embeddings_placeholder = tf.placeholder(
                    dtype=tf.float32,
                    shape=(self.target_vocab_size, self.embedding_size),
                )
                # At session time, feed the placeholder and run this op to
                # load the pretrained embedding weights.
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                # No pretrained matrix: create one with self.initializer.
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )

        # Output projection: bias-free dense layer onto the vocabulary.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )

        # Every batch row starts decoding from the START token.
        start_tokens = tf.tile(
            [WordSequence.START],
            [self.batch_size]
        )
        end_token = WordSequence.END

        def embed_and_input_proj(inputs):
            """Embed predicted token ids for the next decode step."""
            return tf.nn.embedding_lookup(
                self.decoder_embeddings,
                inputs
            )

        if not self.use_beamsearch_decode:
            # Greedy decoding: argmax of each output step (treated as
            # logits) is embedded and fed back as the next input.
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=embed_and_input_proj
            )
            inference_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=decoding_helper,
                initial_state=self.decoder_initial_state,
                output_layer=self.decoder_output_projection
            )
        else:
            # BeamSearchDecoder expects an initial_state already tiled
            # beam_width times (done in build_decoder_cell).
            inference_decoder = BeamSearchDecoder(
                cell=self.decoder_cell,
                embedding=embed_and_input_proj,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.decoder_initial_state,
                beam_width=self.beam_width,
                output_layer=self.decoder_output_projection
            )

        if self.max_decode_step is not None:
            max_decoder_step = self.max_decode_step
        else:
            # Default decode budget: 4x the longest input length.
            max_decoder_step = tf.round(
                tf.reduce_max(self.encoder_inputs_length) * 4
            )

        ############################### decode ####################################
        (
            self.decoder_outputs_decode,
            self.final_state,
            final_sequence_lengths
        ) = (seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=self.time_major,
            impute_finished=False,
            # FIX: the decode limit was hard-coded to 100, silently
            # ignoring the max_decoder_step computed just above.
            maximum_iterations=max_decoder_step,
            swap_memory=True,
            scope=decoder_scope
        ))

        if not self.use_beamsearch_decode:
            # Greedy path: sample_id is already [batch, time].
            dod = self.decoder_outputs_decode
            self.decoder_pred_decode = dod.sample_id
            return self.decoder_pred_decode, final_sequence_lengths
        else:
            # Beam-search path: predicted_ids is [batch, time, beam].
            self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
            if self.time_major:
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, (1, 0, 2)
                )
            # Reorder to [batch, beam, time] for downstream consumers.
            self.decoder_pred_decode = tf.transpose(self.decoder_pred_decode,
                                                    (0, 2, 1))
            dod = self.decoder_outputs_decode
            self.beam_prob = dod.beam_search_decoder_output.scores
            return self.decoder_pred_decode, self.beam_prob, \
                final_sequence_lengths
def build_graph(self):
    """Build the full TensorFlow graph for the hierarchical dialogue model.

    Creates placeholders, a word-level bidirectional encoder over each
    history utterance, an emotion-sequence encoder, and an attention
    decoder. Depending on opts.mode, also builds either the training loss
    and optimizer ('TRAIN') or a beam-search predictor ('PREDICT').
    """
    print('Building the TensorFlow graph...')
    opts = self.options
    self.graph = tf.Graph()
    with self.graph.as_default():
        # --- Placeholders ---
        # enc_input: token ids per history utterance,
        # [max_hist_len, batch, max_uttr_len].
        self.enc_input = tf.placeholder(
            tf.int32,
            shape=[opts.max_hist_len, opts.batch_size, opts.max_uttr_len])
        # enc_input_e: per-utterance emotion features,
        # [batch, max_hist_len, n_emot].
        self.enc_input_e = tf.placeholder(
            tf.float32,
            shape=[opts.batch_size, opts.max_hist_len, opts.n_emot])
        # Decoder input and target are one step longer than an utterance
        # (room for the GO/EOS token).
        self.dec_input = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])
        self.target = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])
        # Per-utterance and per-response valid lengths.
        self.enc_input_len = tf.placeholder(
            tf.int32, shape=[opts.max_hist_len, opts.batch_size])
        self.dec_input_len = tf.placeholder(tf.int32,
                                            shape=[opts.batch_size])
        # Number of valid utterances in each dialogue history.
        self.hist_len = tf.placeholder(tf.int32, shape=[opts.batch_size])

        with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
            # word_embeddings = tf.Variable(tf.random_uniform([opts.vocab_size, opts.word_embed_size], -1.0, 1.0),
            #                               name = 'word_embeddings')
            # Initialized from pretrained vectors supplied via options.
            word_embeddings = tf.Variable(opts.word_embeddings,
                                          name='word_embeddings')
            enc_input_embed = tf.nn.embedding_lookup(
                word_embeddings, self.enc_input)
            dec_input_embed = tf.nn.embedding_lookup(
                word_embeddings, self.dec_input)

        with tf.variable_scope('word_level_encoding', reuse=tf.AUTO_REUSE):
            # One shared bi-GRU encodes each history utterance separately;
            # the fw/bw outputs are concatenated per step.
            outputs_enc = []
            cell_fw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
            cell_bw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
            for i in range(opts.max_hist_len):
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw,
                    cell_bw,
                    inputs=enc_input_embed[i, :, :, :],
                    sequence_length=self.enc_input_len[i, :],
                    dtype=tf.float32)
                outputs_enc.append(tf.concat(outputs, 2))
            # [max_hist_len, batch, max_uttr_len, 2*n_hidden_units_enc_s]
            outputs_enc = tf.stack(outputs_enc)

        with tf.variable_scope('emotion_encoding', reuse=tf.AUTO_REUSE):
            # Project raw emotion features, then run a GRU over the history
            # to summarize the emotional trajectory.
            emot_input_layer = tf.layers.Dense(
                opts.emot_input_layer_size,
                activation=tf.sigmoid,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.1),
                name='emot_input_layer')
            enc_input_e = emot_input_layer(self.enc_input_e)
            cell_emot = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_e)
            _, final_state = tf.nn.dynamic_rnn(
                cell_emot,
                inputs=enc_input_e,
                sequence_length=self.hist_len,
                dtype=tf.float32)
            # beta scales how strongly emotion influences decoding.
            emot_vector = final_state * opts.beta

        if opts.mode == 'PREDICT':
            # Beam search needs every memory tensor tiled beam_width times
            # along the batch axis. outputs_enc is transposed so batch is
            # leading for tile_batch, then transposed back.
            outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
            outputs_enc = tile_batch(outputs_enc,
                                     multiplier=opts.beam_width)
            outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
            tiled_enc_input_len = tile_batch(tf.transpose(
                self.enc_input_len), multiplier=opts.beam_width)
            tiled_enc_input_len = tf.transpose(tiled_enc_input_len)
            tiled_hist_len = tile_batch(self.hist_len,
                                        multiplier=opts.beam_width)
            tiled_emot_vector = tile_batch(emot_vector,
                                           multiplier=opts.beam_width)
        else:
            tiled_enc_input_len = self.enc_input_len
            tiled_hist_len = self.hist_len
            tiled_emot_vector = emot_vector

        with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
            # Project-local hierarchical (utterance + word level) attention.
            attn_mechanism = UttrLevelAttentionMechanism(
                word_level_num_units=opts.word_level_attn_depth,
                uttr_level_num_units=opts.uttr_level_attn_depth,
                n_hidden_units=opts.n_hidden_units_enc_s,
                memory=outputs_enc,
                memory_sequence_length=tiled_enc_input_len,
                hist_length=tiled_hist_len)
            cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
            # Custom wrapper injects the emotion vector into attention.
            cell_dec = MyAttentionWrapper(cell_dec, attn_mechanism,
                                          tiled_emot_vector)
            # NOTE(review): vocab_size - 1 output units -- presumably one id
            # (e.g. padding) is excluded from prediction; confirm against
            # the vocabulary layout.
            output_layer = tf.layers.Dense(
                units=opts.vocab_size - 1,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.1),
                name='output_layer')
            # Train
            if opts.mode == 'TRAIN':
                # Teacher forcing via plain dynamic_rnn over the embedded
                # decoder inputs.
                outputs_dec, _ = tf.nn.dynamic_rnn(
                    cell=cell_dec,
                    inputs=dec_input_embed,
                    sequence_length=self.dec_input_len,
                    initial_state=cell_dec.zero_state(
                        opts.batch_size, tf.float32),
                    dtype=tf.float32,
                    scope=vs)
                logits = output_layer.apply(outputs_dec)
                # Mask padded steps out of the loss.
                weights = tf.sequence_mask(self.dec_input_len,
                                           maxlen=opts.max_uttr_len + 1,
                                           dtype=tf.float32)
                self.loss = sequence_loss(logits, self.target, weights)
                # Per-example loss (no batch averaging), e.g. for reweighting.
                self.loss_batch = sequence_loss(logits,
                                                self.target,
                                                weights,
                                                average_across_batch=False)
                self.optimizer = tf.train.AdamOptimizer(
                    opts.learning_rate).minimize(self.loss)
                self.init = tf.global_variables_initializer()
            # Predict
            if opts.mode == 'PREDICT':
                start_tokens = tf.constant(opts.go_index,
                                           dtype=tf.int32,
                                           shape=[opts.batch_size])
                bs_decoder = BeamSearchDecoder(
                    cell=cell_dec,
                    embedding=word_embeddings,
                    start_tokens=start_tokens,
                    end_token=opts.eos_index,
                    initial_state=cell_dec.zero_state(
                        opts.batch_size * opts.beam_width, tf.float32),
                    beam_width=opts.beam_width,
                    output_layer=output_layer)
                final_outputs, final_state, _ = dynamic_decode(
                    bs_decoder,
                    impute_finished=False,
                    maximum_iterations=opts.max_uttr_len + 1,
                    scope=vs)
                self.predicted_ids = final_outputs.predicted_ids
                self.scores = final_outputs.beam_search_decoder_output.scores
                # Attention alignment histories kept for visualization.
                self.uttr_level_alignments = final_state[
                    0].alignment_history_ul.stack()
                self.word_level_alignments = final_state[
                    0].alignment_history_wl.stack()
                self.final_sequence_lengths = final_state[3]
        self.tvars = tf.trainable_variables()
        self.saver = tf.train.Saver(max_to_keep=100)
def build_model(self):
    """
    Build the PCGN (personality-conditioned generation) seq2seq graph.

    Creates placeholders, the encoder, optional user-profile encoders, the
    (optionally PCGN-wrapped) attention decoder, and then either the
    training loss/optimizer (self.mode == 'train') or a beam-search
    inference decoder (self.mode == 'infer').
    """
    print('building model... ...')
    with tf.variable_scope('seq2seq_placeholder'):
        self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="encoder_inputs")
        self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="decoder_inputs")
        self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                              name="decoder_targets")
        self.decoder_targets_masks = tf.placeholder(tf.bool, [None, None],
                                                    name="mask")
        self.encoder_length = tf.placeholder(tf.int32, [None],
                                             name="encoder_length")
        self.decoder_length = tf.placeholder(tf.int32, [None],
                                             name="decoder_length")
        # PCGN placeholders: dense user features and the user description
        # token sequence.
        self.user_feat = tf.placeholder(tf.float32, [None, self.feat_dim],
                                        name="user_feat")
        self.user_desc = tf.placeholder(tf.int32, [None, None],
                                        name="user_desc")
        self.desc_length = tf.placeholder(tf.int32, [None],
                                          name="user_desc_length")
        # Fixed decode-length bound (a constant rather than
        # reduce_max(decoder_length), so graph shape is static).
        self.max_target_sequence_length = tf.constant(
            value=self.target_max_length, name='max_target_len')

    with tf.variable_scope('seq2seq_embedding'):
        self.embedding = self.init_embedding(self.vocab_size,
                                             self.embedding_size)

    with tf.variable_scope('seq2seq_encoder'):
        encoder_outputs, encoder_states = build_encoder(
            self.embedding, self.encoder_inputs, self.encoder_length,
            self.encode_num_layers, self.encode_num_units,
            self.encode_cell_type, bidir=self.encode_bidir)

    if self.use_user_desc or self.use_user_feat:
        with tf.variable_scope('user_profile_encoder'):
            # Encoders for the user profile (dense features and/or the
            # textual description).
            desc_initializer = tf.contrib.layers.xavier_initializer()
            self.user_feat_mem_embedding = tf.layers.Dense(
                self.user_feat_mem_unit,
                use_bias=False,
                activation=tf.nn.relu,
                kernel_initializer=desc_initializer,
                name="user_feat_mem_layer")
            self.user_feats, self.user_embs, self.user_desc_encode = \
                self.build_user_embedding(
                    self.user_feat, self.user_desc, self.desc_length,
                    self.user_feat_unit, self.desc_rnn_unit,
                    self.embedding, self.use_user_desc,
                    self.use_user_feat)
            if self.use_external_desc_express:
                # Bilinear interaction between decoder states and the
                # description encoding; doubled width when co-attention
                # concatenates two attention contexts.
                dim2 = self.desc_rnn_unit
                dim1 = self.decode_num_units
                if self.use_blog_user_coattn:
                    dim1 = dim1 * 2
                self.blog_desc_inetract = tf.Variable(
                    desc_initializer(shape=(dim1, dim2)),
                    name="blog_desc_inetraction_layer",
                    dtype=tf.float32)
                if self.use_external_feat_express:
                    dim2 = dim2 + self.user_feat_unit
                self.user_map_layer = tf.Variable(
                    desc_initializer(shape=(dim2, self.user_map_unit)),
                    name="user_map_layer",
                    dtype=tf.float32)

    with tf.variable_scope('seq2seq_decoder'):
        encoder_length = self.encoder_length
        if self.use_user_desc or self.use_user_feat:
            user_feats = self.user_feats
            user_embs = self.user_embs
            if self.use_user_desc:
                desc_length = self.desc_length
                user_desc_encode = self.user_desc_encode

        if self.beam_search:
            # Beam search requires every tensor the decoder attends over to
            # be tiled beam_size times along the batch axis.
            print("use beamsearch decoding..")
            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            encoder_states = tile_batch(encoder_states,
                                        multiplier=self.beam_size)
            encoder_length = tile_batch(encoder_length,
                                        multiplier=self.beam_size)
            if self.use_user_desc or self.use_user_feat:
                user_feats = tile_batch(user_feats,
                                        multiplier=self.beam_size)
                user_embs = tile_batch(user_embs,
                                       multiplier=self.beam_size)
                if self.use_user_desc:
                    desc_length = tile_batch(desc_length,
                                             multiplier=self.beam_size)
                    user_desc_encode = tile_batch(
                        user_desc_encode, multiplier=self.beam_size)

        attention_mechanism = BahdanauAttention(
            num_units=self.attn_num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_length)
        if self.use_blog_user_coattn:
            # Second attention head over the user-description encoding.
            attention_mechanism_desc = BahdanauAttention(
                num_units=self.desc_attn_num_units,
                memory=user_desc_encode,
                memory_sequence_length=desc_length)

        decoder_cell = create_rnn_cell(self.decode_num_layers,
                                       self.decode_num_units,
                                       self.decode_cell_type)
        if self.use_blog_user_coattn:
            _attention_mechanism = (attention_mechanism,
                                    attention_mechanism_desc)
            _attention_layer_size = [
                self.decode_num_units, self.decode_num_units
            ]
        else:
            _attention_mechanism = attention_mechanism
            _attention_layer_size = self.decode_num_units

        if self.use_user_feat:
            # PCGN path: optional gated memory of user features.
            if self.use_gate_memory:
                _read_g = tf.layers.Dense(self.user_feat_mem_unit,
                                          use_bias=False,
                                          name="internal_read_gate")
                _write_g = tf.layers.Dense(self.user_feat_mem_unit,
                                           use_bias=False,
                                           name="internal_write_gate")
                if self.use_blog_user_coattn:
                    _read_atten_gate = tf.layers.Dense(
                        2 * self.desc_attn_num_units,
                        use_bias=False,
                        name="internal_read_attn_gate")
                else:
                    _read_atten_gate = None
            else:
                _read_g = None
                _write_g = None
                _read_atten_gate = None
            decoder_cell = PCGNWrapper(
                cell=decoder_cell,
                attention_mechanism=_attention_mechanism,
                user_feats=user_feats,
                user_embs=user_embs,
                user_feat_mem_units=self.user_feat_mem_unit,  # memory size
                user_feat_mem_embedding=self.user_feat_mem_embedding,
                read_gate=_read_g,
                write_gate=_write_g,
                use_gate_memory=self.use_gate_memory,
                attention_layer_size=_attention_layer_size,
                read_atten_gate=_read_atten_gate,
                name='PCGNWrapper')
        else:
            decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=_attention_mechanism,
                attention_layer_size=_attention_layer_size,
                name='Attention_Wrapper')

        # With beam search the effective batch is batch_size * beam_size.
        batch_size = self.batch_size if not self.beam_search \
            else self.batch_size * self.beam_size
        decoder_initial_state = decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_states)
        output_layer = tf.layers.Dense(self.vocab_size,
                                       use_bias=False,
                                       name='output_projection')

        # FIX: the `if self.mode == 'train':` guard had been commented out,
        # leaving the `elif self.mode == 'infer':` below dangling (a
        # SyntaxError) and building the training graph unconditionally.
        if self.mode == 'train':
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding, self.decoder_inputs)
            # TrainingHelper feeds the given decoder inputs at each step
            # (teacher forcing) instead of the previous prediction.
            training_helper = TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_length,
                name='training_helper')
            training_decoder = BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=decoder_initial_state)
            self.decoder_outputs, self.final_state, \
                self.final_sequence_length = dynamic_decode(
                    decoder=training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs.rnn_output)

            if self.use_external_desc_express:
                # Mix the user description (and optionally user features)
                # into the decoder states before projection.
                if self.use_external_feat_express:
                    _user_feats = user_embs
                else:
                    _user_feats = None
                self.decoder_logits_train = self.external_personality_express(
                    self.decoder_logits_train,
                    user_desc_encode,
                    self.blog_desc_inetract,
                    user_feats=_user_feats,
                    use_external_feat_express=self.
                    use_external_feat_express,
                    user_map=self.user_map_layer)

            with tf.variable_scope('decoder'):
                # Logits over the full vocabulary.
                self.generic_logits = output_layer(
                    self.decoder_logits_train)
            if self.use_gate_memory:
                # Final state of the user-feature memory (for regularization).
                self.feat_mem = self.final_state.user_feat_mem

            with tf.variable_scope('loss'):
                g_probs = tf.nn.softmax(self.generic_logits)
                train_log_probs = tf.log(g_probs)
                self.g_losses = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.generic_logits,
                        labels=self.decoder_targets)
                losses = tf.boolean_mask(self.g_losses,
                                         self.decoder_targets_masks)
                self.loss = tf.reduce_mean(losses)
                if self.use_gate_memory:
                    # Penalize residual memory norm; 1e-7 avoids a zero-norm
                    # gradient singularity.
                    self.int_mem_reg = tf.reduce_mean(
                        tf.norm(self.feat_mem + 1e-7, axis=1))
                    self.loss += self.int_mem_reg
                # Cross entropy for perplexity reporting. softmax is
                # shift-invariant, so using log-probs as logits here yields
                # the same distribution as the raw logits.
                CE = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=train_log_probs, labels=self.decoder_targets)
                CE = tf.boolean_mask(
                    CE, tf.cast(self.decoder_targets_masks, tf.bool))
                self.CE = tf.reduce_mean(CE)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            # Global-norm clipping to stabilize RNN training.
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clip_gradients, trainable_params))
        elif self.mode == 'infer':
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * SOS_ID
            end_token = EOS_ID
            if self.use_user_feat or self.use_user_desc:
                if self.use_external_desc_express:
                    _embed_desc = user_desc_encode
                    _blog_desc_inetract = self.blog_desc_inetract
                    _user_map = self.user_map_layer
                    if self.use_external_feat_express:
                        _feat_embed = user_embs
                    else:
                        _feat_embed = None
                else:
                    _embed_desc = None
                    _blog_desc_inetract = None
                    _user_map = None
                    _feat_embed = None
                inference_decoder = PCGNBeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_size,
                    output_layer=output_layer,
                    use_external_desc_express=self.
                    use_external_desc_express,
                    embed_desc=_embed_desc,
                    blog_desc_inetract=_blog_desc_inetract,
                    feat_embed=_feat_embed,
                    use_external_feat_express=self.
                    use_external_feat_express,
                    user_map=_user_map)
            else:
                inference_decoder = BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_size,
                    output_layer=output_layer)
            decoder_outputs, _, _ = dynamic_decode(
                decoder=inference_decoder,
                maximum_iterations=self.infer_max_iter)
            # predicted_ids: [batch, time, beam] -> expose as
            # [batch, beam, time].
            infer_outputs = decoder_outputs.predicted_ids
            self.infer_outputs = tf.transpose(infer_outputs, [0, 2, 1],
                                              name='infer_outputs')

    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=self.max_to_keep)
def build_graph(self):
    """Build the complete TensorFlow graph for the VAD-attention seq2seq model.

    Creates (inside a fresh ``tf.Graph``): input placeholders, the word
    embedding table, a GRU encoder, a ``MyBahdanauAttention``-wrapped GRU
    decoder, and one of three mode-specific heads selected by
    ``self.options.mode``:

    * ``'TRAIN'``        — teacher-forced decoder, masked sequence loss,
                           Adam optimizer (``self.loss``, ``self.optimizer``).
    * ``'PREDICT'``      — beam-search decoder (``self.predicted_ids``,
                           ``self.prob``).
    * ``'POST_PREDICT'`` — scores a given target sequence under the model
                           (``self.prob`` = masked negative cross-entropy sum).

    All tensors callers need are stored as attributes on ``self``; nothing is
    returned.
    """
    # build_graph-train vs validate-train
    print('Building the TensorFlow graph...')
    opts = self.options
    self.graph = tf.Graph()
    with self.graph.as_default():
        # --- placeholders: token ids and true (unpadded) lengths ---
        self.enc_input = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_enc])
        self.dec_input = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])
        self.target = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])
        self.enc_input_len = tf.placeholder(tf.int32, shape=[opts.batch_size])
        self.dec_input_len = tf.placeholder(tf.int32, shape=[opts.batch_size])
        # Per-vocabulary-id lookup tables, indexed by token id:
        # VAD is a 3-column table (presumably Valence/Arousal/Dominance —
        # confirm against the data pipeline), termfreq a per-word frequency,
        # VAD_loss a per-word loss weight used by the affective objective.
        self.VAD = tf.placeholder(tf.float32, shape=[opts.corpus_size, 3])
        self.termfreq = tf.placeholder(tf.float32, shape=[opts.corpus_size, 1])
        self.VAD_loss = tf.placeholder(tf.float32, shape=[opts.corpus_size, 1])

        with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
            # how to get input_embed for encoder and decoder
            word_embeddings = tf.Variable(
                tf.random_uniform([opts.corpus_size, opts.word_embed_size],
                                  -1.0, 1.0),
                name='embedding')
            # word_embeddings = tf.constant(opts.word_embeddings, name = 'word_embeddings')
            enc_input_embed = tf.nn.embedding_lookup(word_embeddings,
                                                     self.enc_input)
            dec_input_embed = tf.nn.embedding_lookup(word_embeddings,
                                                     self.dec_input)
            # Per-token lookups of the affect/frequency tables above.
            enc_input_VAD = tf.nn.embedding_lookup(self.VAD, self.enc_input)
            target_VAD = tf.nn.embedding_lookup(self.VAD, self.target)
            enc_input_tf = tf.nn.embedding_lookup(self.termfreq, self.enc_input)
            target_tf = tf.nn.embedding_lookup(self.termfreq, self.target)
            target_VAD_loss = tf.nn.embedding_lookup(self.VAD_loss, self.target)
            # [batch, T_dec, 1] -> [batch, T_dec]
            target_VAD_loss = tf.squeeze(target_VAD_loss)

        with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
            cell_enc = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc)
            # bi-directional?
            enc_outputs, _ = tf.nn.dynamic_rnn(
                cell_enc, enc_input_embed,
                sequence_length=self.enc_input_len, dtype=tf.float32)

        if opts.mode == 'PREDICT':
            # Beam search keeps beam_width hypotheses per example, so every
            # attention memory must be tiled along the batch dimension.
            enc_outputs = tile_batch(enc_outputs, multiplier=opts.beam_width)
            enc_input_embed = tile_batch(enc_input_embed,
                                         multiplier=opts.beam_width)
            enc_input_VAD = tile_batch(enc_input_VAD,
                                       multiplier=opts.beam_width)
            enc_input_tf = tile_batch(enc_input_tf,
                                      multiplier=opts.beam_width)
            tiled_enc_input_len = tile_batch(self.enc_input_len,
                                             multiplier=opts.beam_width)
        else:
            tiled_enc_input_len = self.enc_input_len

        # with tf.variable_scope('attention', reuse = tf.AUTO_REUSE) as attention_layer:
        #     attention_Wb = tf.layers.Dense(units=3,
        #                                    use_bias=False,
        #                                    kernel_initializer = tf.truncated_normal_initializer(stddev = 0.1),
        #                                    name='attention_Wb')

        with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
            # attn_mechanism: alpha_<t,t'>
            attn_mechanism = MyBahdanauAttention(
                num_units=opts.attn_depth,
                memory=enc_outputs,
                memory_sequence_length=tiled_enc_input_len,
                enc_input_embed=enc_input_embed,
                enc_input_VAD=enc_input_VAD,
                enc_input_tf=enc_input_tf,
                VAD_mode=opts.VAD_mode)
            cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
            # AttentionWrapper: c?
            cell_dec = AttentionWrapper(cell_dec, attn_mechanism,
                                        output_attention=False)
            output_layer = tf.layers.Dense(
                units=opts.corpus_size,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

            # Train
            if opts.mode == 'TRAIN':
                dec_initial_state = cell_dec.zero_state(opts.batch_size,
                                                        tf.float32)
                attention = compute_attention(
                    attn_mechanism, dec_initial_state.cell_state)  # (1,256)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                # Teacher forcing: run the attention cell over the gold
                # decoder inputs inside the 'decoding' scope (shared with
                # the PREDICT beam decoder via scope=vs below).
                outputs_dec, _ = tf.nn.dynamic_rnn(
                    cell=cell_dec,
                    inputs=dec_input_embed,
                    sequence_length=self.dec_input_len,
                    initial_state=dec_initial_state,
                    dtype=tf.float32,
                    scope=vs)
                # logits: `[batch_size, sequence_length, num_decoder_symbols]`
                # The logits correspond to the prediction across all classes
                # at each timestep.
                logits = output_layer.apply(outputs_dec)
                # batch size * max sentence length; binary; 0 for non-word in
                # orignal sentence; mask
                sequence_mask = tf.sequence_mask(self.dec_input_len,
                                                 maxlen=opts.max_uttr_len_dec,
                                                 dtype=tf.float32)
                if opts.VAD_mode:
                    # affective objective function: per-word VAD loss weights
                    weights = sequence_mask * target_VAD_loss
                else:
                    weights = sequence_mask
                # sequence_mask: [batch_size, max_len]
                # target: [batch_size, max_len]  VAD_loss: [batch_size, max_len]
                # softmax_loss_function(labels=targets, logits=logits_flat)
                # defaults to sparse_softmax_cross_entropy_with_logits
                self.loss = sequence_loss(logits, self.target, weights)
                self.loss_batch = sequence_loss(logits, self.target, weights,
                                                average_across_batch=False)
                self.optimizer = tf.train.AdamOptimizer(
                    opts.learning_rate).minimize(self.loss)
                self.init = tf.global_variables_initializer()

            # Predict
            if opts.mode == 'PREDICT':
                # Zero state sized for the tiled (batch * beam) memories.
                dec_initial_state = cell_dec.zero_state(
                    opts.batch_size * opts.beam_width, tf.float32)
                attention = compute_attention(attn_mechanism,
                                              dec_initial_state.cell_state)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                start_tokens = tf.constant(opts.go_index, dtype=tf.int32,
                                           shape=[opts.batch_size])
                bs_decoder = BeamSearchDecoder(
                    cell=cell_dec,
                    embedding=word_embeddings,
                    start_tokens=start_tokens,
                    end_token=opts.eos_index,
                    initial_state=dec_initial_state,
                    beam_width=opts.beam_width,
                    output_layer=output_layer)
                # scope=vs reuses the variables created by the TRAIN branch.
                final_outputs, final_state, _ = dynamic_decode(
                    bs_decoder,
                    impute_finished=False,
                    maximum_iterations=opts.max_uttr_len_dec,
                    scope=vs)
                self.predicted_ids = final_outputs.predicted_ids
                # self.scores = final_outputs.scores
                # 'FinalBeamSearchDecoderOutput' object has no attribute 'scores'
                self.prob = final_state.log_probs
                # log_probs: The log probabilities with shape
                #   `[batch_size, beam_width, vocab_size]`.
                # logits: Logits at the current time step.
                #   A tensor of shape `[batch_size, beam_width, vocab_size]`
                # step_log_probs = nn_ops.log_softmax(logits)
                # logsoftmax = logits - log(reduce_sum(exp(logits), axis))
                # step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
                # total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
                # final_outputs.scores  # [batch_size, length, beam_width]

            # Score an externally supplied target: same teacher-forced pass
            # as TRAIN, then sum the masked log-likelihood of `target`.
            if opts.mode == 'POST_PREDICT':
                dec_initial_state = cell_dec.zero_state(opts.batch_size,
                                                        tf.float32)
                attention = compute_attention(
                    attn_mechanism, dec_initial_state.cell_state)  # (1,256)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                outputs_dec, _ = tf.nn.dynamic_rnn(
                    cell=cell_dec,
                    inputs=dec_input_embed,
                    sequence_length=self.dec_input_len,
                    initial_state=dec_initial_state,
                    dtype=tf.float32,
                    scope=vs)
                logits = output_layer.apply(outputs_dec)
                sequence_mask = tf.sequence_mask(self.dec_input_len,
                                                 maxlen=opts.max_uttr_len_dec,
                                                 dtype=tf.float32)
                score = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.target, logits=logits)
                # Negated total masked cross-entropy == sequence log-prob.
                self.prob = -1 * tf.reduce_sum(score * sequence_mask)

        self.tvars = tf.trainable_variables()
        self.saver = tf.train.Saver(max_to_keep=100)
def build_train_graph(self, train_examples):
    """
    Building train graph with train examples.

    Runs two decoders over the shared encoder state: a teacher-forced
    training decoder and a beam-search decoder (built with ``reuse=True``
    so they share weights via the ``"train_decoder"`` variable scope).

    :param train_examples: Examples from train data — a 7-tuple of
        (subject, len_subject, content, len_content,
         target_input, target_output, len_target), where the target
        tensors carry multiple candidate answers in axis 1.
    :return: (target_output, final_outputs, final_seq_len,
              generator_params, decoder_cell, attn_zero_state,
              beam_outputs, beam_out_len) — predicted outputs, parameters
              of generator, decoder cell, attention zero state, and the
              top beam's token ids / lengths.
    """
    # Unpack subject, content and answers and corresponding length
    subject, len_subject, content, len_content, target_input, target_output, len_target = train_examples
    # Choose best answer per question (candidate 0 along axis 1)
    target_input = target_input[:, 0, :]
    target_output = target_output[:, 0, :]
    len_target = tf.to_int32(len_target[:, 0])
    # Look up word vectors for decoder input
    decoder_inputs_embedded = tf.nn.embedding_lookup(
        self.embeddings_english, target_input)
    # Concat subject and content to feed it into encoder
    sub_cont_concat_op = tf.map_fn(
        self.concat_seqs, [subject, len_subject, content, len_content])[0]
    len_both = len_subject + len_content
    # Load inital graph twice, one for train and another for output with
    # beam decoder (second call reuses the first call's variables).
    decoder_cell, attn_zero_state = self.build_initial_graph(
        sub_cont_concat_op, len_both)
    decoder_cell_beam, attn_zero_state_beam = self.build_initial_graph(
        sub_cont_concat_op, len_both, reuse=True,
        beam_width=self.beam_width)
    # Make train decoder (teacher forcing on the gold answer)
    helper = TrainingHelper(decoder_inputs_embedded, len_target,
                            time_major=False)
    decoder = BasicDecoder(decoder_cell, helper, attn_zero_state,
                           output_layer=self.projection_layer)
    # Make beam search decoder
    beam_search_decoder = BeamSearchDecoder(
        decoder_cell_beam, self.embeddings_english, self.start_token,
        self.end_token, attn_zero_state_beam, self.beam_width,
        output_layer=self.projection_layer)
    # Define variable scope train decoder to initialize the train decoder
    # and beam search decoder with dynamic decode
    with tf.variable_scope("train_decoder"):
        final_outputs, final_state, final_seq_len = dynamic_decode(
            decoder, output_time_major=False)
    with tf.variable_scope("train_decoder", reuse=True):
        beam_outputs, _, beam_out_len = dynamic_decode(
            beam_search_decoder, output_time_major=False,
            maximum_iterations=self.max_seq_len)
    # Output of train decoder: trim targets to the decoded length so the
    # loss shapes line up.
    final_outputs_max_len = tf.shape(final_outputs.sample_id)[1]
    target_output = target_output[:, :final_outputs_max_len]
    # Output of beam search decoder: predicted_ids is
    # [batch, time, beam_width]; move beams to axis 0 and keep only the
    # best (index 0) beam per example.
    beam_outputs = tf.transpose(beam_outputs.predicted_ids, [2, 0, 1])
    beam_outputs = tf.reshape(beam_outputs[0, :, :], [self.batch_size, -1])
    beam_out_len = tf.transpose(beam_out_len)
    beam_out_len = tf.reshape(beam_out_len[0, :], [-1])
    # Get generator parameters (everything not owned by the discriminator)
    generator_params = [
        param for param in tf.trainable_variables()
        if "discriminator" not in param.name
    ]
    return target_output, final_outputs, final_seq_len, generator_params, decoder_cell, attn_zero_state, beam_outputs, beam_out_len
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    In ``'train'`` mode: a teacher-forced ``BasicDecoder`` producing
    ``self.loss`` / ``self.loss_rewards`` / ``self.loss_add`` and
    ``self.decoder_pred_train``.  In ``'decode'`` mode: a greedy or
    beam-search decoder producing ``self.decoder_pred_decode`` (and
    ``self.beam_prob`` when beam search is used).  All results are stored
    on ``self``; nothing is returned.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Create the decoder cell and its initial state
        (self.decoder_cell, self.decoder_initial_state) \
            = self.build_decoder_cell(encoder_outputs, encoder_state)
        # Decoder embedding: pick CPU or GPU placement from the vocab size
        with tf.device(_get_embed_device(self.target_vocab_size)):
            # Share the encoder embedding if configured; otherwise either
            # load a pretrained table or initialize one for training.
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            # Pretrained embedding: a zero-initialized variable plus a
            # placeholder/assign pair to load the weights at session time.
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,  # allow fine-tuning
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        # Decoder output projection: one logit per target-vocabulary word
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train  # set when placeholders were built
            )
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            # Helper that feeds the gold inputs at each step (teacher
            # forcing), used only during training.
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # The output_layer is deliberately NOT applied here: applying
            # it per time step inside the decoder is slow; instead the
            # projection is done once over all steps below.  For this
            # trick to work, dynamic_decode must get the scope argument.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,  # init from the state built above
            )
            # Maximum number of decoder time steps in the current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            # Run the dynamic decode
            (outputs, self.final_state, _) \
                = seq2seq.dynamic_decode(  # dynamic decoder
                    decoder=training_decoder,
                    output_time_major=self.time_major,  # True: time-major output; False: batch-major
                    impute_finished=True,  # zero outputs past each sequence's finish
                    maximum_iterations=max_decoder_length,  # max steps (max words generated)
                    parallel_iterations=self.parallel_iterations,  # while_loop parallelism
                    swap_memory=True,  # on OOM, allow swapping tensors from GPU to host memory
                    scope=decoder_scope)
            # Project all time steps through the dense layer in one shot;
            # much faster than per-step projection (officially recommended).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output  # decoder outputs defined above
            )
            # masks: masking for valid and padded time steps;
            # tf.sequence_mask builds a boolean mask from lengths:
            """
            tf.sequence_mask([1,2], 4)
            --> [[ True False False False]
            [ True True False False]]
            """
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32, name='masks'
            )
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            # Training-time prediction: over the vocab-sized logits, the
            # argmax index is the predicted word id.
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')
            # The variables below support special training regimes
            # (custom rewards; effectively a modified mask / loss).
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,  # labels
                    logits=decoder_logits_train)  # predictions
            self.masks_rewards = self.masks * self.rewards
            # seq2seq sequence loss: weighted per-step cross entropy,
            # averaged; padded positions get weight 0 via the mask.
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,  # note: different weights from the loss below
                average_across_timesteps=True,  # divide by the total weight
                average_across_batch=True,  # divide by the batch size
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (not training).
            # Tile the start token across the batch; see
            # https://blog.csdn.net/tsyccnh/article/details/82459859
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input projection wrapper: map token ids to their
                embeddings in the decoder vocabulary.
                """
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            # Without beam search, define a greedy helper + decoder
            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,  # start token
                    end_token=end_token,  # end token
                    embedding=embed_and_input_proj  # id -> embedding mapping
                )
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # Beam search decoding
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # Usually the configured maximum is used
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the input length
                max_decode_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,  # outputs
                self.final_state,  # final state
                _  # self.decoder_outputs_length_decode
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,  # carries the chosen decoding strategy
                output_time_major=self.time_major,
                # impute_finished=True,   # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # Greedy path (beam search disabled):
            # dynamic_decode returns a namedtuple (rnn_outputs, sample_id):
            #   rnn_output: [batch_size, decoder_targets_length, vocab_size],
            #     per-step per-word probabilities, usable for a loss
            #   sample_id: [batch_size], tf.int32 — the decoded ids,
            #     i.e. the final answer
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id  # the final answer
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            # Beam search path; see
            # https://blog.csdn.net/liuchonge/article/details/79021938
            # With beam search, decoder_outputs_decode contains
            # (predicted_ids, beam_search_decoder_output):
            #   predicted_ids: [batch_size, decoder_targets_length, beam_size]
            #   beam_search_decoder_output: BeamSearchDecoderOutput
            #     namedtuple(scores, predicted_ids, parent_ids)
            # so returning predicted_ids (or sample_id) yields the result.
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # Reorder to [batch_size, beam_size, time]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    ``'train'`` mode builds a teacher-forced ``BasicDecoder`` and the
    masked sequence losses (``self.loss``, ``self.loss_rewards``,
    ``self.loss_add``); ``'decode'`` mode builds a greedy or beam-search
    inference decoder (``self.decoder_pred_decode``, ``self.beam_prob``).
    Results are stored as attributes on ``self``.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
             encoder_outputs, encoder_state)
        # Decoder embedding (device chosen from the vocabulary size)
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                # Zero-init variable plus a placeholder/assign pair so the
                # pretrained weights can be loaded at session time.
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                                                      trainable=True,
                                                      name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded:
            #   [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            # Helper to feed inputs for training:
            # read inputs from dense ground truth vectors
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')
            # The output_layer is NOT applied here on purpose: applying it
            # per time step is slow; the projection is done once over all
            # steps below.  For this trick to work, dynamic_decode must be
            # given the scope argument.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.decoder_output_projection
            )
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output:
            #     if output_time_major=False:
            #         [batch_size, max_time_step + 1, num_decoder_symbols]
            #     if output_time_major=True:
            #         [max_time_step + 1, batch_size, num_decoder_symbols]
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (
                outputs,
                self.final_state,  # contain attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            # More efficient to do the projection
            # on the batch-time-concatenated tensor
            # logits_train:
            #     [batch_size, max_time_step + 1, num_decoder_symbols]
            # Project all time steps through the output layer in one shot;
            # the official NMT repo reports a 10~20% speedup (observed
            # speedup here was even larger).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')
            # Computes per word average cross-entropy over a batch
            # Internally calls
            # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # The variables below support special training regimes
            # (custom rewards, implemented as a modified mask).
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (not training)
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input projection wrapper: token ids -> embeddings."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # For GreedyDecoder, return
            # decoder_outputs_decode: BasicDecoderOutput instance
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_decode.rnn_output:
            #     if output_time_major=False:
            #         [batch_size, max_time_step, num_decoder_symbols]
            #     if output_time_major=True
            #         [max_time_step, batch_size, num_decoder_symbols]
            # decoder_outputs_decode.sample_id:
            #     if output_time_major=False
            #         [batch_size, max_time_step], tf.int32
            #     if output_time_major=True
            #         [max_time_step, batch_size], tf.int32
            # For BeamSearchDecoder, return
            # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
            #     namedtuple(predicted_ids, beam_search_decoder_output)
            # decoder_outputs_decode.predicted_ids:
            #     if output_time_major=False:
            #         [batch_size, max_time_step, beam_width]
            #     if output_time_major=True
            #         [max_time_step, batch_size, beam_width]
            # decoder_outputs_decode.beam_search_decoder_output:
            #     BeamSearchDecoderOutput instance
            #     namedtuple(scores, predicted_ids, parent_ids)
            # The official docs suggest a cap on the decoded length;
            # changed here to * 4:
            # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
            # https://www.tensorflow.org/tutorials/seq2seq
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the input length
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,   # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id:
                #     [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(
                #     self.decoder_outputs_decode.rnn_output,
                #     axis=-1, name='decoder_pred_decode')
                # Here, we use expand_dims to be compatible with
                # the result of the beamsearch decoder
                # decoder_pred_decode:
                #     [batch_size, max_time_step, 1] (output_major=False)
                # self.decoder_pred_decode = tf.expand_dims(
                #     self.decoder_outputs_decode.sample_id,
                #     -1
                # )
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Use beam search to approximately
                # find the most likely translation
                # decoder_pred_decode:
                #     [batch_size, max_time_step, beam_width]
                #     (output_major=False)
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # Reorder to [batch_size, beam_width, max_time_step]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def buildModel(self):
    """Build the full seq2seq graph: multi-layer BiLSTM encoder, Luong
    attention decoder (teacher-forced for training, beam search for
    inference), masked cross-entropy loss, Adam with Noam LR schedule,
    gradient clipping, and TF summaries.

    Returns the tuple
    (x, y, x_len, y_len, logits, loss, prediction,
     beam_decoder_result_ids, global_step, train_op, summaries).
    Note: despite its name, the returned ``logits`` tensor holds softmax
    probabilities (see the loss section below).
    """
    T_in = self.args.T_in
    T_out = self.args.T_out
    D_in = self.args.D_in   # input (encoder) vocabulary size
    D_out = self.args.D_out  # output (decoder) vocabulary size
    E = self.args.embedding_dim
    H = self.args.hidden_dim
    SOS = self.args.SOS
    EOS = self.args.EOS
    PAD = self.args.PAD
    beam_width = 3
    # Input
    with tf.name_scope('input'):
        # N, T_in
        x = tf.placeholder(shape=(None, T_in),
                           dtype=tf.int32,
                           name='encoder_inputs')
        # N, T_out
        y = tf.placeholder(shape=(None, T_out),
                           dtype=tf.int32,
                           name='decoder_inputs')
        # N
        x_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
        # N
        y_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
        # dynamic sample num
        batch_size = tf.shape(x)[0]
        # symbol mask
        sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
        eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
        pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD
        # input mask
        x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32)
        y_with_sos_mask = tf.sequence_mask(y_len,
                                           T_out + 1,
                                           dtype=tf.float32)
        y_with_pad = tf.concat([y, pad], axis=1)
        # Place an EOS at position y_len of each padded target row.
        eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS
        # masked inputs: targets get EOS appended, decoder inputs get a
        # leading SOS (standard shift-by-one teacher forcing).
        y_with_eos = y_with_pad + eos_mask
        y_with_sos = tf.concat([sos, y], axis=1)
    ## Embedding
    with tf.name_scope('embedding'):
        if self.args.use_pretrained:
            # Pretrained table is frozen (trainable=False).
            embedding_pretrained = np.fromfile(self.args.pretrained_file,
                                               dtype=np.float32).reshape(
                                                   (-1, E))
            embedding = tf.Variable(embedding_pretrained, trainable=False)
        else:
            embedding = tf.get_variable(name='embedding',
                                        shape=(D_in, E),
                                        dtype=tf.float32,
                                        initializer=xavier_initializer())
        e_x = tf.nn.embedding_lookup(embedding, x)
        e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
        if self.args.mode == 'train':
            e_x = tf.nn.dropout(e_x, self.args.keep_prob)
    ## Encoder
    with tf.name_scope('encoder'):
        ## Multi-BiLSTM
        fw_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        bw_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        bi_encoder_output, bi_encoder_state = \
            tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                e_x,
                sequence_length=x_len,
                dtype=tf.float32,
                time_major=False,
                scope=None)
        # Sum forward and backward outputs (keeps width H).
        encoder_output = bi_encoder_output[0] + bi_encoder_output[1]
        # NOTE(review): only the forward final state is used to seed the
        # decoder; the backward state is discarded — confirm intentional.
        encoder_final_state = bi_encoder_state[0]
    ## Decoder
    with tf.name_scope('decoder'):
        decoder_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        # Every sample decodes for the full T_out + 1 steps.
        decoder_lengths = tf.ones(shape=[batch_size],
                                  dtype=tf.int32) * (T_out + 1)
        ## Trainning decoder
        with tf.variable_scope('attention'):
            attention_mechanism = LuongAttention(
                num_units=H,
                memory=encoder_output,
                memory_sequence_length=x_len,
                name='attention_fn')
        projection_layer = Dense(units=D_out,
                                 kernel_initializer=xavier_initializer())
        train_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=H)
        train_decoder_init_state = train_decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_final_state)
        training_helper = TrainingHelper(e_y,
                                         decoder_lengths,
                                         time_major=False)
        train_decoder = BasicDecoder(
            cell=train_decoder_cell,
            helper=training_helper,
            initial_state=train_decoder_init_state,
            output_layer=projection_layer)
        train_decoder_outputs, _, _ = dynamic_decode(
            train_decoder,
            impute_finished=True,
            maximum_iterations=T_out + 1)
        # N, T_out+1, D_out — layer-normalize the projected outputs.
        train_decoder_outputs = ln(train_decoder_outputs.rnn_output)
        ## Beam_search decoder
        # Tile encoder memories beam_width times along the batch axis.
        beam_memory = tile_batch(encoder_output, beam_width)
        beam_memory_state = tile_batch(encoder_final_state, beam_width)
        beam_memory_length = tile_batch(x_len, beam_width)
        # reuse=True shares the attention weights with the train decoder.
        with tf.variable_scope('attention', reuse=True):
            beam_attention_mechanism = LuongAttention(
                num_units=H,
                memory=beam_memory,
                memory_sequence_length=beam_memory_length,
                name='attention_fn')
        beam_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=beam_attention_mechanism,
            attention_layer_size=None)
        beam_decoder_init_state = beam_decoder_cell.zero_state(
            batch_size=batch_size * beam_width,
            dtype=tf.float32).clone(cell_state=beam_memory_state)
        start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
        beam_decoder = BeamSearchDecoder(
            cell=beam_decoder_cell,
            embedding=embedding,
            start_tokens=start_tokens,
            end_token=EOS,
            initial_state=beam_decoder_init_state,
            beam_width=beam_width,
            output_layer=projection_layer)
        beam_decoder_outputs, _, _ = dynamic_decode(
            beam_decoder,
            scope=tf.get_variable_scope(),
            maximum_iterations=T_out + 1)
        beam_decoder_result_ids = beam_decoder_outputs.predicted_ids
    with tf.name_scope('loss'):
        # NOTE: `logits` is a misnomer — this is a softmax over the
        # (layer-normalized) decoder outputs, i.e. probabilities.  That is
        # consistent with sparse_categorical_crossentropy below, whose
        # default from_logits=False expects probabilities.
        logits = tf.nn.softmax(train_decoder_outputs)
        cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
            y_with_eos, logits)
        # y_len + 1 accounts for the appended EOS token.
        loss_mask = tf.sequence_mask(y_len + 1, T_out + 1, dtype=tf.float32)
        # Averaged per example (not per token).
        loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
            batch_size, dtype=tf.float32)
        prediction = tf.argmax(logits, 2)
    ## train_op
    with tf.name_scope('train'):
        global_step = tf.train.get_or_create_global_step()
        # Noam schedule: warmup then inverse-sqrt decay.
        lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        ## gradient clips
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, self.args.gradient_clip_num)
        train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                 trainable_params),
                                             global_step=global_step)
    # Summary
    with tf.name_scope('summary'):
        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)
        summaries = tf.summary.merge_all()
    return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
def build_model(self):
    """Build the full seq2seq graph: placeholders, embedding, encoder,
    attention decoder (train or infer mode) and the saver.

    Reads hyper-parameters from ``self`` (vocab/embedding sizes, layer
    counts, cell types, ``self.mode``, ``self.beam_search`` etc.) and
    stores all graph handles back on ``self``.
    """
    print('building model... ...')
    with tf.variable_scope('seq2seq_placeholder'):
        # Token-id matrices are [batch, time]; lengths are [batch].
        self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="encoder_inputs")
        self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="decoder_inputs")
        self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                              name="decoder_targets")
        # Float mask weighting each target position in the loss.
        self.decoder_targets_masks = tf.placeholder(tf.float32, [None, None],
                                                    name="mask")
        self.encoder_length = tf.placeholder(tf.int32, [None],
                                             name="encoder_length")
        self.decoder_length = tf.placeholder(tf.int32, [None],
                                             name="decoder_length")
        # Longest target sequence in the batch; caps dynamic_decode below.
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')

    with tf.variable_scope('seq2seq_embedding'):
        # Shared embedding matrix for encoder and decoder inputs.
        self.embedding = self.init_embedding(self.vocab_size,
                                             self.embedding_size)

    with tf.variable_scope('seq2seq_encoder'):
        encoder_outputs, encoder_states = build_encoder(
            self.embedding, self.encoder_inputs, self.encoder_length,
            self.enc_num_layers, self.enc_num_units, self.enc_cell_type,
            bidir=self.enc_bidir)

    with tf.variable_scope('seq2seq_decoder'):
        encoder_length = self.encoder_length
        if self.beam_search:
            print("use beamsearch decoding..")
            # Tile encoder memory/state/lengths beam_size times so every
            # beam hypothesis attends over its own copy.
            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            encoder_states = tile_batch(encoder_states,
                                        multiplier=self.beam_size)
            encoder_length = tile_batch(encoder_length,
                                        multiplier=self.beam_size)

        attention_mechanism = BahdanauAttention(
            num_units=self.attn_num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_length)
        decoder_cell = create_rnn_cell(self.dec_num_layers,
                                       self.dec_num_units,
                                       self.dec_cell_type)
        decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.dec_num_units,
            name='Attention_Wrapper')

        # When beam search is on, the cell's zero_state must match the
        # tiled batch dimension.
        batch_size = self.batch_size if not self.beam_search \
            else self.batch_size * self.beam_size
        decoder_initial_state = decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_states)
        output_layer = tf.layers.Dense(self.vocab_size,
                                       use_bias=False,
                                       name='output_projection')

        if self.mode == 'train':
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding, self.decoder_inputs)
            # TrainingHelper implements teacher forcing: the decoder input
            # at each step is the given decoder_inputs, not the previous
            # step's own output.
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_length,
                name='training_helper')
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
            self.decoder_logits_train = decoder_outputs.rnn_output
            # Masked cross-entropy averaged per sequence_loss defaults.
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets,
                weights=self.decoder_targets_masks)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            # Global-norm clipping to keep RNN gradients bounded.
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clip_gradients, trainable_params))
        elif self.mode == 'infer':
            # batch_size here does not need to be tiled; BeamSearchDecoder
            # tiles the start tokens internally.
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * SOS_ID
            end_token = EOS_ID
            if self.beam_search:
                inference_decoder = BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_size,
                    output_layer=output_layer)
            else:
                decoding_helper = GreedyEmbeddingHelper(
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token)
                inference_decoder = BasicDecoder(
                    cell=decoder_cell,
                    helper=decoding_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)
            decoder_outputs, _, _ = dynamic_decode(
                decoder=inference_decoder,
                maximum_iterations=self.infer_max_iter)
            if self.beam_search:
                infer_outputs = decoder_outputs.predicted_ids  # [batch_size, decoder_targets_length, beam_size]
                self.infer_outputs = tf.transpose(
                    infer_outputs,
                    [0, 2, 1])  # [batch_size, beam_size, decoder_targets_length]
            else:
                self.infer_outputs = decoder_outputs.sample_id  # [batch_size, decoder_targets_length]

    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=self.max_to_keep)
def _test_beam_search(self,
                      decoder,
                      initial_state=None,
                      tiled_initial_state=None,
                      tf_initial_state=None,
                      beam_width_1=1,
                      initiated=False):
    """Check that ``beam_search_decode`` matches tf's BeamSearchDecoder.

    Runs both implementations with beam width ``beam_width_1`` and asserts
    identical ids/scores/state, then re-runs with ``self._beam_width`` in
    batch-major and time-major layouts to verify output shapes.
    """
    # Compare with tf built-in BeamSearchDecoder
    outputs, final_state, _ = beam_search_decode(
        decoder_or_cell=decoder,
        embedding=self._embedding,
        start_tokens=[1] * self._batch_size,
        end_token=2,
        beam_width=beam_width_1,
        max_decoding_length=20)
    self.assertIsInstance(outputs,
                          tf.contrib.seq2seq.FinalBeamSearchDecoderOutput)
    self.assertIsInstance(final_state,
                          tf.contrib.seq2seq.BeamSearchDecoderState)

    # A subsequent greedy call must reuse variables, not create new ones.
    num_trainable_variables = len(tf.trainable_variables())
    _ = decoder(decoding_strategy='infer_greedy',
                embedding=self._embedding,
                start_tokens=[1] * self._batch_size,
                end_token=2,
                max_decoding_length=20)
    self.assertEqual(num_trainable_variables,
                     len(tf.trainable_variables()))

    if tf_initial_state is None:
        # Beam decoding runs on a batch tiled beam_width_1 times.
        tf_initial_state = decoder.cell.zero_state(
            self._batch_size * beam_width_1, tf.float32)
    beam_decoder = BeamSearchDecoder(cell=decoder.cell,
                                     embedding=self._embedding,
                                     start_tokens=[1] * self._batch_size,
                                     end_token=2,
                                     initial_state=tf_initial_state,
                                     beam_width=beam_width_1,
                                     output_layer=decoder.output_layer)
    outputs_1, final_state_1, _ = dynamic_decode(decoder=beam_decoder,
                                                 maximum_iterations=20)

    ## Tests time major
    outputs_2, _, _ = beam_search_decode(
        decoder_or_cell=decoder,
        embedding=self._embedding,
        start_tokens=[1] * self._batch_size,
        end_token=2,
        beam_width=self._beam_width,
        initial_state=initial_state,
        tiled_initial_state=tiled_initial_state,
        max_decoding_length=21)
    outputs_3, _, _ = beam_search_decode(
        decoder_or_cell=decoder,
        embedding=self._embedding,
        start_tokens=[1] * self._batch_size,
        end_token=2,
        beam_width=self._beam_width,
        initial_state=initial_state,
        tiled_initial_state=tiled_initial_state,
        max_decoding_length=21,
        output_time_major=True)

    with self.test_session() as sess:
        if not initiated:
            sess.run(tf.global_variables_initializer())
        outputs_, final_state_, outputs_1_, final_state_1_ = sess.run(
            [outputs, final_state, outputs_1, final_state_1],
            feed_dict={
                context.global_mode(): tf.estimator.ModeKeys.PREDICT
            })
        # Element-wise equality between our decode and tf's reference.
        np.testing.assert_array_equal(outputs_.predicted_ids,
                                      outputs_1_.predicted_ids)
        np.testing.assert_array_equal(
            outputs_.beam_search_decoder_output.scores,
            outputs_1_.beam_search_decoder_output.scores)
        np.testing.assert_array_equal(
            outputs_.beam_search_decoder_output.predicted_ids,
            outputs_1_.beam_search_decoder_output.predicted_ids)
        np.testing.assert_array_equal(
            outputs_.beam_search_decoder_output.parent_ids,
            outputs_1_.beam_search_decoder_output.parent_ids)
        np.testing.assert_array_equal(final_state_.log_probs,
                                      final_state_1_.log_probs)
        np.testing.assert_array_equal(final_state_.lengths,
                                      final_state_1_.lengths)

        outputs_2_, outputs_3_ = sess.run([outputs_2, outputs_3],
                                          feed_dict={
                                              context.global_mode():
                                              tf.estimator.ModeKeys.PREDICT
                                          })
        # NOTE(review): the trailing 11 presumably equals self._beam_width
        # in this suite — confirm against the fixture setup.
        self.assertEqual(outputs_2_.predicted_ids.shape,
                         tuple([self._batch_size, 21, 11]))
        # Time-major output swaps the batch and time axes.
        self.assertEqual(outputs_3_.predicted_ids.shape,
                         tuple([21, self._batch_size, 11]))
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder half of the seq2seq graph.

    In ``'train'`` mode a TrainingHelper-driven decoder produces logits,
    per-step argmax predictions and the (reward-weighted and plain)
    sequence losses. In ``'decode'`` mode either a greedy decoder or a
    BeamSearchDecoder produces ``self.decoder_pred_decode`` (and, for beam
    search, ``self.beam_prob``).

    :param encoder_outputs: encoder outputs fed to the attention cell
    :param encoder_state: final encoder state used to seed the decoder
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_outputs, encoder_state)

        # Decoder embeddings: shared with the encoder, initialised from a
        # pretrained matrix via a feed, or trained from scratch.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(0.0,
                                shape=(self.target_vocab_size,
                                       self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # Projects cell outputs to vocabulary logits.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')

        if self.mode == 'train':
            self.decoder_inputs_embdedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embdedded
            if self.time_major:
                # [batch, time, dim] -> [time, batch, dim]
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')
            # No output_layer here: the projection is applied manually to
            # the rnn_output below.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state)
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (
                outputs,
                self.final_state,
                _
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)

            # Mask out positions past each sequence's true length.
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')

            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                # Losses below expect batch-major logits.
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1, name='decoder_pred_train')
            # NOTE(review): targets are decoder_inputs rather than a
            # shifted target sequence — confirm against how
            # decoder_inputs_train is constructed.
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            # Reward-weighted loss (policy-gradient style) next to the
            # plain maximum-likelihood loss.
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True)
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)
            # NOTE(review): assumes self.add_loss was initialised before
            # this method runs — confirm in __init__/build order.
            self.add_loss = self.loss + self.add_loss

        elif self.mode == 'decode':
            start_token = tf.tile([WordSequence.START], [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                # Embedding lookup used as the decoder's input function.
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                decoder_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_token,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoder_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_token,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection)

            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                # Heuristic cap: 4x the longest source sequence.
                # NOTE(review): tf.round requires a float tensor —
                # presumably encoder_inputs_length is float here; confirm.
                max_decoder_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length) * 4)

            # BUGFIX: dynamic_decode returns a 3-tuple
            # (outputs, final_state, final_sequence_lengths); the original
            # unpacked only two values, raising ValueError at build time.
            (
                self.decoder_outputs_decode,
                self.final_state,
                _
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            if not self.use_beamsearch_decode:
                # BUGFIX: the original read self.decoder_pred_decode before
                # ever assigning it and stored the result in
                # decoder_pred_train. Greedy predictions come from
                # BasicDecoderOutput.sample_id; transpose only when the
                # decode ran time-major.
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # [batch, time, beam] -> [batch, beam, time]
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                # BUGFIX: beam scores live on the raw
                # FinalBeamSearchDecoderOutput, not on the transposed id
                # tensor the original tried to read them from (and a
                # greedy BasicDecoderOutput has no such field at all, so
                # this now runs only in the beam-search branch).
                self.beam_prob = \
                    self.decoder_outputs_decode \
                        .beam_search_decoder_output.scores
def __init__(self,
             vocab_size,
             learning_rate,
             encoder_size,
             max_length,
             embedding_size,
             sos_token,
             eos_token,
             unk_token,
             beam_size=5):
    """Build a dual-encoder (query + reply) GRU seq2seq graph.

    Creates placeholders, a frozen word-embedding variable fed via
    ``init_embedding``, two GRU encoders whose final states are
    concatenated into the decoder's initial state, an unrolled training
    decoder with masked sequence loss, a greedy generation loop, and a
    beam-search generator.

    :param vocab_size: vocabulary size (rows of the embedding matrix)
    :param learning_rate: Adam learning rate
    :param encoder_size: GRU hidden size of each encoder
    :param max_length: fixed decoder unroll length
    :param embedding_size: embedding dimensionality
    :param sos_token: start-of-sequence token id
    :param eos_token: end-of-sequence token id
    :param unk_token: unknown-word token id
    :param beam_size: beam width for beam-search generation
    """
    self.vocab_size = vocab_size
    self.lr = learning_rate
    self.encoder_size = encoder_size
    self.max_length = max_length
    self.embedding_size = embedding_size
    self.SOS_token = sos_token
    self.EOS_token = eos_token
    self.UNK_token = unk_token
    self.beam_search_size = beam_size

    with tf.variable_scope('placeholder_and_embedding'):
        # Token ids are [batch, time]; lengths are [batch].
        self.query = tf.placeholder(shape=(None, None), dtype=tf.int64)
        self.query_length = tf.placeholder(shape=(None, ),
                                           dtype=tf.int64)
        self.reply = tf.placeholder(shape=(None, None), dtype=tf.int64)
        self.reply_length = tf.placeholder(shape=(None, ),
                                           dtype=tf.int64)
        # Decoder tensors are padded to the fixed max_length.
        self.decoder_inputs = tf.placeholder(shape=(None,
                                                    self.max_length),
                                             dtype=tf.int64)
        self.decoder_target = tf.placeholder(shape=(None,
                                                    self.max_length),
                                             dtype=tf.int64)
        self.decoder_length = tf.placeholder(shape=(None, ),
                                             dtype=tf.int64)
        self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
        # Embeddings are non-trainable and loaded at runtime by running
        # self.init_embedding with embedding_pl fed.
        self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                           shape=(self.vocab_size,
                                                  embedding_size),
                                           name='embedding_source_pl')
        word_embedding = tf.get_variable(name='word_embedding',
                                         shape=(self.vocab_size,
                                                embedding_size),
                                         dtype=tf.float32,
                                         trainable=False)
        self.init_embedding = word_embedding.assign(self.embedding_pl)

    with tf.variable_scope("query_encoder"):
        self.query_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        query_out, query_state = self.query_encoder(
            seq_index=self.query, seq_len=self.query_length)

    with tf.variable_scope("reply_encoder"):
        self.reply_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        reply_out, reply_state = self.reply_encoder(
            seq_index=self.reply, seq_len=self.reply_length)

    with tf.variable_scope("decoder"):
        # Decoder state size is 2x encoder size because the two encoder
        # states are concatenated below.
        self.decoder = deep_components.decoder(word_embedding,
                                               self.encoder_size * 2,
                                               self.vocab_size)

    with tf.variable_scope("seq2seq-train"):
        # train: teacher-forced unrolled decode over max_length steps.
        encoder_state = tf.concat([query_state, reply_state], axis=1)
        decoder_outputs = []
        decoder_state = encoder_state
        for i in range(0, self.max_length):
            word_indices = self.decoder_inputs[:, i]
            decoder_out, decoder_state = self.decoder(
                word_indices, decoder_state)
            decoder_outputs.append(decoder_out)
        # b * l * vocab_size_tar
        decoder_outputs = tf.concat(decoder_outputs,
                                    1)  #b*max_length*vocab_size_tar

    with tf.variable_scope("cost"):
        # cost: mask pads beyond each target's true length.
        decoder_target_mask = tf.sequence_mask(self.decoder_length,
                                               maxlen=self.max_length,
                                               dtype=tf.float32)
        self.cost = sequence_loss(decoder_outputs, self.decoder_target,
                                  decoder_target_mask)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.lr).minimize(self.cost)

    with tf.variable_scope("seq2seq-generate"):
        # generate: greedy decoding, feeding back the argmax token.
        self.generate_outputs = []
        decoder_state = encoder_state
        word_indices = self.decoder_inputs[:, 0]  # SOS
        for i in range(0, self.max_length):
            decoder_out, decoder_state = self.decoder(
                word_indices, decoder_state)
            # assumes decoder_out is [batch, 1, vocab] — TODO confirm
            # against deep_components.decoder.
            softmax_out = tf.nn.softmax(decoder_out[:, 0, :])
            # NOTE(review): tf.arg_max is deprecated in favor of
            # tf.argmax in later TF 1.x releases.
            word_indices = tf.cast(tf.arg_max(softmax_out, -1),
                                   dtype=tf.int64)  # b * 1
            self.generate_outputs.append(
                tf.expand_dims(word_indices, axis=1))
        self.generate_outputs = tf.concat(self.generate_outputs,
                                          1)  #b*max_len

    with tf.variable_scope("seq2seq_beam_search_generate"):
        # Tile the initial state beam_search_size times; start_tokens
        # stay at the untiled batch size (BeamSearchDecoder tiles them).
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            encoder_state, multiplier=self.beam_search_size)
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.SOS_token
        bs_decoder = BeamSearchDecoder(
            self.decoder.gru_cell,
            word_embedding,
            start_tokens=start_tokens,
            end_token=self.EOS_token,
            initial_state=tiled_encoder_final_state,
            beam_width=self.beam_search_size,
            output_layer=self.decoder.out_layer)
        self.bs_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=bs_decoder, maximum_iterations=self.max_length)