def Decoder_LSTM(inputs, sequence_length, attention_mechanism, is_training= False):
    '''
    Run the attention-wrapped stack of zoneout LSTM cells over `inputs`.

    The stack holds `hp.Decoder.LSTM.Nums` ZoneoutLSTMCell layers, wrapped in an
    AttentionWrapper that records the alignment history and does not emit the
    attention vector as its output.

    Original author's note: in inference, `inputs` and `sequence_length` are
    ignored by the helper. `inputs` is still read here to derive the batch size
    of the initial state.

    Returns the pair (final_outputs, final_state) produced by
    Decoder_Dynamic_Decode.
    '''
    lstm_Config = hp.Decoder.LSTM
    zoneout_Cells = [
        ZoneoutLSTMCell(
            num_units= hp.Decoder.LSTM.Cell_Size,
            is_training= is_training,
            cell_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate,
            output_zoneout_rate= hp.Decoder.LSTM.Zoneout_Rate
            )
        for _ in range(lstm_Config.Nums)
        ]
    stacked_Cell = tf.nn.rnn_cell.MultiRNNCell(zoneout_Cells)

    attention_Cell = AttentionWrapper(
        cell= stacked_Cell,
        attention_mechanism= attention_mechanism,
        attention_layer_size= None,
        alignment_history= True,
        cell_input_fn= None,
        output_attention= False,
        initial_cell_state= None,
        name= None,
        attention_layer= None
        )

    decoder_Helper = Decoder_Helper(
        inputs= inputs,  # Mel
        sequence_length= sequence_length,  # Mel_length
        time_major= False,
        is_training= is_training,
        name= None
        )

    # Batch size is taken from the (batch-major) inputs tensor.
    zero_State = attention_Cell.zero_state(tf.shape(inputs)[0], tf.float32)
    decoder = Decoder_Decoder(
        cell= attention_Cell,
        helper= decoder_Helper,
        initial_state= zero_State
        )

    decoded_Outputs, decoded_State, _ = Decoder_Dynamic_Decode(
        decoder= decoder,
        impute_finished= False  # True
        )
    return decoded_Outputs, decoded_State
def _build_train(self, config):
    """Create `self.logits` for the model variant named by `config.model_name`.

    "fasttext_flat" and "RCNN_flat" put a single linear layer on top of
    `self.first_attention` / `self.xx_final` respectively. Every other model
    name builds an attention decoder that is teacher-forced on `self.yy`.
    """
    # Attribute holding the features fed to the flat (non-sequential) classifier.
    flat_feature_attr = {
        "fasttext_flat": "first_attention",
        "RCNN_flat": "xx_final",
    }.get(config.model_name)

    if flat_feature_attr is not None:
        flat_logits = tf.contrib.layers.fully_connected(
            getattr(self, flat_feature_attr),
            config.n_classes,
            activation_fn=None)
        self.logits = flat_logits
        print("logits:", self.logits.get_shape())
        self.logits = tf.reshape(flat_logits, [-1, config.n_classes])
        return

    # Sequence decoder: the encoder summary is used as both fields of the LSTM state.
    initial_lstm_state = rnn.LSTMStateTuple(self.xx_final, self.xx_final)
    attention = BahdanauAttention(
        config.decode_size,
        memory=self.xx_context,
        memory_sequence_length=self.x_seq_length)
    attn_cell = AttentionWrapper(self.lstm, attention, output_attention=False)

    zero_state = attn_cell.zero_state(dtype=tf.float32,
                                      batch_size=config.batch_size)
    start_state = zero_state.clone(cell_state=initial_lstm_state,
                                   attention=self.first_attention)

    helper = TrainingHelper(self.yy, self.y_seq_length)
    decoder = BasicDecoder(attn_cell, helper, start_state,
                           output_layer=self.output_l)
    self.decoder_outputs_train, _, _ = dynamic_decode(decoder,
                                                      impute_finished=True)
    self.logits = self.decoder_outputs_train.rnn_output
    print("logits:", self.logits.get_shape())
def pointer_net(inputs, input_lengths, n_pointers, word_matrix, cell_type,
                n_layers, n_units, dropout_prob, is_training=True):
    """Pointer network.

    Encodes `inputs` with an RNN, then runs an attention decoder for at most
    `n_pointers` steps and returns the attention alignments of every step.

    Args:
        inputs (tensor): Inputs to pointer network (typically output of
            previous RNN); its static shape must provide batch size and
            sequence length.
        input_lengths (tensor): Actual non-padded lengths of each input sequence
        n_pointers (int): Number of pointers to generate
        word_matrix (tensor): Embedding matrix of word vectors
        cell_type (method): Cell type to use
        n_layers (int): Number of layers in RNN (same for encoder & decoder)
        n_units (int): Number of units in RNN cell (same for encoder & decoder)
        dropout_prob (float): Dropout probability
        is_training (bool): Whether the model is training or testing

    Returns:
        tensor: Stacked alignment history reshaped to
            [n_pointers, batch_size, seq_length].
    """
    static_shape = inputs.get_shape().as_list()
    batch_size, seq_length, _ = static_shape
    vocab_size = word_matrix.get_shape().as_list()[0]

    def _make_cell():
        # Output dropout is only active while training.
        if is_training:
            keep_prob = 1 - dropout_prob
        else:
            keep_prob = 1
        return DropoutWrapper(cell_type(n_units), output_keep_prob=keep_prob)

    def _make_stack():
        # A single layer is used as-is; several layers are stacked.
        if n_layers > 1:
            return MultiRNNCell([_make_cell() for _ in range(n_layers)])
        return _make_cell()

    # Encoder
    encoder_cell = _make_stack()
    encoded, _ = tf.nn.dynamic_rnn(encoder_cell, inputs, input_lengths,
                                   dtype=tf.float32)
    attention = BahdanauAttention(n_units, encoded,
                                  memory_sequence_length=input_lengths)

    # TODO: find permanent solution (InferenceHelper?)
    start_tokens = tf.constant(START_TOKEN, shape=[batch_size], dtype=tf.int32)
    helper = GreedyEmbeddingHelper(word_matrix, start_tokens, END_TOKEN)

    # Decoder: attention cell followed by a projection to the vocabulary.
    decoder_cell = _make_stack()
    attn_cell = AttentionWrapper(decoder_cell, attention,
                                 alignment_history=True)
    projected_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell,
                                                            vocab_size)
    initial_state = attn_cell.zero_state(batch_size, tf.float32)
    decoder = BasicDecoder(projected_cell, helper, initial_state)
    _, final_state, _ = dynamic_decode(decoder,
                                       maximum_iterations=n_pointers,
                                       impute_finished=True)

    alignments = final_state.alignment_history.stack()
    return tf.reshape(alignments, [n_pointers, batch_size, seq_length])
def add_decoder_cell(self, encoder_outputs, encoder_states, hidden_size,
                     cell_type, num_layers):
    """Build the attention decoder cell and its initial state.

    In 'decode' mode the encoder outputs, states and source lengths are tiled
    `self.beam_size` times and the initial state is sized for
    `beam_size * batch_size` rows.

    Returns:
        (decoder_cell, decoder_initial_state)
    """
    memory_lengths = self.source_len
    decoding = self.mode == 'decode'

    if decoding:
        def _tile(tensors):
            return tf.contrib.seq2seq.tile_batch(tensors,
                                                 multiplier=self.beam_size)

        encoder_outputs = _tile(encoder_outputs)
        encoder_states = _tile(encoder_states)
        memory_lengths = _tile(memory_lengths)

    # A bidirectional encoder doubles the width the decoder has to match.
    if self.bidirection:
        hidden_size_ = hidden_size * 2
    else:
        hidden_size_ = hidden_size

    layers = [self.one_cell(hidden_size_, cell_type) for _ in range(num_layers)]
    stacked_cell = MultiRNNCell(layers)

    self.attention = BahdanauAttention(self.hidden_size, encoder_outputs,
                                       memory_lengths)

    def cell_input_fn(inputs, attention):
        # Project [inputs; attention] to the cell width before feeding the cell.
        att_proj = tf.layers.Dense(hidden_size_,
                                   dtype=tf.float32,
                                   use_bias=False,
                                   name='att_proj')
        return att_proj(tf.concat([inputs, attention], axis=-1))

    decoder_cell = AttentionWrapper(cell=stacked_cell,
                                    attention_mechanism=self.attention,
                                    attention_layer_size=hidden_size,
                                    cell_input_fn=cell_input_fn,
                                    name='attentionwrapper')

    if self.mode == 'decode':
        d_size = self.beam_size * self.batch_size
    else:
        d_size = self.batch_size

    zero_state = decoder_cell.zero_state(batch_size=d_size, dtype=tf.float32)
    decoder_initial_state = zero_state.clone(cell_state=encoder_states)
    return decoder_cell, decoder_initial_state
def __graph__(self): # encoder encoder_outputs, encoder_state = self.encoder() # decoder with tf.variable_scope('decoder'): ##作用域,'/' encoder_inputs_length = self.encoder_inputs_length if self.beam_search: # 如果使用beam_search,则需要将encoder的输出进行tile_batch,其实就是复制beam_size份。 print("use beamsearch decoding..") encoder_outputs = tile_batch(encoder_outputs, multiplier=self.beam_size) encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size), encoder_state) encoder_inputs_length = tile_batch(encoder_inputs_length, multiplier=self.beam_size) # 定义要使用的attention机制。 attention_mechanism = BahdanauAttention(num_units=self.rnn_size, memory=encoder_outputs, memory_sequence_length=encoder_inputs_length) # 定义decoder阶段要是用的RNNCell,然后为其封装attention wrapper decoder_cell = self.create_rnn_cell() decoder_cell = AttentionWrapper(cell=decoder_cell, attention_mechanism=attention_mechanism, attention_layer_size=self.rnn_size, name='Attention_Wrapper') # 如果使用beam_seach则batch_size = self.batch_size * self.beam_size batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size # 定义decoder阶段的初始化状态,直接使用encoder阶段的最后一个隐层状态进行赋值 decoder_initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=encoder_state) output_layer = tf.layers.Dense(self.vocab_size, kernel_initializer=tf.truncated_normal_initializer( mean=0.0,9 stddev=0.1)) if self.mode == 'train':
def decoding_layer(decoding_embed_inp, embeddings, encoding_op, encoding_st,
                   v_size, fr_len, en_len, max_en_len, rnn_cell_size, word2int,
                   dropout_prob, batch_size, n_layers):
    """Build the attention decoder and return training and inference logits.

    Args:
        decoding_embed_inp: embedded decoder inputs for the training decoder.
        embeddings: embedding matrix handed to the inference decoder.
        encoding_op: encoder outputs, used as the attention memory.
        encoding_st: encoder states; element 0 initialises the decoder cell.
        v_size: size of the output (projection) layer.
        fr_len: memory (source) sequence lengths for the attention mechanism.
        en_len: target sequence lengths for the training decoder.
        max_en_len: maximum target length passed to both decoders.
        rnn_cell_size: RNN cell size; also used as the attention depth and the
            attention layer size.
        word2int: vocabulary mapping; must contain "TOKEN_GO" and "TOKEN_EOS".
        dropout_prob: passed to `get_rnn_cell` and used as the
            `input_keep_prob` of the DropoutWrapper.
        batch_size: batch size of the decoder's zero state.
        n_layers: number of decoder layer scopes to create (must be >= 1).

    Returns:
        (logits_tr, logits_inf): outputs of `training_decoding_layer` and
        `inference_decoding_layer`.

    Raises:
        ValueError: if `n_layers` is smaller than 1.
    """
    if n_layers < 1:
        # Without at least one iteration no decoder cell would exist below.
        raise ValueError('n_layers must be >= 1, got {}'.format(n_layers))

    # NOTE(review): every iteration rebinds `decoding_cell`, so only the cell
    # from the last iteration is wrapped with attention below. Kept as in the
    # original; stack the cells explicitly if a multi-layer decoder is intended.
    for layer_idx in range(n_layers):
        with tf.variable_scope('decs_rnn_layer_{}'.format(layer_idx)):
            gru = get_rnn_cell(rnn_cell_size, dropout_prob)
            decoding_cell = tf.contrib.rnn.DropoutWrapper(
                gru, input_keep_prob=dropout_prob)

    out_l = Dense(v_size,
                  kernel_initializer=tf.truncated_normal_initializer(
                      mean=0.0, stddev=0.1))

    attention = BahdanauAttention(rnn_cell_size,
                                  encoding_op,
                                  fr_len,
                                  normalize=False,
                                  name='BahdanauAttention')
    # Fix: the attention layer size used to be `rnn_len`, a name that is not
    # defined in this function; the cell size parameter is the intended value.
    decoding_cell = AttentionWrapper(decoding_cell, attention, rnn_cell_size)

    attention_zero_state = decoding_cell.zero_state(batch_size, tf.float32)
    attention_zero_state = attention_zero_state.clone(
        cell_state=encoding_st[0])

    with tf.variable_scope("decoding_layer"):
        logits_tr = training_decoding_layer(decoding_embed_inp, en_len,
                                            decoding_cell,
                                            attention_zero_state, out_l,
                                            v_size, max_en_len)
    # Same scope with reuse=True so inference shares the training weights.
    with tf.variable_scope("decoding_layer", reuse=True):
        logits_inf = inference_decoding_layer(embeddings,
                                              word2int["TOKEN_GO"],
                                              word2int["TOKEN_EOS"],
                                              decoding_cell,
                                              attention_zero_state, out_l,
                                              max_en_len, batch_size)
    return logits_tr, logits_inf
def model_fn(features, labels, mode, params):
    """Estimator model function for a question/passage-to-answer generator.

    Encodes the passage (with answer span markers) and the question using
    `biGRU`, then decodes with a multi-layer GRU that attends over both
    encodings. Returns an EstimatorSpec for PREDICT, TRAIN and EVAL modes.
    """
    mode_keys = tf.estimator.ModeKeys

    embedding_encoder = tf.get_variable(
        'embedding_encoder', shape=(params.vocab_size, params.emb_size))
    table = lookup_ops.index_to_string_table_from_file(params.word_vocab_file)

    question_emb = tf.nn.embedding_lookup(embedding_encoder,
                                          features['question_words'])
    passage_emb = tf.nn.embedding_lookup(embedding_encoder,
                                         features['passage_words'])
    question_words_length = features['question_length']
    passage_words_length = features['passage_length']

    # Repeat each answer-position feature 50 times along a new last axis.
    answer_start = features['answer_start']
    answer_end = features['answer_end']
    answer_start = tf.concat([tf.expand_dims(answer_start, -1)] * 50, -1)
    answer_end = tf.concat([tf.expand_dims(answer_end, -1)] * 50, -1)

    with tf.variable_scope('passage_encoding'):
        passage_inputs = tf.concat([passage_emb, answer_start, answer_end], -1)
        passage_enc, (_, passage_bw_state) = biGRU(passage_inputs,
                                                   passage_words_length,
                                                   params,
                                                   layers=params.layers)
    with tf.variable_scope('question_encoding'):
        question_enc, (_, question_bw_state) = biGRU(question_emb,
                                                     question_words_length,
                                                     params,
                                                     layers=params.layers)

    # One shared dense layer maps the per-layer backward states of both
    # encoders to the decoder's initial state for that layer.
    decoder_state_layer = Dense(params.units,
                                activation=tf.tanh,
                                use_bias=True,
                                name='decoder_state_init')
    decoder_init_state = tuple([
        decoder_state_layer(
            tf.concat([passage_bw_state[i], question_bw_state[i]], -1))
        for i in range(params.layers)
    ])

    question_att = BahdanauAttention(
        params.units, question_enc,
        memory_sequence_length=question_words_length)
    passage_att = BahdanauAttention(
        params.units, passage_enc,
        memory_sequence_length=passage_words_length)

    gru_stack = MultiRNNCell(
        [GRUCell(params.units) for _ in range(params.layers)])
    decoder_cell = AttentionWrapper(gru_stack, [question_att, passage_att],
                                    initial_cell_state=decoder_init_state)

    batch_size = params.batch_size

    # Teacher forcing while training, greedy decoding otherwise.
    if mode == mode_keys.TRAIN:
        answer_emb = tf.nn.embedding_lookup(embedding_encoder,
                                            features['answer_words'])
        helper = TrainingHelper(answer_emb, features['answer_length'])
    else:
        sos_ids = tf.fill([batch_size], params.tgt_sos_id)
        helper = GreedyEmbeddingHelper(embedding_encoder, sos_ids,
                                       params.tgt_eos_id)

    projection_layer = Dense(params.vocab_size, use_bias=False)
    decoder = SNetDecoder(decoder_cell,
                          helper,
                          decoder_cell.zero_state(batch_size, tf.float32),
                          output_layer=projection_layer,
                          params=params)
    outputs, _, outputs_length = dynamic_decode(
        decoder, maximum_iterations=params.answer_max_words)
    logits = outputs.rnn_output

    if mode == mode_keys.PREDICT:
        predictions = {
            'answer': table.lookup(tf.cast(outputs.sample_id, tf.int64))
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    # Trim the labels to the longest decoded sequence and mask padded steps.
    max_decoded = tf.reduce_max(outputs_length)
    labels = tf.stop_gradient(labels[:, :max_decoded])
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                              logits=logits)
    target_weights = tf.sequence_mask(outputs_length, dtype=logits.dtype)
    loss = tf.reduce_sum(crossent * target_weights) / params.batch_size

    if mode == mode_keys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1)
        global_step = tf.train.get_or_create_global_step()
        grads_and_vars = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads_and_vars)
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)
        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
        )

    if mode == mode_keys.EVAL:
        rouge_metric = rouge_l(outputs.sample_id, labels, outputs_length,
                               features['answer_length'], params, table)
        return EstimatorSpec(mode,
                             loss=loss,
                             eval_metric_ops={'rouge-l': rouge_metric})
def __init__(self, inputs, targets, src_vocab_size, src_max_length,
             tgt_vocab_size, tgt_max_length, emb_dim, num_units, batch_size,
             eos_token, is_train, share_embeddings=False,
             teacher_forcing=False):
    """Build a bidirectional-LSTM encoder / attention-LSTM decoder graph.

    Args:
        inputs: integer source token ids, batch-major (time_major=False).
        targets: integer target token ids, batch-major.
        src_vocab_size: number of rows of the encoder embedding table.
        src_max_length: not used by this constructor.
        tgt_vocab_size: size of the output projection and, unless embeddings
            are shared, number of rows of the decoder embedding table.
        tgt_max_length: maximum number of decoding steps; logits are padded
            to this length.
        emb_dim: embedding dimension.
        num_units: encoder LSTM size (the decoder LSTM uses 2 * num_units).
        batch_size: batch size used for start tokens, the decoder zero state
            and loss normalisation.
        eos_token: id of the end-of-sequence token.
        is_train: not used by this constructor.
        share_embeddings: reuse the encoder embedding table for the decoder;
            requires src_vocab_size == tgt_vocab_size.
        teacher_forcing: feed ground-truth targets (TrainingHelper) instead of
            greedy predictions (GreedyEmbeddingHelper).

    Sets:
        self.preds: sampled token ids of the decoder.
        self.logits: decoder logits padded to `tgt_max_length` steps.
        self.mle_loss: masked cross-entropy summed and divided by batch_size.
    """
    xavier = tf.contrib.layers.xavier_initializer
    # Token id 0 is used as the start-of-sequence symbol.
    start_tokens = tf.zeros([batch_size], dtype=tf.int32)

    # Position of the entry closest to `eos_token` in each row, i.e. the
    # index of the first EOS when one is present.
    input_lengths = tf.argmin(tf.abs(inputs - eos_token), axis=-1,
                              output_type=tf.int32)
    target_lengths = tf.argmin(tf.abs(targets - eos_token), axis=-1,
                               output_type=tf.int32)

    input_embedding_table = tf.get_variable("encoder_embedding",
                                            [src_vocab_size, emb_dim],
                                            initializer=xavier(),
                                            dtype=tf.float32)
    input_embedding = tf.nn.embedding_lookup(input_embedding_table, inputs)

    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units,
                                                state_is_tuple=False)
    # NOTE(review): input dropout is applied regardless of `is_train`; confirm
    # whether it should be disabled at inference time.
    encoder_cell = tf.nn.rnn_cell.DropoutWrapper(cell=encoder_cell,
                                                 input_keep_prob=0.8,
                                                 output_keep_prob=1.0)

    # With time_major=False each direction's output is batch-major:
    # [batch_size, max_time, num_units]. Because state_is_tuple=False each
    # direction's final state is a single concatenated tensor.
    (encoder_output, encoder_state) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=encoder_cell,
        cell_bw=encoder_cell,
        inputs=input_embedding,
        sequence_length=input_lengths,
        dtype=tf.float32,
        time_major=False)
    encoder_output = tf.concat(encoder_output, axis=2)
    encoder_state = tf.concat([encoder_state[0], encoder_state[1]], axis=1)

    if share_embeddings:
        assert src_vocab_size == tgt_vocab_size
        target_embedding_table = input_embedding_table
    else:
        # Fix: the decoder table is indexed by target ids, so it must have
        # tgt_vocab_size rows (it was created with src_vocab_size rows).
        target_embedding_table = tf.get_variable("decoder_embedding",
                                                 [tgt_vocab_size, emb_dim],
                                                 initializer=xavier(),
                                                 dtype=tf.float32)

    # Decoder inputs are the targets shifted right by the start token.
    prefixed_targets = tf.concat([tf.expand_dims(start_tokens, 1), targets],
                                 axis=1)
    target_embedding = tf.nn.embedding_lookup(target_embedding_table,
                                              prefixed_targets)

    if teacher_forcing:
        helper = TrainingHelper(target_embedding, target_lengths + 1,
                                time_major=False)
    else:
        helper = GreedyEmbeddingHelper(target_embedding_table, start_tokens,
                                       eos_token)

    # The decoder is twice as wide so that its state matches the concatenated
    # forward/backward encoder state.
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units * 2,
                                                state_is_tuple=False)
    projection_layer = tf.layers.Dense(tgt_vocab_size, use_bias=False)
    attention_mechanism = BahdanauAttention(
        num_units, encoder_output, memory_sequence_length=input_lengths)
    decoder_cell = AttentionWrapper(decoder_cell, attention_mechanism,
                                    attention_layer_size=num_units)

    # Start from the attention wrapper's zero state with the encoder state
    # plugged in as the cell state.
    encoder_state = decoder_cell.zero_state(
        batch_size, tf.float32).clone(cell_state=encoder_state)
    decoder = BasicDecoder(cell=decoder_cell,
                           helper=helper,
                           initial_state=encoder_state,
                           output_layer=projection_layer)
    decoder_outputs, _, _ = dynamic_decode(decoder,
                                           output_time_major=False,
                                           impute_finished=True,
                                           maximum_iterations=tgt_max_length)

    # Decoding may stop early; pad the time axis up to tgt_max_length so the
    # logits line up with `targets`.
    unpadded_logits = decoder_outputs.rnn_output
    missing_elems = tgt_max_length - tf.shape(unpadded_logits)[1]
    padding = [[0, 0], [0, missing_elems], [0, 0]]
    logits = tf.pad(unpadded_logits, padding, 'CONSTANT', constant_values=0.)

    weights = tf.sequence_mask(target_lengths + 1,  # the "+1" is to include EOS
                               maxlen=tgt_max_length,
                               dtype=tf.float32)
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                              logits=logits)
    mle_loss = (tf.reduce_sum(crossent * weights) / batch_size)

    self.preds = decoder_outputs.sample_id
    self.logits = logits
    self.mle_loss = mle_loss
def __init__(self, name, input_reprs, roll_direction=0, activate=True, is_translate=False, word_in=None, encoder_reprs=encoder.bi_reprs):
    # Build a decoder head under the scope "<name>/predictions".
    #
    # With is_translate False, a teacher-forced attention decoder is built and
    # self.logits, self.state and self.loss are set. With is_translate True,
    # a greedy or beam-search decoder (config.decode_mode) is built and
    # self.sample_ids and self.state are set; no loss is created.
    #
    # `roll_direction` and `activate` are not read in this method.
    # Names such as `config`, `inputs`, `words_tgt_in`, `words_tgt_out`,
    # `size_sr`, `size_tgt`, `n_classes`, `embeddings` and
    # `pretrained_embeddings_vi` come from the enclosing scope (not visible here).
    #
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm scope boundaries against the
    # original file.
    self.name = name
    with tf.variable_scope(name + '/predictions'):
        #decoder_state = tf.layers.dense(input_reprs, config.projection_size, name='encoder_to_decoder')
        # The input representation is used directly as the decoder cell state.
        decoder_state = input_reprs
        with tf.variable_scope('word_embeddings_vi'):
            word_embedding_matrix = tf.get_variable(
                'word_embedding_matrix_vi', initializer=pretrained_embeddings_vi)
            # Translation embeds the caller-supplied `word_in`; training
            # embeds the target-side input words.
            if is_translate:
                word_embeddings = tf.nn.embedding_lookup(
                    word_embedding_matrix, word_in)
            else:
                word_embeddings = tf.nn.embedding_lookup(
                    word_embedding_matrix, words_tgt_in)
            word_embeddings = tf.nn.dropout(
                word_embeddings, inputs.keep_prob)
            # Learned scalar scale on the embeddings, initialised to 1.0.
            word_embeddings *= tf.get_variable('emb_scale', initializer=1.0)
        decoder_lstm = model_helpers.lstm_cell(
            config.bidirectional_sizes[0], inputs.keep_prob, config.projection_size)
        decoder_output_layer = tf.layers.Dense(n_classes, name='predict')
        if not is_translate:
            # Training path: attention over the encoder representations with
            # teacher forcing.
            attention_mechanism = LuongAttention(
                num_units=config.attention_units, memory=encoder_reprs, memory_sequence_length=size_sr, scale=True)
            attention_cell = AttentionWrapper(
                decoder_lstm, attention_mechanism, attention_layer_size=config.attention_units)
            batch_size = tf.shape(words_tgt_in)[0]
            # NOTE(review): the zero state is sized batch_size * beam_width
            # although neither the memory nor `decoder_state` is tiled in this
            # branch - confirm this is intended (it matches only when
            # beam_width == 1).
            decoder_initial_state = attention_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size * config.beam_width)
            decoder_state = decoder_initial_state.clone(
                cell_state=decoder_state)
            helper = tf.contrib.seq2seq.TrainingHelper(
                word_embeddings, size_tgt)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                attention_cell, helper, decoder_state, decoder_output_layer)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder)  # swap_memory=True)
            self.logits = outputs.rnn_output
        else:
            if config.decode_mode == 'greedy':
                # Greedy decoding uses the bare LSTM (no attention) and two
                # start tokens, i.e. a fixed batch of two sequences.
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    word_embedding_matrix, [embeddings.START, embeddings.START], embeddings.END)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_lstm, helper, decoder_state, decoder_output_layer)
            elif config.decode_mode == 'beam':
                # Beam search: memory, state and source lengths are tiled
                # beam_width times before the attention cell is built.
                encoder_reprs = tf.contrib.seq2seq.tile_batch(
                    encoder_reprs, multiplier=config.beam_width)
                decoder_state = tf.contrib.seq2seq.tile_batch(
                    decoder_state, multiplier=config.beam_width)
                size_src = tf.contrib.seq2seq.tile_batch(
                    size_sr, multiplier=config.beam_width)
                attention_mechanism = LuongAttention(
                    num_units=config.attention_units, memory=encoder_reprs, memory_sequence_length=size_src, scale=True)
                attention_cell = AttentionWrapper(
                    decoder_lstm, attention_mechanism, attention_layer_size=config.attention_units)
                # Hard-coded batch of two, matching the two start tokens below.
                batch_size = 2
                decoder_initial_state = attention_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size * config.beam_width)
                decoder_state = decoder_initial_state.clone(
                    cell_state=decoder_state)
                #decoder_state = tf.contrib.seq2seq.tile_batch(
                #    decoder_state, multiplier=config.beam_width)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=attention_cell, embedding=word_embedding_matrix, start_tokens=[ embeddings.START, embeddings.START ], end_token=embeddings.END, initial_state=decoder_state, beam_width=config.beam_width, output_layer=decoder_output_layer)
            # NOTE(review): `decoder` is unbound here if decode_mode is
            # neither 'greedy' nor 'beam'.
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=config.max_translate_length)  #swap_memory=True)
            if config.decode_mode == 'greedy':
                self.sample_ids = outputs.sample_id
            elif config.decode_mode == 'beam':
                self.sample_ids = outputs.predicted_ids
        # The bare string below is an unused expression statement holding an
        # earlier dynamic_rnn-based implementation.
        ''' outputs, state = tf.nn.dynamic_rnn( model_helpers.lstm_cell(config.bidirectional_sizes[0], inputs.keep_prob, config.projection_size), word_embeddings, initial_state=decoder_state, dtype=tf.float32, sequence_length=size_tgt, scope='predictlstm' ) '''
        self.state = state
        #self.logits = tf.layers.dense(outputs, n_classes, name='predict')
        #self.logits = tf.layers.dense(outputs.rnn_output, n_classes, name='predict')
        # No loss is built when translating.
        if is_translate:
            return
        # Label smoothing: scale the targets by (1 - label_smoothing) and add
        # label_smoothing / n_classes to every entry.
        targets = words_tgt_out
        targets *= (1 - inputs.label_smoothing)
        targets += inputs.label_smoothing / n_classes
        self.loss = model_helpers.masked_ce_loss(
            self.logits, targets, inputs.mask)
def add_multilayer_rnn_op(self):
    """Stack bidirectional RNN layers and project them to class logits.

    Builds `self.num_layers` bidirectional layers of the cell type named by
    `self.rnn_unit`, optionally runs an attention decoder over the result
    (when `self.is_attention` is set), and stores the per-step class scores
    in `self.logits`. `self.output` holds the features after dropout.

    Raises:
        ValueError: if `self.rnn_unit` is not one of 'lstm', 'gru', 'rnn'.
    """
    # Factories for a single directional cell, keyed by rnn_unit.
    cell_factories = {
        'lstm': lambda: rnn.LSTMCell(self.hidden_dim, forget_bias=1.,
                                     state_is_tuple=True),
        'gru': lambda: rnn.GRUCell(self.hidden_dim),
        'rnn': lambda: rnn.BasicRNNCell(self.hidden_dim),
    }

    def _dynamic_batch_size():
        return tf.shape(self.input_feature_embeddings)[0]

    with tf.variable_scope("bi-lstm"):
        layer_inputs = self.input_feature_embeddings
        for _ in range(self.num_layers):
            with tf.variable_scope(None, default_name="bidirectional-rnn"):
                make_cell = cell_factories.get(self.rnn_unit)
                if make_cell is None:
                    raise ValueError('rnn_unit must in (lstm, gru, rnn)!')
                cell_fw = make_cell()
                cell_bw = make_cell()

                initial_state_fw = cell_fw.zero_state(_dynamic_batch_size(),
                                                      dtype=tf.float32)
                initial_state_bw = cell_bw.zero_state(_dynamic_batch_size(),
                                                      dtype=tf.float32)

                (directional_outputs, _) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, layer_inputs, self.sequence_lengths,
                    initial_state_fw, initial_state_bw, dtype=tf.float32)
                # Forward and backward outputs feed the next layer together.
                layer_inputs = tf.concat(directional_outputs, 2)
        self.output = tf.nn.dropout(layer_inputs, self.dropout_pl)

    if self.is_attention:
        with tf.variable_scope('attention'):
            embedding_dim = self.hidden_dim * 2
            attn_mech = BahdanauAttention(embedding_dim, layer_inputs,
                                          self.sequence_lengths)
            dec_cell = rnn.LSTMCell(self.hidden_dim, state_is_tuple=True)
            attn_cell = AttentionWrapper(dec_cell, attn_mech, embedding_dim)
            attn_zero = attn_cell.zero_state(_dynamic_batch_size(),
                                             dtype=tf.float32)
            helper = TrainingHelper(inputs=layer_inputs,
                                    sequence_length=self.sequence_lengths)
            decoder = BasicDecoder(cell=attn_cell,
                                   helper=helper,
                                   initial_state=attn_zero)
            decoded, _, _ = dynamic_decode(decoder)
            self.output = tf.nn.dropout(decoded.rnn_output, self.dropout_pl)

    with tf.variable_scope("proj"):
        feature_dim = 2 * self.hidden_dim
        W = tf.get_variable("W",
                            shape=[feature_dim, self.num_class],
                            dtype=tf.float32)
        b = tf.get_variable("b",
                            shape=[self.num_class],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer())

        # Flatten to 2-D for the matmul, then restore the time axis.
        dynamic_shape = tf.shape(self.output)
        flat_output = tf.reshape(self.output, [-1, feature_dim])
        pred = tf.matmul(flat_output, W) + b
        self.logits = tf.reshape(pred,
                                 [-1, dynamic_shape[1], self.num_class])
dtype='float', sequence_length=x_seq_length) xx_context = outputs # tf.concat(outputs, 2) # [None, DL, 2*hd] xx_final = output_states[0] # tf.concat(output_states, 1) # [None, 2*hd] x_mask = tf.cast(x_mask, "float") first_attention = tf.reduce_mean(xx_context, 1) # [None, 2*hd] # decode output_l = layers_core.Dense(n_classes, use_bias=True) encoder_state = rnn.LSTMStateTuple(xx_final, xx_final) attention_mechanism = BahdanauAttention(hidden_size, memory=xx_context, memory_sequence_length=x_seq_length) lstm = rnn.LayerNormBasicLSTMCell(hidden_size, dropout_keep_prob=keep_prob) cell = AttentionWrapper(lstm, attention_mechanism, output_attention=False) cell_state = cell.zero_state(dtype=tf.float32, batch_size=train_batch_size) cell_state = cell_state.clone(cell_state=encoder_state, attention=first_attention) train_helper = TrainingHelper(yy, y_seq_length) train_decoder = BasicDecoder(cell, train_helper, cell_state, output_layer=output_l) decoder_outputs_train, decoder_state_train, decoder_seq_train = dynamic_decode( train_decoder, impute_finished=True) # infer_decoder/beam_search tiled_inputs = tile_batch(xx_context, multiplier=beam_width) tiled_sequence_length = tile_batch(x_seq_length, multiplier=beam_width) tiled_first_attention = tile_batch(first_attention, multiplier=beam_width) attention_mechanism = BahdanauAttention(
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the attention-wrapped decoder cell and its initial state.

    Returns:
        (attention_cell, decoder_initial_state)
    """
    memory_lengths = self.encoder_inputs_length
    batch_size = self.batch_size

    # For a bidirectional encoder only the last `depth` states are kept.
    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    # The attention memory is built from batch-major encoder outputs.
    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    if self.use_beamsearch_decode:
        beam_width = self.beam_width
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=beam_width)
        memory_lengths = seq2seq.tile_batch(self.encoder_inputs_length,
                                            multiplier=beam_width)
        # With beam search the effective batch is batch_size * beam_width.
        batch_size *= self.beam_width

    # Luong attention on request; Bahdanau attention for any other value.
    if self.attention_type.lower() == 'luong':
        attention_cls = LuongAttention
    else:
        attention_cls = BahdanauAttention
    self.attention_mechanism = attention_cls(
        num_units=self.hidden_units,
        memory=encoder_outputs,
        memory_sequence_length=memory_lengths)

    layers_list = [
        self.build_signle_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ]
    stacked_cell = MultiRNNCell(layers_list)

    # Keep the alignment history only when predicting without beam search.
    alignment_history = not (self.mode == 'train'
                             or self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Combine the decoder input with the attention vector.

        The concatenation is projected to `hidden_units` when residual
        connections are enabled; otherwise it is returned unchanged.
        """
        combined = array_ops.concat([inputs, attention], -1)
        if not self.use_residual:
            return combined
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(combined)

    attention_cell = AttentionWrapper(
        cell=stacked_cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,
        name='AttentionWrapper')

    # Start from an all-zero state, then plug in the encoder's final state as
    # the cell state.
    zero_state = attention_cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = zero_state.clone(cell_state=encoder_state)
    return attention_cell, decoder_initial_state
h=encoder_final_state_h) #Shape: (batch_size, time_step, hidden_units) encoder_outputs = tf.transpose( tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1, 0, 2]) decoder_cell = LSTMCell(hidden_units * 2) attention_mechanism = BahdanauAttention(attention_units, encoder_outputs) attention_cell = AttentionWrapper(decoder_cell, attention_mechanism) copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids, vocab_size, gen_vocab_size) decoder_initial_state = copynet_cell.zero_state( batch_size, tf.float32).clone(cell_state=attention_cell.zero_state( batch_size=batch_size, dtype=tf.float32)) helper = tf.contrib.seq2seq.TrainingHelper(targets_embedded, targets_lengths, time_major=True) #helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, tf.ones([batch_size], dtype=tf.int32), 0) decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell, helper, decoder_initial_state, output_layer=None) decoder_outputs, final_state, coder_seq_length = tf.contrib.seq2seq.dynamic_decode( decoder=decoder) decoder_logits, decoder_ids = decoder_outputs #labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
def __graph__(self):
    """Assemble the encoder / attention-decoder graph.

    In 'train' mode this sets self.decoder_outputs, self.loss, self.summary_op
    and self.train_op; in 'decode' mode it sets self.decoder_predict_decode.
    """
    # Encoder
    encoder_outputs, encoder_state = self.encoder()

    # Decoder
    with tf.variable_scope('decoder'):
        memory_lengths = self.encoder_inputs_length

        if self.beam_search:
            # Beam search needs every encoder tensor replicated beam_size times.
            print("use beamsearch decoding..")

            def _tile_state(s):
                return tf.contrib.seq2seq.tile_batch(s, self.beam_size)

            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            # The state is a nested structure, so it is tiled element by element.
            encoder_state = nest.map_structure(_tile_state, encoder_state)
            memory_lengths = tile_batch(memory_lengths,
                                        multiplier=self.beam_size)

        # Bahdanau attention over the encoder outputs; the sequence lengths
        # mask out padded positions of the memory.
        attention_mechanism = BahdanauAttention(
            num_units=self.rnn_size,
            memory=encoder_outputs,
            memory_sequence_length=memory_lengths)

        # Decoder RNN cell wrapped with the attention mechanism.
        rnn_cell = self.create_rnn_cell()
        decoder_cell = AttentionWrapper(
            cell=rnn_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.rnn_size,
            name='Attention_Wrapper')

        # With beam search the effective batch is batch_size * beam_size.
        if not self.beam_search:
            batch_size = self.batch_size
        else:
            batch_size = self.batch_size * self.beam_size

        # Zero-initialise the wrapper state, then substitute the encoder's
        # final state as the decoder cell state.
        zero_state = decoder_cell.zero_state(batch_size=batch_size,
                                             dtype=tf.float32)
        decoder_initial_state = zero_state.clone(cell_state=encoder_state)

        # Output projection to the vocabulary, initialised from a truncated
        # normal distribution (mean 0.0, standard deviation 0.1).
        projection_init = tf.truncated_normal_initializer(mean=0.0,
                                                          stddev=0.1)
        output_layer = tf.layers.Dense(self.vocab_size,
                                       kernel_initializer=projection_init)

        if self.mode == 'train':
            self.decoder_outputs = self.decoder_train(decoder_cell,
                                                      decoder_initial_state,
                                                      output_layer)

            # Sequence loss: logits are the predictions, targets the ground
            # truth, and the mask supplies the per-position weights.
            self.loss = sequence_loss(logits=self.decoder_outputs,
                                      targets=self.decoder_targets,
                                      weights=self.mask)

            # Expose the loss as a scalar summary and merge all summaries
            # into a single op.
            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()

            # Adam with global-norm gradient clipping over all trainable
            # variables.
            optimizer = tf.train.AdamOptimizer(self.learing_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            grads_and_vars = zip(clip_gradients, trainable_params)
            self.train_op = optimizer.apply_gradients(grads_and_vars)
        elif self.mode == 'decode':
            self.decoder_predict_decode = self.decoder_decode(
                decoder_cell, decoder_initial_state, output_layer)
def __init__(self, vocab_size, learning_rate, encoder_size, max_length, embedding_size, sos_token, eos_token, unk_token, beam_size=5):
    # Build the whole graph: placeholders, a query encoder and a reply
    # encoder, an attention GRU decoder, and three decoding heads
    # (teacher-forced training, beam search, greedy).
    #
    # vocab_size / embedding_size: shape of the word embedding table.
    # learning_rate: learning rate of the Adam optimizer.
    # encoder_size: size passed to both GRU encoders; the decoder GRU uses
    #   twice this size.
    # max_length: maximum number of steps for beam-search and greedy decoding.
    # sos_token / eos_token: start / end token ids used when generating.
    # unk_token: stored on the instance only.
    # beam_size: beam width, also used to tile the training batch.
    #
    # NOTE(review): the nesting of the variable scopes below was reconstructed
    # from a whitespace-collapsed source; confirm against the original file.
    self.vocab_size = vocab_size
    self.lr = learning_rate
    self.encoder_size = encoder_size
    self.max_length = max_length
    self.embedding_size = embedding_size
    self.SOS_token = sos_token
    self.EOS_token = eos_token
    self.UNK_token = unk_token
    self.beam_search_size = beam_size
    with tf.variable_scope('placeholder_and_embedding'):
        # Token id matrices and their per-row lengths.
        self.query = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.reply = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.decoder_target = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.decoder_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        # The batch size is fed explicitly as a scalar.
        self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
        # Running `self.init_embedding` copies the fed matrix into the
        # trainable embedding variable.
        self.embedding_pl = tf.placeholder(dtype=tf.float32, shape=(self.vocab_size, embedding_size), name='embedding_source_pl')
        word_embedding = tf.get_variable(name='word_embedding', shape=(self.vocab_size, embedding_size), dtype=tf.float32, trainable=True)
        self.init_embedding = word_embedding.assign(self.embedding_pl)
        # Loss mask derived from the decoder lengths.
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')
        self.mask = tf.sequence_mask(self.decoder_length, self.max_target_sequence_length, dtype=tf.float32, name='masks')
    with tf.variable_scope("query_encoder"):
        self.query_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        query_out, query_state = self.query_encoder(
            seq_index=self.query, seq_len=self.query_length)
    with tf.variable_scope("reply_encoder"):
        self.reply_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        # `reply_out` is not used below; only the final reply state is.
        reply_out, reply_state = self.reply_encoder(
            seq_index=self.reply, seq_len=self.reply_length)
    with tf.variable_scope("decoder"):
        # The decoder starts from the concatenated query and reply states and
        # attends over the query encoder outputs only. Everything is tiled
        # beam_search_size times.
        combined_encoder_state = tf.concat([query_state, reply_state], axis=1)
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            combined_encoder_state, multiplier=self.beam_search_size)
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
            query_out, multiplier=self.beam_search_size)
        tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
            self.query_length, multiplier=self.beam_search_size)
        decoder_gru = GRUCell(self.encoder_size * 2)
        attention_mechanism = BahdanauAttention(
            num_units=self.encoder_size, memory=tiled_encoder_outputs, memory_sequence_length=tiled_sequence_length)
        attention_cell = AttentionWrapper(
            decoder_gru, attention_mechanism, attention_layer_size=self.encoder_size)
        # Initial state for batch_size * beam_search_size rows.
        decoder_initial_state_beam = attention_cell.zero_state(
            dtype=tf.float32, batch_size=tf.cast(self.batch_size * self.beam_search_size, dtype=tf.int32)).clone(
                cell_state=tiled_encoder_final_state)
        #############################
        #attention_cell=decoder_gru
        #decoder_initial_state_beam = tiled_encoder_final_state
        ##############################
        # One output projection shared by all three decoding heads.
        decode_out_layer = tf.layers.Dense(self.vocab_size, name='output_layer', _reuse=tf.AUTO_REUSE)
    with tf.variable_scope("seq2seq-train"):
        # train
        # The training batch is tiled as well so that it matches the tiled
        # initial state built above.
        self.tiled_d_in = tile_batch(self.decoder_inputs, multiplier=self.beam_search_size)
        self.tiled_d_tgt = tile_batch(self.decoder_target, multiplier=self.beam_search_size)
        train_helper = TrainingHelper(
            tf.contrib.seq2seq.tile_batch(
                tf.nn.embedding_lookup(word_embedding, self.decoder_inputs), multiplier=self.beam_search_size),
            sequence_length=tile_batch(self.decoder_length, multiplier=self.beam_search_size),
            name="train_helper")
        train_decoder = BasicDecoder(
            attention_cell, train_helper, initial_state=decoder_initial_state_beam, output_layer=decode_out_layer)
        self.dec_output, _, self.gen_len = dynamic_decode(
            train_decoder, impute_finished=True, maximum_iterations=self.max_target_sequence_length)
        #self.gen_max_len=tf.reduce_max(self.gen_len)
        #self.padding=tf.zeros(shape=(self.batch_size,self.max_length-self.gen_max_len,self.vocab_size),dtype=tf.float32)
        #self.padding=tile_batch(self.padding,multiplier=self.beam_search_size)
        self.dec_logits = tf.identity(self.dec_output.rnn_output)
        #self.dec_logits = tf.concat((self.dec_logits,self.padding),axis=1)
        self.decoder_target_mask = tile_batch(
            self.mask, multiplier=self.beam_search_size)
        # Masked sequence loss over the tiled targets, minimised with Adam.
        self.cost = sequence_loss(
            self.dec_logits, tile_batch(self.decoder_target, multiplier=self.beam_search_size), self.decoder_target_mask)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.lr).minimize(self.cost)
    with tf.variable_scope("seq2seq_beam_search_generate"):
        # One start token per (untiled) batch row.
        start_tokens = tf.ones([ self.batch_size, ], tf.int32) * self.SOS_token
        beam_infer_decoder = BeamSearchDecoder(
            attention_cell, embedding=word_embedding, end_token=self.EOS_token, start_tokens=start_tokens, initial_state=decoder_initial_state_beam, beam_width=self.beam_search_size, output_layer=decode_out_layer)
        self.bs_outputs, _, _ = dynamic_decode(
            beam_infer_decoder, maximum_iterations=self.max_length)
    with tf.variable_scope("greedy_generate"):
        # NOTE(review): `start_tokens` has batch_size entries while
        # `decoder_initial_state_beam` is built for
        # batch_size * beam_search_size rows - confirm the greedy decoder is
        # only used with beam_size == 1 or that this mismatch is handled.
        decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=word_embedding, start_tokens=start_tokens, end_token=self.EOS_token)
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=attention_cell, helper=decoding_helper, initial_state=decoder_initial_state_beam, output_layer=decode_out_layer)
        self.greedy_outputs, _, _ = dynamic_decode(
            inference_decoder, maximum_iterations=self.max_length)
def __init__(self, n_session, pretrainedEmbeddings=None):
    """Build the CopyNet encoder-decoder training graph and start a session.

    Resets the default graph, creates placeholders, a bidirectional LSTM
    encoder, an attention + CopyNet decoder, the loss, a gradient-clipped
    Adagrad train op and a saver, then initialises all variables.

    Args:
        n_session: identifier of this run; used as the name of the
            checkpoint sub-directory under ``ckpt/``.
        pretrainedEmbeddings: optional embedding matrix (list or array) used
            to initialise the embedding variable. ``None`` or an empty
            sequence selects a random uniform initialisation of shape
            ``[VOCAB_SIZE, EMBED_SIZE]``.
    """
    tf.reset_default_graph()

    self.n_sess = n_session
    self.sess = tf.Session()  # config=CONFIG_TF)
    self.learning_rate = tf.placeholder(tf.float32)

    hidden_units = config['HIDDEN_UNITS']
    attention_units = config['ATTENTION_UNITS']
    vocab_size = config['VOCAB_SIZE']
    gen_vocab_size = config['GEN_VOCAB_SIZE']
    embed_size = config['EMBED_SIZE']

    # Feeds are batch-major; they are transposed to time-major below.
    self.paragraphs = tf.placeholder(shape=(None, None), dtype=tf.float32, name='paragraphs')
    self.ans_locs = tf.placeholder(shape=(None, None), dtype=tf.float32, name='ans_locs')
    self.encoder_inputs_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_lengths')
    self.targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='targets')
    self.targets_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='targets_lengths')

    paragraphs = self.paragraphs
    ans_locs = self.ans_locs
    encoder_inputs_lengths = self.encoder_inputs_lengths
    targets = self.targets
    targets_lengths = self.targets_lengths

    # Paragraphs arrive as floats; the CopyNet wrapper and the embedding
    # lookup need integer ids.
    input_ids = tf.cast(paragraphs, tf.int32)
    batch_size, max_time = tf.unstack(tf.shape(paragraphs))

    # Load pretrained embeddings if any. `is not None` + len() works for
    # both lists and NumPy arrays, unlike comparing against [].
    if pretrainedEmbeddings is not None and len(pretrainedEmbeddings) > 0:
        embeddings = tf.Variable(pretrainedEmbeddings, dtype=tf.float32)
    else:
        embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -0.01, 0.01), dtype=tf.float32)

    paragraphs_embedded = tf.nn.embedding_lookup(embeddings, tf.transpose(tf.cast(paragraphs, tf.int32), [1,0]))

    # Decoder inputs are the targets shifted right by a start token (id 1).
    start_tokens = tf.ones([batch_size], dtype=tf.int32)
    decoder_inputs = tf.concat([tf.expand_dims(start_tokens, 1), targets], 1)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, tf.transpose(decoder_inputs, [1,0]))

    # Each encoder step sees the word embedding plus one extra feature
    # taken from ans_locs.
    encoder_inputs = tf.concat([paragraphs_embedded, tf.expand_dims(tf.cast(tf.transpose(ans_locs, [1,0]), tf.float32), axis=2)],axis=2)

    encoder_cell_fw = LSTMCell(hidden_units)
    encoder_cell_bw = LSTMCell(hidden_units)

    ((encoder_fw_outputs,encoder_bw_outputs),(encoder_fw_final_state,encoder_bw_final_state)) = (
        tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell_fw,
                                        cell_bw=encoder_cell_bw,
                                        inputs=encoder_inputs,
                                        sequence_length=encoder_inputs_lengths,
                                        dtype=tf.float32, time_major=True)
    )

    encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
    encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

    # NOTE(review): encoder_final_state is built but never handed to the
    # decoder, which starts from a zero state below -- confirm this is
    # intended.
    encoder_final_state = LSTMStateTuple(
        c=encoder_final_state_c,
        h=encoder_final_state_h
    )

    # Shape: (batch_size, time_step, hidden_units * 2), batch-major for attention.
    encoder_outputs = tf.transpose(tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1,0,2])

    decoder_cell = LSTMCell(hidden_units*2)

    attention_mechanism = BahdanauAttention(attention_units, encoder_outputs)
    attention_cell = AttentionWrapper(decoder_cell, attention_mechanism)

    copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids, vocab_size, gen_vocab_size)

    decoder_initial_state = copynet_cell.zero_state(batch_size, tf.float32).clone(cell_state=attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32))

    helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs_embedded, targets_lengths, time_major=True)
    #helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, tf.ones([batch_size], dtype=tf.int32), 0)

    decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell, helper, decoder_initial_state, output_layer=None)
    decoder_outputs, final_state, coder_seq_length = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
    decoder_logits, decoder_ids = decoder_outputs

    #LOSS
    # dynamic_decode returns batch-major outputs; labels and logits are both
    # brought to time-major before the cross entropy.
    decoder_targets = tf.transpose(targets, [1,0])
    labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
    decoder_logits_ = tf.transpose(decoder_logits,[1,0,2])

    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels,
        logits=decoder_logits_
    )

    # Disabled alternative: count tokens up to the first EOS in each target.
    # eos = tf.constant(config['EOS'], dtype=tf.int32)
    # where_eos_targ = tf.cast(tf.equal(tf.cast(decoder_targets, dtype=tf.int32), eos), tf.float32)
    # n_tokens = tf.cast(tf.argmax(where_eos_targ, axis=0), tf.float32)

    # NOTE(review): the loss averages over every time step, padded positions
    # included (no mask from targets_lengths is applied).
    targets_max_len, _ = tf.unstack(tf.shape(decoder_targets))
    self.loss = tf.reduce_sum(stepwise_cross_entropy, axis=0) / tf.cast(targets_max_len, tf.float32)
    self.loss = tf.reduce_sum(self.loss) / tf.cast(batch_size, tf.float32)
    #self.loss = tf.Print(self.loss,[tf.nn.softmax(decoder_logits),labels], summarize=100)

    optimizer = tf.train.AdagradOptimizer(self.learning_rate)  # tf.train.GradientDescent
    gradients, variables = zip(*optimizer.compute_gradients(self.loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    self.train_op = optimizer.apply_gradients(zip(gradients, variables))  # .minimize(self.loss)

    self.saver = tf.train.Saver(max_to_keep=None)

    # Create the checkpoint directory (and the parent "ckpt" directory if it
    # is missing) without going through a shell.
    ckpt_dir = os.path.join("ckpt", str(self.n_sess))
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    self.sess.run(tf.global_variables_initializer())
def build_graph(self):
    """Build the graph for the mode selected by ``self.options.mode``.

    A fresh ``tf.Graph`` is stored on ``self.graph``. Placeholders, the
    embedding, the GRU encoder and the attention decoder cell are always
    created; the head depends on the mode:

    * ``'TRAIN'``: ``self.loss``, ``self.loss_batch``, ``self.optimizer``
      and ``self.init``.
    * ``'PREDICT'``: beam search, giving ``self.predicted_ids`` and
      ``self.prob`` (the final beam log-probabilities).
    * ``'POST_PREDICT'``: ``self.prob`` is the negated, masked sum of the
      per-token cross entropy of ``self.target``.

    ``self.tvars`` and ``self.saver`` are set in every mode.
    """
    # build_graph-train vs validate-train
    print('Building the TensorFlow graph...')
    opts = self.options

    self.graph = tf.Graph()
    with self.graph.as_default():
        # All placeholders have fully static shapes taken from the options.
        self.enc_input = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_enc])
        self.dec_input = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])
        self.target = tf.placeholder(
            tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])

        self.enc_input_len = tf.placeholder(tf.int32, shape=[opts.batch_size])
        self.dec_input_len = tf.placeholder(tf.int32, shape=[opts.batch_size])

        # Per-word lookup tables fed at run time, one row per vocabulary
        # entry.
        self.VAD = tf.placeholder(tf.float32, shape=[opts.corpus_size, 3])
        self.termfreq = tf.placeholder(tf.float32, shape=[opts.corpus_size, 1])
        self.VAD_loss = tf.placeholder(tf.float32, shape=[opts.corpus_size, 1])

        with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
            # how to get input_embed for encoder and decoder
            word_embeddings = tf.Variable(tf.random_uniform(
                [opts.corpus_size, opts.word_embed_size], -1.0, 1.0),
                                          name='embedding')
            # word_embeddings = tf.constant(opts.word_embeddings, name = 'word_embeddings')
            enc_input_embed = tf.nn.embedding_lookup(
                word_embeddings, self.enc_input)
            dec_input_embed = tf.nn.embedding_lookup(
                word_embeddings, self.dec_input)

            # The same token ids index the fed VAD / term-frequency tables.
            enc_input_VAD = tf.nn.embedding_lookup(self.VAD, self.enc_input)
            target_VAD = tf.nn.embedding_lookup(self.VAD, self.target)

            enc_input_tf = tf.nn.embedding_lookup(self.termfreq,
                                                  self.enc_input)
            target_tf = tf.nn.embedding_lookup(self.termfreq, self.target)

            # Squeeze drops the trailing size-1 axis so the result can be
            # multiplied with the sequence mask in the TRAIN branch.
            target_VAD_loss = tf.nn.embedding_lookup(
                self.VAD_loss, self.target)
            target_VAD_loss = tf.squeeze(target_VAD_loss)

        with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
            cell_enc = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc)
            # bi-directional?
            enc_outputs, _ = tf.nn.dynamic_rnn(
                cell_enc,
                enc_input_embed,
                sequence_length=self.enc_input_len,
                dtype=tf.float32)

        if opts.mode == 'PREDICT':
            # Beam search: every tensor the attention mechanism reads is
            # repeated beam_width times.
            enc_outputs = tile_batch(enc_outputs, multiplier=opts.beam_width)
            enc_input_embed = tile_batch(enc_input_embed,
                                         multiplier=opts.beam_width)
            enc_input_VAD = tile_batch(enc_input_VAD,
                                       multiplier=opts.beam_width)
            enc_input_tf = tile_batch(enc_input_tf,
                                      multiplier=opts.beam_width)
            tiled_enc_input_len = tile_batch(self.enc_input_len,
                                             multiplier=opts.beam_width)
        else:
            tiled_enc_input_len = self.enc_input_len

        # with tf.variable_scope('attention', reuse = tf.AUTO_REUSE) as attention_layer:
        #     attention_Wb = tf.layers.Dense(units=3,
        #                                    use_bias=False,
        #                                    kernel_initializer = tf.truncated_normal_initializer(stddev = 0.1),
        #                                    name='attention_Wb')

        with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
            # attn_mechanism: alpha_<t,t'>
            attn_mechanism = MyBahdanauAttention(
                num_units=opts.attn_depth,
                memory=enc_outputs,
                memory_sequence_length=tiled_enc_input_len,
                enc_input_embed=enc_input_embed,
                enc_input_VAD=enc_input_VAD,
                enc_input_tf=enc_input_tf,
                VAD_mode=opts.VAD_mode)
            cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
            # AttentionWrapper: c?
            cell_dec = AttentionWrapper(cell_dec,
                                        attn_mechanism,
                                        output_attention=False)
            # Projects decoder outputs to vocabulary logits; shared by all
            # modes.
            output_layer = tf.layers.Dense(
                units=opts.corpus_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.1))

            # Train
            if opts.mode == 'TRAIN':
                # Start from a zero state whose attention field is replaced
                # by the attention computed for that zero state.
                dec_initial_state = cell_dec.zero_state(
                    opts.batch_size, tf.float32)
                attention = compute_attention(
                    attn_mechanism,
                    dec_initial_state.cell_state)  #(1,256)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                # Teacher forcing: the wrapped cell is unrolled over the
                # given decoder inputs inside the 'decoding' scope.
                outputs_dec, _ = tf.nn.dynamic_rnn(
                    cell=cell_dec,
                    inputs=dec_input_embed,
                    sequence_length=self.dec_input_len,
                    initial_state=dec_initial_state,
                    dtype=tf.float32,
                    scope=vs)
                # logits: `[batch_size, sequence_length, num_decoder_symbols]`
                # The logits correspond to the prediction across all classes at each timestep.
                logits = output_layer.apply(outputs_dec)
                # batch size * max sentence length; binary; 0 for non-word in original sentence; mask
                sequence_mask = tf.sequence_mask(
                    self.dec_input_len,
                    maxlen=opts.max_uttr_len_dec,
                    dtype=tf.float32)
                if opts.VAD_mode:
                    weights = sequence_mask * target_VAD_loss  # affective objective function
                else:
                    weights = sequence_mask
                # sequence_mask: [batch_size, max_len]
                # target: [batch_size, max_len] VAD_loss: [batch_size,max_len]
                # softmax_loss_function(labels=targets, logits=logits_flat) defaults to sparse_softmax_cross_entropy_with_logits
                self.loss = sequence_loss(logits, self.target, weights)
                # Same loss, but kept per example instead of averaged over
                # the batch.
                self.loss_batch = sequence_loss(logits,
                                                self.target,
                                                weights,
                                                average_across_batch=False)
                self.optimizer = tf.train.AdamOptimizer(
                    opts.learning_rate).minimize(self.loss)
                # NOTE(review): nesting reconstructed from a source with no
                # indentation; self.init is assumed to be TRAIN-only.
                self.init = tf.global_variables_initializer()

            # Predict
            if opts.mode == 'PREDICT':
                # The state is sized for the tiled batch, the start tokens
                # for the original batch.
                dec_initial_state = cell_dec.zero_state(
                    opts.batch_size * opts.beam_width, tf.float32)
                attention = compute_attention(attn_mechanism,
                                              dec_initial_state.cell_state)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                start_tokens = tf.constant(opts.go_index,
                                           dtype=tf.int32,
                                           shape=[opts.batch_size])
                bs_decoder = BeamSearchDecoder(
                    cell=cell_dec,
                    embedding=word_embeddings,
                    start_tokens=start_tokens,
                    end_token=opts.eos_index,
                    initial_state=dec_initial_state,
                    beam_width=opts.beam_width,
                    output_layer=output_layer)
                final_outputs, final_state, _ = dynamic_decode(
                    bs_decoder,
                    impute_finished=False,
                    maximum_iterations=opts.max_uttr_len_dec,
                    scope=vs)
                self.predicted_ids = final_outputs.predicted_ids
                # self.scores = final_outputs.scores  # 'FinalBeamSearchDecoderOutput' object has no attribute 'scores'
                self.prob = final_state.log_probs
                # log_probs: The log probabilities with shape `[batch_size, beam_width, vocab_size]`.
                # logits: Logits at the current time step. A tensor of shape `[batch_size, beam_width, vocab_size]`
                # step_log_probs = nn_ops.log_softmax(logits)  # logsoftmax = logits - log(reduce_sum(exp(logits), axis))
                # step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
                # total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
                # final_outputs.scores  #[batch_size, length, beam_width]

            if opts.mode == 'POST_PREDICT':
                # Same teacher-forced pass as TRAIN, used to score a given
                # target instead of training on it.
                dec_initial_state = cell_dec.zero_state(
                    opts.batch_size, tf.float32)
                attention = compute_attention(
                    attn_mechanism,
                    dec_initial_state.cell_state)  #(1,256)
                dec_initial_state = dec_initial_state.clone(
                    attention=attention)
                outputs_dec, _ = tf.nn.dynamic_rnn(
                    cell=cell_dec,
                    inputs=dec_input_embed,
                    sequence_length=self.dec_input_len,
                    initial_state=dec_initial_state,
                    dtype=tf.float32,
                    scope=vs)
                logits = output_layer.apply(outputs_dec)
                sequence_mask = tf.sequence_mask(
                    self.dec_input_len,
                    maxlen=opts.max_uttr_len_dec,
                    dtype=tf.float32)
                score = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.target, logits=logits)
                # Negated masked cross entropy, summed over the whole batch
                # into a single scalar.
                self.prob = -1 * tf.reduce_sum(score * sequence_mask)

        self.tvars = tf.trainable_variables()
        self.saver = tf.train.Saver(max_to_keep=100)
def __graph__(self):
    """Connect the encoder to an attention decoder and build the mode head.

    In ``'train'`` mode this sets ``self.decoder_outputs``, ``self.loss``,
    ``self.summary_op`` and ``self.train_op``. In ``'decode'`` mode it sets
    ``self.decoder_predict_decode``. Any other mode only builds the shared
    encoder/decoder pieces.
    """
    # encoder
    memory, enc_final_state = self.encoder()

    # decoder
    with tf.variable_scope('decoder'):
        memory_lengths = self.encoder_inputs_length

        if self.beam_search:
            # Beam search needs the encoder outputs, state and lengths
            # tiled, i.e. repeated beam_size times.
            print("use beamsearch decoding..")
            memory = tile_batch(memory, multiplier=self.beam_size)
            enc_final_state = nest.map_structure(
                lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size),
                enc_final_state)
            memory_lengths = tile_batch(memory_lengths,
                                        multiplier=self.beam_size)

        # Attention mechanism over the (possibly tiled) encoder outputs.
        attention = BahdanauAttention(
            num_units=self.rnn_size,
            memory=memory,
            memory_sequence_length=memory_lengths)

        # Decoder RNN cell, wrapped with the attention mechanism.
        attn_cell = AttentionWrapper(
            cell=self.create_rnn_cell(),
            attention_mechanism=attention,
            attention_layer_size=self.rnn_size,
            name='Attention_Wrapper')

        # With beam search the effective batch is batch_size * beam_size.
        if self.beam_search:
            effective_batch = self.batch_size * self.beam_size
        else:
            effective_batch = self.batch_size

        # The decoder starts from the encoder's final state.
        zero_state = attn_cell.zero_state(batch_size=effective_batch,
                                          dtype=tf.float32)
        initial_state = zero_state.clone(cell_state=enc_final_state)

        projection = tf.layers.Dense(
            self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))

        if self.mode == 'decode':
            self.decoder_predict_decode = self.decoder_decode(
                attn_cell, initial_state, projection)
        elif self.mode == 'train':
            self.decoder_outputs = self.decoder_train(
                attn_cell, initial_state, projection)

            # loss
            self.loss = sequence_loss(logits=self.decoder_outputs,
                                      targets=self.decoder_targets,
                                      weights=self.mask)

            # summary
            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()

            # optimizer: Adam on globally norm-clipped gradients
            adam = tf.train.AdamOptimizer(self.learing_rate)
            params = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, params)
            clipped_grads, _ = tf.clip_by_global_norm(
                raw_grads, self.max_gradient_norm)
            self.train_op = adam.apply_gradients(zip(clipped_grads, params))
def build_initial_graph(self, encoder_input, len_both, beam_width=1, reuse=False):
    """
    Building initial graph with input for encoder and a fixed beam width

    Embeds the input, runs the bidirectional LSTM encoder, merges the forward
    and backward states per layer, tiles everything by ``beam_width`` and
    wraps the decoder cell with the configured attention mechanism.

    :param encoder_input: Input that will be processed from encoder
    :param len_both: length of input
    :param beam_width: beam width
    :param reuse: If this graph already exists and should be reused, like in validation graph
    :return: decoder cell and attention zero state
    :raises SystemExit: if ``self.attention`` is neither "luong" nor
        "bahdanau" (exits with a non-zero status)
    """
    # Choose luong or bahdanau attention. Checked first so that a bad
    # configuration fails before any graph ops are created, and with a
    # non-zero exit status instead of a plain print + exit(0).
    if self.attention == "luong":
        AttentionBuilder = LuongAttention
    elif self.attention == "bahdanau":
        AttentionBuilder = BahdanauAttention
    else:
        sys.exit("Attention mechanism not found.")

    # look up embeddings for input sequence
    encoder_subject_embedded = tf.nn.embedding_lookup(
        self.embeddings_english, encoder_input)

    # Define variable scope for LSTM Encoder
    with tf.variable_scope("LSTM_Encoder_subject", reuse=reuse):
        # Create a bidirectional lstm with encoder forward cell and encoder
        # backward cell defined as class variables
        outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
            self.encoder_cell_forward,
            self.encoder_cell_backward,
            inputs=encoder_subject_embedded,
            sequence_length=len_both,
            dtype=tf.float32,
            time_major=False)

    # Concat outputs of forward and backward lstm on the feature axis
    outputs = tf.concat(outputs, 2)

    # Unpack forward and backward states; each is iterated per layer, with
    # index 0 the c (cell) state and index 1 the h (hidden) state.
    forward_states, backward_states = output_states

    # Rearrange states to give them into decoder: per layer, concat forward
    # and backward c state, then forward and backward h state.
    state_tuples = tuple(
        LSTMStateTuple(
            tf.concat([forward[0], backward[0]], 1),
            tf.concat([forward[1], backward[1]], 1))
        for forward, backward in zip(forward_states, backward_states))

    # multiply rnn output if beam search is used
    outputs = tile_batch(outputs, beam_width)
    len_both = tile_batch(len_both, beam_width)
    encoder_final_state = tile_batch(state_tuples, beam_width)

    # Define variable scope for attention mechanism
    with tf.variable_scope("Attention", reuse=reuse):
        # Create an attention mechanism
        attention_mechanism = AttentionBuilder(self.attention_size, outputs,
                                               len_both)

        # Create Attention wrapper with decoder cell
        decoder_cell = AttentionWrapper(self.decoder_cell,
                                        attention_mechanism,
                                        self.attention_size)

    # Create zero state of decoder cell with specified batch size and beam width
    attn_zero_state = decoder_cell.zero_state(
        batch_size=self.batch_size * beam_width, dtype=tf.float32)

    # Set cell state to final encoder state
    attn_zero_state = attn_zero_state.clone(cell_state=encoder_final_state)

    return decoder_cell, attn_zero_state
def add_decoder_op(self, enc_final_state, enc_hidden_states, output_embed_matrix, training):
    """Build the three-part (trigger / query / action) decoder.

    The decoder has four heads on top of the encoder's final state:

    * one classifier per part that picks the part's function token,
    * one sequence decoder per part that emits the remaining tokens,
    * a "top" classifier that picks the begin token,
    * a "special" sequence decoder for the non-rule case.

    Sequence decoders work in a compressed vocabulary made of the control
    tokens followed directly by the value tokens; ids are shifted back to
    the full output vocabulary before the final sequence is assembled.

    Args:
        enc_final_state: encoder final state (possibly nested); flattened
            and concatenated on axis 1 for the classifiers, and used as the
            structure template for the decoder initial states.
        enc_hidden_states: encoder outputs, used as attention memory when
            ``self.config.apply_attention`` is set.
        output_embed_matrix: output embedding matrix over the full output
            vocabulary; sliced down to the compressed vocabulary here.
        training: Python bool; when true the function tokens come from
            ``self.part_function_placeholders`` instead of the predictions.

    Returns:
        ``ThreePartAlignerResult(top_logits, part_logit_preds,
        part_logit_sequence_preds, logit_special_sequence, full_sequence)``.
    """
    original_enc_final_state = enc_final_state
    flat_enc_final_state = nest.flatten(enc_final_state)
    enc_final_state = tf.concat(flat_enc_final_state, axis=1)
    enc_final_size = int(enc_final_state.get_shape()[1])

    part_logit_preds = dict()
    part_token_preds = dict()
    part_logit_sequence_preds = dict()
    part_token_sequence_preds = dict()
    part_layers = []
    grammar = self.config.grammar

    # Per-part function classifiers: hidden layer -> dropout -> logits over
    # that part's functions.
    for i, part in enumerate(('trigger', 'query', 'action')):
        with tf.variable_scope('decode_function_' + part):
            # The non-linearity is looked up by name in tf.nn first, then in
            # tf.
            activation = getattr(
                tf.nn, self.config.function_nonlinearity) if hasattr(
                    tf.nn, self.config.function_nonlinearity) else getattr(
                        tf, self.config.function_nonlinearity)
            layer = tf.contrib.layers.fully_connected(
                enc_final_state,
                self.config.function_hidden_size,
                activation_fn=activation)
            part_layers.append(layer)
            # A distinct dropout seed per part (0, 443, 886).
            layer_with_dropout = tf.nn.dropout(
                layer, keep_prob=self.dropout_placeholder, seed=443 * i)
            part_logit_preds[part] = tf.layers.dense(
                layer_with_dropout, len(grammar.functions[part]))
            part_token_preds[part] = tf.cast(tf.argmax(
                part_logit_preds[part], axis=1),
                                             dtype=tf.int32)

    # Full vocabulary layout implied by the offsets below: control tokens,
    # begin tokens, function tokens, then value tokens.
    first_value_token = grammar.num_functions + grammar.num_begin_tokens + grammar.num_control_tokens
    num_value_tokens = grammar.output_size - first_value_token
    # Compressed embedding matrix: control-token rows followed by the
    # value-token rows.
    output_embed_matrix = tf.concat(
        (output_embed_matrix[0:grammar.num_control_tokens],
         output_embed_matrix[first_value_token:]),
        axis=0)

    # Shift the per-part argmax indices to ids in the full vocabulary; each
    # part's functions follow those of the previous part.
    adjusted_trigger = part_token_preds['trigger'] + (
        grammar.num_control_tokens + grammar.num_begin_tokens)
    adjusted_query = part_token_preds['query'] + (
        grammar.num_control_tokens + grammar.num_begin_tokens +
        len(grammar.functions['trigger']))
    adjusted_action = part_token_preds['action'] + (
        grammar.num_control_tokens + grammar.num_begin_tokens +
        len(grammar.functions['trigger']) + len(grammar.functions['query']))

    # All three hidden layers together seed every sequence decoder.
    layer_concat = tf.concat(part_layers, axis=1)

    for i, part in enumerate(('trigger', 'query', 'action')):
        with tf.variable_scope('decode_sequence_' + part):

            # One dense projection per flattened encoder-state tensor, with
            # the same width as that tensor.
            def one_decoder_input(i, like):
                with tf.variable_scope(str(i)):
                    return tf.layers.dense(layer_concat,
                                           like.get_shape()[1])

            flat_decoder_initial_state = [
                one_decoder_input(i, like)
                for i, like in enumerate(flat_enc_final_state)
            ]
            # Re-pack the projections into the encoder state's structure.
            decoder_initial_state = nest.pack_sequence_as(
                original_enc_final_state, flat_decoder_initial_state)
            cell_dec = tf.contrib.rnn.MultiRNNCell([
                self.make_rnn_cell(i, True)
                for i in range(self.config.rnn_layers)
            ])

            # uncompress function tokens (to look them up in the grammar)
            if training:
                adjusted_function_token = self.part_function_placeholders[
                    part]
            else:
                if part == 'trigger':
                    adjusted_function_token = adjusted_trigger
                elif part == 'query':
                    adjusted_function_token = adjusted_query
                elif part == 'action':
                    adjusted_function_token = adjusted_action

            # adjust the sequence to "skip" function tokens
            output_size = grammar.num_control_tokens + num_value_tokens
            output = self.part_sequence_placeholders[part]
            # Ids at or above the control range are moved down into the
            # compressed vocabulary; control ids stay unchanged.
            adjusted_output = tf.where(
                output >= grammar.num_control_tokens,
                output - (first_value_token - grammar.num_control_tokens),
                output)

            if self.config.apply_attention:
                attention = LuongAttention(self.config.decoder_hidden_size,
                                           enc_hidden_states,
                                           self.input_length_placeholder,
                                           probability_fn=tf.nn.softmax)
                # cell_input_fn passes the inputs through unchanged, so the
                # attention vector is not concatenated to the cell input.
                cell_dec = AttentionWrapper(
                    cell_dec,
                    attention,
                    cell_input_fn=lambda inputs, _: inputs,
                    attention_layer_size=self.config.decoder_hidden_size,
                    initial_cell_state=decoder_initial_state)
                # The wrapper was given initial_cell_state above; its zero
                # state replaces the plain cell state as the decoder start.
                decoder_initial_state = cell_dec.zero_state(
                    self.batch_size, dtype=tf.float32)

            decoder = Seq2SeqDecoder(
                self.config,
                self.input_placeholder,
                self.input_length_placeholder,
                adjusted_output,
                self.part_sequence_length_placeholders[part],
                self.batch_number_placeholder,
                max_length=MAX_PRIMITIVE_LENGTH)
            rnn_output, sample_ids = decoder.decode(
                cell_dec,
                decoder_initial_state,
                output_size,
                output_embed_matrix,
                training,
                grammar_helper=PrimitiveSequenceGrammarHelper(
                    grammar, adjusted_function_token))
            part_logit_sequence_preds[part] = rnn_output
            part_token_sequence_preds[part] = tf.cast(sample_ids,
                                                      dtype=tf.int32)

    # Classifier for the begin token.
    with tf.variable_scope('top_classifier'):
        top_hidden = tf.contrib.layers.fully_connected(
            enc_final_state,
            self.config.first_token_hidden_size,
            activation_fn=tf.tanh)
        top_hidden_with_dropout = tf.nn.dropout(
            top_hidden, keep_prob=self.dropout_placeholder, seed=127)
        top_logits = tf.layers.dense(top_hidden_with_dropout,
                                     grammar.num_begin_tokens)
        top_token = tf.cast(tf.argmax(top_logits, axis=1), dtype=tf.int32)

    # Sequence decoder for the non-rule ("special") case; unlike the part
    # decoders it starts directly from the encoder's final state.
    with tf.variable_scope('decode_special'):
        output_size = grammar.num_control_tokens + num_value_tokens
        output = self.special_label_placeholder
        adjusted_output = tf.where(
            output >= grammar.num_control_tokens,
            output - (first_value_token - grammar.num_control_tokens),
            output)
        cell_dec = tf.contrib.rnn.MultiRNNCell([
            self.make_rnn_cell(i, True)
            for i in range(self.config.rnn_layers)
        ])
        # Every special sequence is given the fixed length
        # MAX_SPECIAL_LENGTH.
        sequence_length = tf.ones(
            (self.batch_size, ), dtype=tf.int32) * MAX_SPECIAL_LENGTH
        decoder_initial_state = original_enc_final_state
        if self.config.apply_attention:
            attention = LuongAttention(self.config.decoder_hidden_size,
                                       enc_hidden_states,
                                       self.input_length_placeholder,
                                       probability_fn=tf.nn.softmax)
            cell_dec = AttentionWrapper(
                cell_dec,
                attention,
                cell_input_fn=lambda inputs, _: inputs,
                attention_layer_size=self.config.decoder_hidden_size,
                initial_cell_state=original_enc_final_state)
            decoder_initial_state = cell_dec.zero_state(self.batch_size,
                                                        dtype=tf.float32)
        decoder = Seq2SeqDecoder(self.config,
                                 self.input_placeholder,
                                 self.input_length_placeholder,
                                 adjusted_output,
                                 sequence_length,
                                 self.batch_number_placeholder,
                                 max_length=MAX_SPECIAL_LENGTH)
        rnn_output, sample_ids = decoder.decode(
            cell_dec,
            decoder_initial_state,
            output_size,
            output_embed_matrix,
            training,
            grammar_helper=SpecialSequenceGrammarHelper(grammar))
        logit_special_sequence = rnn_output
        token_special_sequence = tf.cast(sample_ids, dtype=tf.int32)

    # adjust tokens back to their output code
    adjusted_top = tf.expand_dims(top_token + grammar.num_control_tokens,
                                  axis=1)
    adjusted_special_sequence = tf.where(
        token_special_sequence >= grammar.num_control_tokens,
        token_special_sequence +
        (first_value_token - grammar.num_control_tokens),
        token_special_sequence)

    adjusted_token_sequences = dict()
    for part in ('trigger', 'query', 'action'):
        token_sequence = part_token_sequence_preds[part]
        adjusted_token_sequence = tf.where(
            token_sequence >= grammar.num_control_tokens,
            token_sequence +
            (first_value_token - grammar.num_control_tokens),
            token_sequence)
        adjusted_token_sequences[part] = adjusted_token_sequence

    # remove EOS from the middle of the sentence: only the last part
    # (action) keeps its end tokens, trigger and query get 0 instead
    adjusted_token_sequences['trigger'] = tf.where(
        tf.equal(adjusted_token_sequences['trigger'], grammar.end),
        tf.zeros_like(adjusted_token_sequences['trigger']),
        adjusted_token_sequences['trigger'])
    adjusted_token_sequences['query'] = tf.where(
        tf.equal(adjusted_token_sequences['query'], grammar.end),
        tf.zeros_like(adjusted_token_sequences['query']),
        adjusted_token_sequences['query'])

    # Function tokens become single-column tensors so they can be
    # concatenated with the sequences along axis 1.
    adjusted_trigger = tf.expand_dims(adjusted_trigger, axis=1)
    adjusted_query = tf.expand_dims(adjusted_query, axis=1)
    adjusted_action = tf.expand_dims(adjusted_action, axis=1)

    program_sequence = tf.concat(
        (adjusted_top, adjusted_trigger,
         adjusted_token_sequences['trigger'], adjusted_query,
         adjusted_token_sequences['query'], adjusted_action,
         adjusted_token_sequences['action']),
        axis=1)
    full_special_sequence = tf.concat(
        (adjusted_top, adjusted_special_sequence), axis=1)
    # full special sequence is smaller than program sequence, so we need to pad it all the way to the same shape
    full_special_sequence = pad_up_to(full_special_sequence,
                                      tf.shape(program_sequence)[1],
                                      rank=1)

    # Per example: take the program sequence when the predicted begin token
    # is 'rule', otherwise the special sequence.
    rule_token = grammar.dictionary['rule'] - grammar.num_control_tokens
    full_sequence = tf.where(tf.equal(top_token, rule_token),
                             program_sequence, full_special_sequence)

    return ThreePartAlignerResult(top_logits, part_logit_preds,
                                  part_logit_sequence_preds,
                                  logit_special_sequence, full_sequence)
def buildModel(self):
    """Build a BiLSTM-encoder / Luong-attention-decoder seq2seq graph.

    Creates the input placeholders, embedding, multi-layer bidirectional
    LSTM encoder, a teacher-forced training decoder, a beam-search decoder
    (beam width 3), the masked cross-entropy loss, and an Adam train op with
    global-norm gradient clipping and a Noam learning-rate schedule.

    Returns:
        Tuple ``(x, y, x_len, y_len, logits, loss, prediction,
        beam_decoder_result_ids, global_step, train_op, summaries)``. Note
        that ``logits`` holds softmax probabilities, not raw logits.
    """
    T_in = self.args.T_in
    T_out = self.args.T_out
    D_in = self.args.D_in
    D_out = self.args.D_out
    E = self.args.embedding_dim
    H = self.args.hidden_dim
    SOS = self.args.SOS
    EOS = self.args.EOS
    PAD = self.args.PAD
    beam_width = 3

    # Input
    with tf.name_scope('input'):
        # N, T_in
        x = tf.placeholder(shape=(None, T_in), dtype=tf.int32, name='encoder_inputs')
        # N, T_out
        y = tf.placeholder(shape=(None, T_out), dtype=tf.int32, name='decoder_inputs')
        # N
        x_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
        # N
        y_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
        # dynamic sample num
        batch_size = tf.shape(x)[0]

        # symbol mask: one column of each special token per example
        # (eos is not referenced again in this method)
        sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
        eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
        pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD

        # input mask (x_mask and y_with_sos_mask are not referenced again in
        # this method)
        x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32)
        y_with_sos_mask = tf.sequence_mask(y_len, T_out + 1, dtype=tf.float32)
        y_with_pad = tf.concat([y, pad], axis=1)
        # EOS at position y_len of each row, zero elsewhere.
        eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS

        # masked inputs
        # Adding eos_mask only yields EOS at position y_len if the padded
        # value there is 0 -- presumably PAD == 0; TODO confirm.
        y_with_eos = y_with_pad + eos_mask
        y_with_sos = tf.concat([sos, y], axis=1)

    ## Embedding
    with tf.name_scope('embedding'):
        if self.args.use_pretrained:
            # Raw float32 file reshaped to rows of width E; kept frozen.
            embedding_pretrained = np.fromfile(self.args.pretrained_file,
                                               dtype=np.float32).reshape(
                                                   (-1, E))
            embedding = tf.Variable(embedding_pretrained, trainable=False)
        else:
            embedding = tf.get_variable(name='embedding',
                                        shape=(D_in, E),
                                        dtype=tf.float32,
                                        initializer=xavier_initializer())
        # The same embedding matrix is used for encoder and decoder inputs.
        e_x = tf.nn.embedding_lookup(embedding, x)
        e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
        if self.args.mode == 'train':
            e_x = tf.nn.dropout(e_x, self.args.keep_prob)

    ## Encoder
    with tf.name_scope('encoder'):
        ## Multi-BiLSTM
        fw_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        bw_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            e_x,
            sequence_length=x_len,
            dtype=tf.float32,
            time_major=False,
            scope=None)
        # Forward and backward outputs are summed (not concatenated), so the
        # memory keeps width H; only the forward final state is passed on.
        encoder_output = bi_encoder_output[0] + bi_encoder_output[1]
        encoder_final_state = bi_encoder_state[0]

    ## Decoder
    with tf.name_scope('decoder'):
        decoder_cell = rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])
        # Training always decodes the full T_out + 1 steps; padding is
        # handled by the loss mask instead.
        decoder_lengths = tf.ones(shape=[batch_size],
                                  dtype=tf.int32) * (T_out + 1)

        ## Training decoder
        with tf.variable_scope('attention'):
            attention_mechanism = LuongAttention(
                num_units=H,
                memory=encoder_output,
                memory_sequence_length=x_len,
                name='attention_fn')
        projection_layer = Dense(units=D_out,
                                 kernel_initializer=xavier_initializer())
        train_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=H)
        train_decoder_init_state = train_decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_final_state)
        training_helper = TrainingHelper(e_y,
                                         decoder_lengths,
                                         time_major=False)
        train_decoder = BasicDecoder(
            cell=train_decoder_cell,
            helper=training_helper,
            initial_state=train_decoder_init_state,
            output_layer=projection_layer)
        train_decoder_outputs, _, _ = dynamic_decode(
            train_decoder,
            impute_finished=True,
            maximum_iterations=T_out + 1)
        # N, T_out+1, D_out
        train_decoder_outputs = ln(train_decoder_outputs.rnn_output)

        ## Beam_search decoder
        # Memory, initial state and lengths are repeated beam_width times.
        beam_memory = tile_batch(encoder_output, beam_width)
        beam_memory_state = tile_batch(encoder_final_state, beam_width)
        beam_memory_length = tile_batch(x_len, beam_width)
        # reuse=True shares the attention variables with the training
        # mechanism created above.
        with tf.variable_scope('attention', reuse=True):
            beam_attention_mechanism = LuongAttention(
                num_units=H,
                memory=beam_memory,
                memory_sequence_length=beam_memory_length,
                name='attention_fn')
        # NOTE(review): the training wrapper uses attention_layer_size=H
        # while this one uses None, so the two paths compute the attention
        # output differently -- confirm this mismatch is intended.
        beam_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=beam_attention_mechanism,
            attention_layer_size=None)
        beam_decoder_init_state = beam_decoder_cell.zero_state(
            batch_size=batch_size * beam_width,
            dtype=tf.float32).clone(cell_state=beam_memory_state)
        # One start token per original (untiled) batch entry.
        start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
        beam_decoder = BeamSearchDecoder(
            cell=beam_decoder_cell,
            embedding=embedding,
            start_tokens=start_tokens,
            end_token=EOS,
            initial_state=beam_decoder_init_state,
            beam_width=beam_width,
            output_layer=projection_layer)
        beam_decoder_outputs, _, _ = dynamic_decode(
            beam_decoder,
            scope=tf.get_variable_scope(),
            maximum_iterations=T_out + 1)
        beam_decoder_result_ids = beam_decoder_outputs.predicted_ids

    with tf.name_scope('loss'):
        # Despite the name, `logits` holds softmax probabilities; that is
        # what sparse_categorical_crossentropy expects by default.
        logits = tf.nn.softmax(train_decoder_outputs)
        cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
            y_with_eos, logits)
        # y_len + 1 keeps the EOS position in the loss.
        loss_mask = tf.sequence_mask(y_len + 1, T_out + 1, dtype=tf.float32)
        # Summed over tokens, averaged over examples only.
        loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
            batch_size, dtype=tf.float32)
        prediction = tf.argmax(logits, 2)

    ## train_op
    with tf.name_scope('train'):
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)

        ## gradient clips
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, self.args.gradient_clip_num)
        train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                 trainable_params),
                                             global_step=global_step)

    # Summary
    with tf.name_scope('summary'):
        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)
        summaries = tf.summary.merge_all()

    return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
def build_model(self):
    """Build the attention-based seq2seq graph for the configured mode.

    Creates the input placeholders, the embedding (via
    ``self.init_embedding``), the encoder (via ``build_encoder``) and an
    attention-wrapped decoder.

    * ``self.mode == 'train'``: sets ``self.decoder_logits_train``,
      ``self.loss`` and ``self.train_op`` (Adam with global-norm gradient
      clipping at ``self.max_gradient_norm``).
    * ``self.mode == 'infer'``: sets ``self.infer_outputs`` from either a
      beam-search decoder (``self.beam_search`` truthy) or a greedy decoder.

    ``self.saver`` is created in every mode.

    Beam-search tiling of the encoder tensors is applied only in ``'infer'``
    mode: the training decoder consumes un-tiled ``decoder_inputs``, so
    tiling the attention memory / initial state there would leave the batch
    dimensions inconsistent.
    """
    print('building model... ...')

    with tf.variable_scope('seq2seq_placeholder'):
        self.encoder_inputs = tf.placeholder(
            tf.int32, [None, None], name="encoder_inputs")
        self.decoder_inputs = tf.placeholder(
            tf.int32, [None, None], name="decoder_inputs")
        self.decoder_targets = tf.placeholder(
            tf.int32, [None, None], name="decoder_targets")
        self.decoder_targets_masks = tf.placeholder(
            tf.float32, [None, None], name="mask")
        self.encoder_length = tf.placeholder(
            tf.int32, [None], name="encoder_length")
        self.decoder_length = tf.placeholder(
            tf.int32, [None], name="decoder_length")
        # Longest target in the batch; bounds the training decode loop.
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')

    with tf.variable_scope('seq2seq_embedding'):
        self.embedding = self.init_embedding(self.vocab_size,
                                             self.embedding_size)

    with tf.variable_scope('seq2seq_encoder'):
        encoder_outputs, encoder_states = build_encoder(
            self.embedding,
            self.encoder_inputs,
            self.encoder_length,
            self.enc_num_layers,
            self.enc_num_units,
            self.enc_cell_type,
            bidir=self.enc_bidir)

    with tf.variable_scope('seq2seq_decoder'):
        # Beam search is a decoding-time technique only; never tile while
        # training, otherwise the tiled state no longer matches the batch
        # size of the teacher-forced decoder inputs.
        use_beam_search = self.beam_search and self.mode == 'infer'

        encoder_length = self.encoder_length
        if use_beam_search:
            print("use beamsearch decoding..")
            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            encoder_states = tile_batch(encoder_states,
                                        multiplier=self.beam_size)
            encoder_length = tile_batch(encoder_length,
                                        multiplier=self.beam_size)

        attention_mechanism = BahdanauAttention(
            num_units=self.attn_num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_length)

        decoder_cell = create_rnn_cell(self.dec_num_layers,
                                       self.dec_num_units,
                                       self.dec_cell_type)
        decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.dec_num_units,
            name='Attention_Wrapper')

        # The initial state must match the (possibly tiled) encoder tensors.
        if use_beam_search:
            batch_size = self.batch_size * self.beam_size
        else:
            batch_size = self.batch_size
        decoder_initial_state = decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_states)

        output_layer = tf.layers.Dense(self.vocab_size,
                                       use_bias=False,
                                       name='output_projection')

        if self.mode == 'train':
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding, self.decoder_inputs)

            # The training helper feeds the given decoder inputs as the next
            # step's input instead of the decoder's previous output.
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_length,
                name='training_helper')
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)

            self.decoder_logits_train = decoder_outputs.rnn_output
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets,
                weights=self.decoder_targets_masks)

            # Adam update on gradients clipped by global norm.
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clip_gradients, trainable_params))

        elif self.mode == 'infer':
            # The start tokens use the un-tiled batch size; the beam-search
            # decoder handles the beam dimension itself.
            start_tokens = tf.ones([self.batch_size], tf.int32) * SOS_ID
            end_token = EOS_ID

            if use_beam_search:
                inference_decoder = BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_size,
                    output_layer=output_layer)
            else:
                decoding_helper = GreedyEmbeddingHelper(
                    embedding=self.embedding,
                    start_tokens=start_tokens,
                    end_token=end_token)
                inference_decoder = BasicDecoder(
                    cell=decoder_cell,
                    helper=decoding_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)

            decoder_outputs, _, _ = dynamic_decode(
                decoder=inference_decoder,
                maximum_iterations=self.infer_max_iter)

            if use_beam_search:
                # [batch_size, decoder_targets_length, beam_size]
                infer_outputs = decoder_outputs.predicted_ids
                # -> [batch_size, beam_size, decoder_targets_length]
                self.infer_outputs = tf.transpose(infer_outputs, [0, 2, 1])
            else:
                # [batch_size, decoder_targets_length]
                self.infer_outputs = decoder_outputs.sample_id

    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=self.max_to_keep)
def add_prediction_op(self):
    """Build the encoder/attention-decoder graph and return its output.

    The embedding table has ``vocab_size + 2`` rows; index ``vocab_size`` is
    used as the end token and ``vocab_size + 1`` as the start token for
    beam search.

    Returns:
        A tensor named ``"prediction"``:
        * when ``self.config.train`` is truthy, the ``rnn_output`` of a
          teacher-forced ``BasicDecoder``;
        * otherwise, the ``predicted_ids`` of a ``BeamSearchDecoder`` with
          width ``self.config.beam_width``.
    """
    config = self.config
    table_size = config.vocab_size + 2

    # Encoder and decoder inputs share one embedding table ('embed' scope).
    encoder_embed_seq = embed_sequence(
        self.inputs,
        vocab_size=table_size,
        embed_dim=config.embedding_size,
        scope='embed')
    # Decoder input is the label sequence without its last step.
    decoder_input_embed_seq = embed_sequence(
        self.labels[:, :-1],
        vocab_size=table_size,
        embed_dim=config.embedding_size,
        scope='embed',
        reuse=True)
    # Fetch the shared table so beam search can embed its own predictions.
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        BasicLSTMCell(config.num_units, name="encoder"),
        encoder_embed_seq,
        dtype=tf.float32,
        sequence_length=self.lengths,
    )

    if config.train:
        tiled_encoder_outputs = encoder_outputs
        tiled_encoder_final_state = encoder_final_state
        tiled_sequence_length = self.lengths
    else:
        # Beam search: replicate every encoder tensor once per beam.
        tiled_encoder_outputs = tile_batch(
            encoder_outputs, multiplier=config.beam_width)
        tiled_encoder_final_state = tile_batch(
            encoder_final_state, multiplier=config.beam_width)
        tiled_sequence_length = tile_batch(
            self.lengths, multiplier=config.beam_width)

    attention_mechanism = BahdanauAttention(
        num_units=config.num_units,
        memory=tiled_encoder_outputs,
        memory_sequence_length=tiled_sequence_length)
    attn_cell = AttentionWrapper(
        BasicLSTMCell(config.num_units, name="decoder"),
        attention_mechanism,
        # Floor division: a layer size must be an int; ``/`` would produce a
        # float under Python 3.
        attention_layer_size=config.num_units // 2)

    # The initial state must match the (possibly tiled) encoder batch.
    if config.train:
        batch_size = config.batch_size
    else:
        batch_size = config.batch_size * config.beam_width

    decoder_initial_state = attn_cell.zero_state(dtype=tf.float32,
                                                 batch_size=batch_size)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=tiled_encoder_final_state)

    output_layer = tf.layers.Dense(table_size,
                                   use_bias=True,
                                   name='output_projection')

    if config.train:
        # NOTE(review): the helper is given the *input* lengths
        # (self.lengths) as decoder sequence lengths -- confirm that inputs
        # and labels are expected to have matching lengths.
        training_helper = TrainingHelper(inputs=decoder_input_embed_seq,
                                         sequence_length=self.lengths,
                                         name='training_helper')
        decoder = BasicDecoder(cell=attn_cell,
                               helper=training_helper,
                               initial_state=decoder_initial_state,
                               output_layer=output_layer)
        decoder_outputs, _, _ = dynamic_decode(
            decoder=decoder,
            impute_finished=True,
            maximum_iterations=config.max_sequence_length + 1)
        pred_logits = tf.identity(decoder_outputs.rnn_output,
                                  name="prediction")
    else:

        def embed_and_input_proj(inputs):
            """Look up token ids in the shared embedding table."""
            return tf.nn.embedding_lookup(embeddings, inputs)

        # Start tokens use the un-tiled batch size.
        start_tokens = tf.ones([config.batch_size], tf.int32) * (
            config.vocab_size + 1)
        decoder = BeamSearchDecoder(
            cell=attn_cell,
            embedding=embed_and_input_proj,
            start_tokens=start_tokens,
            end_token=config.vocab_size,
            initial_state=decoder_initial_state,
            beam_width=config.beam_width,
            output_layer=output_layer,
        )
        decoder_outputs, _, _ = dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=config.max_sequence_length + 1)
        pred_logits = tf.identity(decoder_outputs.predicted_ids,
                                  name="prediction")

    return pred_logits