return_state=True, dropout=0.4, recurrent_dropout=0.2) decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm( dec_emb, initial_state=[state_h, state_c]) # Attention layer attn_layer = AttentionLayer(name='attention_layer') attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) # Concat attention input and decoder LSTM output decoder_concat_input = Concatenate( axis=-1, name='concat_layer')([decoder_outputs, attn_out]) #dense layer decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax')) decoder_outputs = decoder_dense(decoder_concat_input) # Define the model model = Model([encoder_inputs, decoder_inputs], decoder_outputs) model.summary() model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy') es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2) #Wait for the epochs to get over history = model.fit([x_tr, y_tr[:, :-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:], epochs=50,
def _create_decoder(cells, batch_size, encoder_outputs, encoder_state,
                    encoder_lengths, decoding_inputs, decoding_lengths,
                    embed_matrix, target_vocab_size, scope, max_sequence_size,
                    use_attention=True, softmax_temperature=None):
    """Build training and inference decoders that share variables.

    Constructs (optionally attention-wrapped) decoding over the given RNN
    cells: a teacher-forced training branch fed by `decoding_inputs`, and a
    sampling inference branch started from `GO_ID` and stopped at `EOS_ID`
    (both module-level constants).

    Parameters
    ----------
    cells : RNN cell
        Decoder cell(s); wrapped with Luong attention when `use_attention`.
    batch_size : int or scalar Tensor
        Number of sequences per batch.
    encoder_outputs : Tensor
        Encoder outputs used as attention memory.
    encoder_state : encoder final state
        Used to initialize the decoder state.
    encoder_lengths : Tensor
        Valid lengths of the encoder memory.
    decoding_inputs : Tensor
        Embedded ground-truth inputs for teacher forcing.
    decoding_lengths : Tensor
        Valid lengths of the decoder targets.
    embed_matrix : Tensor
        Embedding table used by the inference sampling helper.
    target_vocab_size : int
        Size of the output projection.
    scope : tf.VariableScope
        Scope whose variables are reused between the two branches.
    max_sequence_size : int
        Cap on decoding iterations for both branches.
    use_attention : bool, optional
        Wrap `cells` with Luong attention when True.
    softmax_temperature : float32, optional
        Values above 1.0 result in more random samples.

    Returns
    -------
    (train_logits, infer_logits)
        Training rnn_output logits and inference sampled token ids.
    """
    from tensorflow.python.layers.core import Dense
    # Output projection
    output_layer = Dense(target_vocab_size, name='output_projection')
    # Setup Attention
    if use_attention:
        attn_mech = tf.contrib.seq2seq.LuongAttention(
            cells.output_size, encoder_outputs, encoder_lengths, scale=True)
        cells = tf.contrib.seq2seq.AttentionWrapper(
            cell=cells,
            attention_mechanism=attn_mech,
            attention_layer_size=cells.output_size,
            alignment_history=False)
        # Zero attention state, then seed the wrapped cell state with the
        # encoder's final state.
        initial_state = cells.zero_state(
            dtype=tf.float32, batch_size=batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)
    # NOTE(review): when use_attention is False, `initial_state` is never
    # assigned and the BasicDecoder calls below would raise NameError —
    # confirm intended usage is always use_attention=True.
    # Setup training a build decoder
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=decoding_inputs,
        sequence_length=decoding_lengths,
        time_major=False)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells, helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    train_logits = tf.identity(train_outputs.rnn_output, name='train_logits')
    # Setup inference and build decoder — reuse the training variables so both
    # branches share weights.
    scope.reuse_variables()
    start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32),
                           [batch_size])
    # Stochastic sampling helper; temperature > 1.0 flattens the softmax.
    helper = tf.contrib.seq2seq.SampleEmbeddingHelper(
        embedding=embed_matrix,
        start_tokens=start_tokens,
        end_token=EOS_ID,
        softmax_temperature=softmax_temperature)
    # helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
    #     embedding=embed_matrix, start_tokens=start_tokens, end_token=EOS_ID)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells, helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    # Inference branch exposes sampled token ids, not logits.
    infer_logits = tf.identity(infer_outputs.sample_id, name='infer_logits')
    return train_logits, infer_logits
# Build the source-side embedding table (words_class entries, dim 130).
embedding_layer = build_embedding(words_class, 130)
embedding_layer_chi = build_embedding(150, 130, name='chi')  # decoder-side word embedding
# LSTM inputs
lstm_inputs = tf.nn.embedding_lookup(embedding_layer, inputs)
# Encoder layer
encoder_layer = build_lstm(lstm_hidden_num, 0.9, 1)
# Initialize the LSTM layer state
initial_state = encoder_layer.zero_state(batch_size, tf.float32)
# Unroll the LSTM dynamically over the time dimension
encoder_lstm_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    encoder_layer, lstm_inputs, dtype=tf.float32)
# Start building the decoder layer
# The first character fed to the decoder is `<BOE>`
encoder_final_c = encoder_final_state[0][0]  # encoder's final cell state c
# Fully-connected layer
F_C_layer = Dense(fully_connect)
#decoder_lstm_inputs = tf.nn.embedding_lookup(embedding_layer_chi, decoder_inputs)
#decoder_layer = build_lstm(lstm_hidden_num, 0.9, 1)
#decoder_layer = tf.contrib.rnn.LSTMCell(100)
#decoder_lstm_outputs, decoder_final_state = tf.nn.dynamic_rnn(decoder_layer, decoder_lstm_inputs, initial_state=\
#        encoder_final_state, scope="plain_decoder")  # gotcha: two dynamic_rnn calls must be distinguished with scopes
#decoder_lstm_outputs, decoder_final_state = inference_layer(decoder_inputs, encoder_final_state, is_inference=False)
# Build the loss function
#decoder_logits = tf.to_float(decoder_logits, name='ToFloat32')
# Decoder construction - for inference
def inference_layer(inputs_infer, initial_state=None, is_inference=True):
    """
def atten_decoder_input_fn(inputs, attention):
    """Merge the decoder input with its attention context.

    Concatenates the two tensors along the feature axis and projects the
    result back to twice the cell state size with a dense layer.
    """
    combined = tf.concat([inputs, attention], 1)
    projection = Dense(self.state_size * 2)
    return projection(combined)
def create_model(self):
    """Build the topic-aware seq2seq graph.

    Defines placeholders, an embedding table fed from a placeholder,
    bidirectional multi-layer GRU encoders for the content sequence and the
    topic words, a topic "summarizer" MLP, a custom attention decoder run
    with a manual while_loop for training, and a manually-stepped beam-search
    decoder for prediction, followed by the sequence loss and Adam train op.

    Reads instance config: self.vocab_size, self.embedding_size,
    self.num_layers, self.K (number of topic words), self.summarizer_size.
    Exposes: self.training_logits, self.predicting_logits, self.cost,
    self.train_op, plus the input placeholders.
    """
    # ----- input placeholders ------------------------------------------
    self.encoder_input = tf.placeholder(tf.int32, [None, None],
                                        name='encoder_input')
    self.encoder_input_lengths = tf.placeholder(
        tf.int32, [None], name='encoder_input_lengths')
    self.dropout_kp = tf.placeholder(tf.float32, name='dropout_kp')
    # GO
    self.decoder_input = tf.placeholder(tf.int32, [None, None],
                                        name='decoder_input')
    # EOS
    self.decoder_target = tf.placeholder(tf.int32, [None, None],
                                         name='decoder_target')
    self.decoder_input_lengths = tf.placeholder(
        tf.int32, [None], name='decoder_input_lengths')
    self.max_decoder_sequence_length = tf.reduce_max(
        self.decoder_input_lengths, name='max_decoder_sequence_length')
    self.max_encoder_sequence_length = tf.reduce_max(
        self.encoder_input_lengths, name='max_encoder_sequence_length')
    self.topic_words = tf.placeholder(tf.int32, [None, None],
                                      name='topic_words')

    # ----- embedding table, assigned from a feedable placeholder -------
    with tf.device('/cpu:0'), tf.name_scope('embedding'):
        W = tf.Variable(tf.constant(
            0., shape=[self.vocab_size, self.embedding_size]), name="W")
        self.embedding_placeholder = tf.placeholder(
            tf.float32, [self.vocab_size, self.embedding_size],
            name='embedding_placeholder')
        # Lookups below depend on this assign op, so the table is refreshed
        # from the placeholder whenever these ops run.
        embeding_init = W.assign(self.embedding_placeholder)
        encoder_embedded_inputs = tf.nn.embedding_lookup(
            embeding_init, self.encoder_input)
        decoder_embedded_input = tf.nn.embedding_lookup(
            embeding_init, self.decoder_input)
        topic_words_embedded = tf.nn.embedding_lookup(
            embeding_init, self.topic_words)

    # ----- bidirectional GRU encoder over the content sequence ---------
    with tf.variable_scope('content_encoder'):
        fw_encoder_cells = []
        for _ in range(self.num_layers):
            cell = tf.contrib.rnn.GRUCell(self.embedding_size)
            fw_encoder_wraped_cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout_kp)
            fw_encoder_cells.append(fw_encoder_wraped_cell)
        fw_encoder_cell = tf.contrib.rnn.MultiRNNCell(fw_encoder_cells)
        bw_encoder_cells = []
        for _ in range(self.num_layers):
            cell = tf.contrib.rnn.GRUCell(self.embedding_size)
            bw_encoder_wraped_cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout_kp)
            bw_encoder_cells.append(bw_encoder_wraped_cell)
        bw_encoder_cell = tf.contrib.rnn.MultiRNNCell(bw_encoder_cells)
        ((content_output_fw, content_output_bw),
         (content_output_state_fw, content_output_state_bw)) = \
            tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_encoder_cell,
                                            cell_bw=bw_encoder_cell,
                                            inputs=encoder_embedded_inputs,
                                            dtype=tf.float32)
        # fw/bw outputs concatenated on the feature axis.
        content_outputs = tf.concat([content_output_fw, content_output_bw],
                                    axis=-1)
        # NOTE(review): the squeeze over axis 0 assumes num_layers == 1 —
        # confirm, it would fail for deeper stacks.
        content_state = tf.squeeze(tf.concat(
            [content_output_state_fw, content_output_state_bw], axis=-1),
            axis=0)

    # ----- bidirectional GRU encoder over the topic words --------------
    with tf.variable_scope('topic_encoder'):
        topic_fw_encoder_cells = []
        for _ in range(self.num_layers):
            cell = tf.contrib.rnn.GRUCell(self.embedding_size)
            topic_fw_encoder_wraped_cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout_kp)
            topic_fw_encoder_cells.append(topic_fw_encoder_wraped_cell)
        topic_fw_encoder_cell = tf.contrib.rnn.MultiRNNCell(
            topic_fw_encoder_cells)
        topic_bw_encoder_cells = []
        for _ in range(self.num_layers):
            cell = tf.contrib.rnn.GRUCell(self.embedding_size)
            topic_bw_encoder_wraped_cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout_kp)
            topic_bw_encoder_cells.append(topic_bw_encoder_wraped_cell)
        topic_bw_encoder_cell = tf.contrib.rnn.MultiRNNCell(
            topic_bw_encoder_cells)
        # num_topic_words = tf.tile(tf.constant([self.K], dtype=tf.int32), [tf.shape(self.topic_words)[0]])
        ((topic_output_fw, topic_output_bw),
         (topic_output_state_fw, topic_output_state_bw)) = \
            tf.nn.bidirectional_dynamic_rnn(cell_fw=topic_fw_encoder_cell,
                                            cell_bw=topic_bw_encoder_cell,
                                            inputs=topic_words_embedded,
                                            dtype=tf.float32)
        topic_outputs = tf.concat([topic_output_fw, topic_output_bw],
                                  axis=-1)

    # ----- topic summarizer: one tanh layer over all topic embeddings --
    with tf.variable_scope("topic_summarizer"):
        topic_words_embedded_flatten = tf.reshape(
            topic_words_embedded, [-1, self.K * self.embedding_size])
        summarizer_W = tf.get_variable(
            name='summarizer_W',
            shape=[self.K * self.embedding_size, self.summarizer_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(mean=0.0,
                                                        stddev=0.1))
        summarizer_b = tf.Variable(tf.constant(
            0.1, dtype=tf.float32, shape=[self.summarizer_size]),
            name='summarizer_b')
        summarizer_vector = tf.tanh(
            tf.nn.xw_plus_b(topic_words_embedded_flatten, summarizer_W,
                            summarizer_b))

    # ----- decoder with dual (content + topic) attention ---------------
    with tf.variable_scope('decoder') as decoder:
        decoder_cells = []
        for _ in range(self.num_layers):
            cell = tf.contrib.rnn.GRUCell(self.embedding_size)
            decoder_wraped_cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=self.dropout_kp)
            decoder_cells.append(decoder_wraped_cell)
        decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cells)
        # Vocab-distribution head; sigmoid activation, not softmax.
        output_layer = Dense(
            self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1),
            activation=tf.nn.sigmoid)
        # Projects the encoder state into the decoder hidden size.
        state_layer = Dense(
            self.embedding_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))
        self.decoder_outputs_array = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=self.max_decoder_sequence_length,
            dynamic_size=False, infer_shape=True)
        attention_size = 10

        def content_score_mlp(hidden_state):
            # Additive (Bahdanau-style) scores over content encoder outputs,
            # conditioned on the hidden state and the topic summary.
            content_score_W_1 = tf.get_variable(
                name='content_score_W_1',
                shape=[self.embedding_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            content_score_W_2 = tf.get_variable(
                name='content_score_W_2',
                shape=[2 * self.embedding_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            content_score_W_3 = tf.get_variable(
                name='content_score_W_3',
                shape=[self.summarizer_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            content_score_v = tf.get_variable(
                name='content_score_v', shape=[attention_size, 1],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            addition = tf.tanh(
                tf.matmul(hidden_state, content_score_W_1) +
                tf.transpose(tf.tensordot(
                    content_outputs, content_score_W_2, axes=[[2], [0]]),
                    perm=[1, 0, 2]) +
                tf.matmul(summarizer_vector, content_score_W_3))
            addition = tf.transpose(addition, perm=[1, 0, 2])
            weight = tf.tensordot(addition, content_score_v,
                                  axes=[[2], [0]])
            return weight

        def topic_score_mlp(hidden_state):
            # Same scoring scheme over topic encoder outputs, conditioned on
            # the last content encoder output instead of the summarizer.
            topic_score_W_1 = tf.get_variable(
                name='topic_score_W_1',
                shape=[self.embedding_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            topic_score_W_2 = tf.get_variable(
                name='topic_score_W_2',
                shape=[2 * self.embedding_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            topic_score_W_3 = tf.get_variable(
                name='topic_score_W_3',
                shape=[2 * self.embedding_size, attention_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            topic_score_v = tf.get_variable(
                name='topic_score_v', shape=[attention_size, 1],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            addition = tf.tanh(
                tf.matmul(hidden_state, topic_score_W_1) +
                tf.transpose(tf.tensordot(
                    topic_outputs, topic_score_W_2, axes=[[2], [0]]),
                    perm=[1, 0, 2]) +
                tf.matmul(content_outputs[:, -1, :], topic_score_W_3))
            addition = tf.transpose(addition, perm=[1, 0, 2])
            weight = tf.tensordot(addition, topic_score_v, axes=[[2], [0]])
            return weight

        decoder_state_size = 300

        def get_overall_state(hidden_state):
            # Combine hidden state with both attention context vectors into
            # the next decoder state (a single affine combination).
            content_weights = content_score_mlp(hidden_state)
            topic_weights = topic_score_mlp(hidden_state)
            content_attention_output = tf.reduce_sum(content_outputs *
                                                     content_weights, axis=1)
            topic_attention_output = tf.reduce_sum(topic_outputs *
                                                   topic_weights, axis=1)
            state_W = tf.get_variable(
                name='state_W',
                shape=[self.embedding_size, decoder_state_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            content_attention_W = tf.get_variable(
                name='content_attention_W',
                shape=[2 * self.embedding_size, decoder_state_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            topic_attention_W = tf.get_variable(
                name='topic_attention_W',
                shape=[2 * self.embedding_size, decoder_state_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.1))
            decoder_b = tf.get_variable(
                name='decoder_b', shape=[decoder_state_size],
                dtype=tf.float32,
                initializer=tf.constant_initializer(0.1))
            # NOTE(review): decoder_state_size (300) must equal the GRU size
            # for states[0] to be fed back below — confirm embedding_size==300.
            overall_state = tf.matmul(hidden_state, state_W) + \
                tf.matmul(content_attention_output, content_attention_W) + \
                tf.matmul(topic_attention_output, topic_attention_W) + \
                decoder_b
            return overall_state

        training_initial_state = state_layer(content_state)

        def training_decode(i, hidden_state, decoder_outputs_array):
            # One teacher-forced step: recompute attention-mixed state, run
            # the GRU on the i-th gold input, store the projected output.
            overall_state = get_overall_state(hidden_state)
            cell_outputs, states = decoder_cell(
                decoder_embedded_input[:, i, :], (overall_state, ))
            outputs = output_layer(cell_outputs)
            decoder_outputs_array = decoder_outputs_array.write(i, outputs)
            return i + 1, states[0], decoder_outputs_array

        _, _, self.decoder_outputs_array = control_flow_ops.while_loop(
            cond=lambda i, _1, _2: i < self.max_decoder_sequence_length,
            body=training_decode,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       training_initial_state,
                       self.decoder_outputs_array))
        # [time, batch, vocab] -> [batch, time, vocab]
        training_decoder_output = tf.transpose(
            self.decoder_outputs_array.stack(), perm=[1, 0, 2])

    beam_width = 5
    # ----- manually-stepped beam-search decoder (reuses the variables) --
    with tf.variable_scope(decoder, reuse=True):
        def get_final_state(state):
            # Re-apply the attention state mix to each beam hypothesis and
            # rebuild the BeamSearchDecoderState around the result.
            final_state = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                       size=beam_width,
                                                       dynamic_size=False,
                                                       infer_shape=True)
            state_array = tf.unstack(state.cell_state[0], num=beam_width,
                                     axis=1)
            for i in range(beam_width):
                final_state = final_state.write(
                    i, get_overall_state(state_array[i]))
            final_state = tf.transpose(final_state.stack(), perm=[1, 0, 2])
            new_state = tf.contrib.seq2seq.BeamSearchDecoderState(
                (final_state, ), state.log_probs, state.finished,
                state.lengths)
            return new_state

        start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32),
                               [tf.shape(content_state)[0]])
        overall_state = get_overall_state(state_layer(content_state))
        decoder_initial_state = tf.contrib.seq2seq.tile_batch(
            (overall_state, ), multiplier=beam_width)
        beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=embeding_init,
            start_tokens=start_tokens,
            end_token=EOS_ID,
            initial_state=decoder_initial_state,
            beam_width=beam_width,
            output_layer=output_layer)
        # Accumulators for per-step beam outputs.
        predicted_ids = tensor_array_ops.TensorArray(
            dtype=tf.int32, size=self.max_decoder_sequence_length,
            dynamic_size=False, infer_shape=True)
        parent_ids = tensor_array_ops.TensorArray(
            dtype=tf.int32, size=self.max_decoder_sequence_length,
            dynamic_size=False, infer_shape=True)
        scores = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=self.max_decoder_sequence_length,
            dynamic_size=False, infer_shape=True)
        initial_finished, initial_inputs, initial_state = \
            beam_search_decoder.initialize()
        initial_final_state = get_final_state(initial_state)
        initial_sequence_lengths = array_ops.zeros_like(initial_finished,
                                                        dtype=tf.int32)
        num_decoder_output = tf.identity(self.max_decoder_sequence_length)

        def predicting_decode(i, input_data, hidden_state, predicted_ids,
                              parent_ids, sequence_lengths, finished,
                              scores):
            # One beam-search step, with manual finished/length bookkeeping
            # (mirrors what dynamic_decode would do internally).
            outputs, next_state, next_inputs, decoder_finished = \
                beam_search_decoder.step(i, input_data, hidden_state)
            next_finished = math_ops.logical_or(decoder_finished, finished)
            next_finished = math_ops.logical_or(
                next_finished, i + 1 >= num_decoder_output)
            next_sequence_lengths = array_ops.where(
                math_ops.logical_and(math_ops.logical_not(finished),
                                     next_finished),
                array_ops.fill(array_ops.shape(sequence_lengths), i + 1),
                sequence_lengths)
            states = get_final_state(next_state)
            predicted_ids = predicted_ids.write(i, outputs.predicted_ids)
            parent_ids = parent_ids.write(i, outputs.parent_ids)
            scores = scores.write(i, outputs.scores)
            return i + 1, next_inputs, states, predicted_ids, parent_ids, \
                next_sequence_lengths, next_finished, scores

        _, _next_inputs, _states, predicted_ids, parent_ids, \
            sequence_lengths, finished, scores = control_flow_ops.while_loop(
                cond=lambda i, _1, _2, _3, _4, _5, _6, _7:
                    i < self.max_decoder_sequence_length,
                body=predicting_decode,
                loop_vars=(tf.constant(0, dtype=tf.int32), initial_inputs,
                           initial_final_state, predicted_ids, parent_ids,
                           initial_sequence_lengths, initial_finished,
                           scores))
        predicted_ids = predicted_ids.stack()
        parent_ids = parent_ids.stack()
        scores = scores.stack()
        # Trace back through parent_ids to recover full beam hypotheses.
        final_outputs_instance = tf.contrib.seq2seq.BeamSearchDecoderOutput(
            scores, predicted_ids, parent_ids)
        final_outputs, final_state = beam_search_decoder.finalize(
            final_outputs_instance, _states, sequence_lengths)

    # ----- loss and training op ----------------------------------------
    self.training_logits = tf.identity(training_decoder_output,
                                       name='training_logits')
    self.predicting_logits = tf.identity(final_outputs.predicted_ids,
                                         name='predicting_logits')
    masks = tf.sequence_mask(self.decoder_input_lengths,
                             self.max_decoder_sequence_length,
                             dtype=tf.float32, name='masks')
    self.cost = tf.contrib.seq2seq.sequence_loss(self.training_logits,
                                                 self.decoder_target,
                                                 masks)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    # Clip each gradient element to [-5, 5] before applying.
    gradients = optimizer.compute_gradients(self.cost)
    capped_gradients = [(tf.clip_by_value(grad, -5.0, 5.0), var)
                        for grad, var in gradients if grad is not None]
    self.train_op = optimizer.apply_gradients(capped_gradients)
def build_decoder(self):
    """Build training, validation, and inference decoders.

    All three branches share one LayerNorm LSTM cell and output projection,
    and are conditioned on a latent vector (z_tilda for training/validation,
    z_sampled for inference) via a project-local `basic_decoder` variant.

    Exposes: self.training_logits, self.validate_sent (greedy token ids from
    z_tilda), self.inference_logits (greedy token ids from z_sampled).
    """
    with tf.variable_scope("decode"):
        # NOTE(review): only the cell built in the LAST loop iteration is
        # used below — earlier layers are created and discarded; confirm the
        # multi-layer stack was intended.
        for layer in range(self.num_layers):
            with tf.variable_scope('decoder_{}'.format(layer + 1)):
                dec_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                    2 * self.lstm_hidden_units)
                dec_cell = tf.contrib.rnn.DropoutWrapper(
                    dec_cell, input_keep_prob=self.keep_prob)
        # Projection from cell output to vocabulary logits.
        self.output_layer = Dense(self.vocab_size)
        self.init_state = dec_cell.zero_state(self.batch_size, tf.float32)
        with tf.name_scope("training_decoder"):
            # Teacher forcing over the embedded target sentence.
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.dec_embed_input,
                sequence_length=self.target_sentence_length,
                time_major=False)
            training_decoder = basic_decoder.BasicDecoder(
                dec_cell, training_helper,
                initial_state=self.init_state,
                latent_vector=self.z_tilda,
                output_layer=self.output_layer)
            self.training_logits, _state, _len = \
                tf.contrib.seq2seq.dynamic_decode(
                    training_decoder, output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.num_tokens)
            self.training_logits = tf.identity(
                self.training_logits.rnn_output, 'logits')
        with tf.name_scope("validate_decoder"):
            # Greedy decoding from GO to EOS, conditioned on z_tilda.
            start_token = self.word_index['GO']
            end_token = self.word_index['EOS']
            start_tokens = tf.tile(
                tf.constant([start_token], dtype=tf.int32),
                [self.batch_size], name='start_tokens')
            inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.embeddings, start_tokens, end_token)
            inference_decoder = basic_decoder.BasicDecoder(
                dec_cell, inference_helper,
                initial_state=self.init_state,
                latent_vector=self.z_tilda,
                output_layer=self.output_layer)
            self.validate_logits, _state, _len = \
                tf.contrib.seq2seq.dynamic_decode(
                    inference_decoder, output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.num_tokens)
            self.validate_sent = tf.identity(self.validate_logits.sample_id,
                                             name='predictions')
        with tf.name_scope("inference_decoder"):
            # Greedy decoding conditioned on the sampled latent z_sampled.
            start_token = self.word_index['GO']
            end_token = self.word_index['EOS']
            start_tokens = tf.tile(
                tf.constant([start_token], dtype=tf.int32),
                [self.batch_size], name='start_tokens')
            inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.embeddings, start_tokens, end_token)
            inference_decoder = basic_decoder.BasicDecoder(
                dec_cell, inference_helper,
                initial_state=self.init_state,
                latent_vector=self.z_sampled,
                output_layer=self.output_layer)
            self.inference_logits, _state, _len = \
                tf.contrib.seq2seq.dynamic_decode(
                    inference_decoder, output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.num_tokens)
            self.inference_logits = tf.identity(
                self.inference_logits.sample_id, name='predictions')
scope="encoder_rnn") decoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="decoder_inputs") decoder_labels = tf.placeholder(tf.int32, shape=[None, None], name="decoder_labels") decoder_lengths = tf.placeholder(tf.int32, shape=[None], name="decoder_lengths") decoder_emb = tf.nn.embedding_lookup(embedding, decoder_inputs) helper = seq2seq.TrainingHelper(decoder_emb, decoder_lengths) projection = Dense(embedding.shape[0], use_bias=False) decoder_cell = GRUCell(encoder_cell.state_size) decoder = seq2seq.BasicDecoder(decoder_cell, helper, encoder_state, output_layer=projection) decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder, scope="decoder") decoder_outputs = decoder_outputs.rnn_output question_mask = tf.sequence_mask(decoder_lengths, dtype=tf.float32) question_loss = seq2seq.sequence_loss(logits=decoder_outputs, targets=decoder_labels, weights=question_mask,
def attn_decoder_input_fn(inputs, attention):
    """Fuse the decoder input with its attention context.

    Joins the two tensors on the last axis and feeds the result through a
    dense projection of size `num_nodes` (input-feeding).
    """
    fused = array_ops.concat([inputs, attention], -1)
    feeding_layer = Dense(num_nodes, name='attn_input_feeding',
                          dtype=tf.float32)
    return feeding_layer(fused)
def _model(self, features, labels, mode, params):
    """Estimator model_fn for an attention seq2seq QA model.

    Embeds question/answer token sequences (shared 'embed' scope), encodes
    the question with a bidirectional RNN, and decodes answers with a
    Bahdanau-attention GRU stack — teacher-forced for training, greedy for
    prediction. Returns an EstimatorSpec for PREDICT or TRAIN/EVAL.

    Parameters: `features` must carry 'question_seq' and 'answer_seq' int
    tensors; `params` must carry model_size, num_layers, keep_prob,
    vocab_size, embedding_size. `labels` is unused (targets come from the
    shifted answer sequence).
    """
    question_sequence = features['question_seq']
    answer_sequence = features['answer_seq']
    batch_size = tf.shape(question_sequence)[0]
    start_token = tf.ones([1], tf.int32)
    model_size = params["model_size"]
    num_layers = params["num_layers"]
    keep_prob = params["keep_prob"]
    vocab_size = params["vocab_size"]
    embedding_size = params["embedding_size"]
    # Lengths = count of non-PAD positions per sequence.
    question_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(question_sequence, self.vocabs["<PAD>"])),
        1)
    answer_lengths = tf.reduce_sum(
        tf.to_int32(tf.not_equal(answer_sequence, self.vocabs["<PAD>"])), 1)
    # Shared embedding for questions and answers (same 'embed' scope).
    question_embed = layers.embed_sequence(question_sequence,
                                           vocab_size=vocab_size,
                                           embed_dim=embedding_size,
                                           scope='embed')
    answer_embed = layers.embed_sequence(answer_sequence,
                                         vocab_size=vocab_size,
                                         embed_dim=embedding_size,
                                         scope='embed', reuse=True)
    # Grab the raw embedding matrix for the greedy decoding helper.
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')
    # Forward encoder: multi-layer GRU with dropout.
    fcells = []
    for i in range(num_layers):
        c = tf.nn.rnn_cell.GRUCell(model_size)
        c = tf.nn.rnn_cell.DropoutWrapper(c, input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
        fcells.append(c)
    # I cant figure out how to use tuple version.
    fcell = tf.nn.rnn_cell.MultiRNNCell(fcells)
    #bcells = []
    #for i in range(num_layers):
    #    c = tf.nn.rnn_cell.GRUCell(model_size)
    #    c = tf.nn.rnn_cell.DropoutWrapper(c, input_keep_prob=keep_prob,
    #                                      output_keep_prob=keep_prob)
    #    bcells.append(c)  # I cant figure out how to use tuple version.
    #bcell = tf.nn.rnn_cell.MultiRNNCell(bcells)
    # Backward encoder is a single GRU layer (multi-layer version disabled
    # above).
    bcell = tf.contrib.rnn.GRUCell(num_units=model_size)
    #icell = tf.contrib.rnn.GRUCell(num_units=model_size)
    encoder_outputs, encoder_final_state = tf.nn.bidirectional_dynamic_rnn(
        fcell, bcell, question_embed, sequence_length=question_lengths,
        dtype=tf.float32)
    # helpers
    train_helper = tf.contrib.seq2seq.TrainingHelper(answer_embed,
                                                     answer_lengths,
                                                     time_major=False)
    start_tokens = tf.tile(tf.constant([self.vocabs['<START>']],
                                       dtype=tf.int32), [batch_size],
                           name='start_tokens')
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=start_tokens,
        end_token=self.vocabs["<EOS>"])
    # rnn cell and dense layer
    # NOTE(review): this single GRUCell is immediately shadowed by the
    # MultiRNNCell assigned to `cell` below and never used.
    cell = tf.contrib.rnn.GRUCell(num_units=model_size)
    cells = []
    for i in range(num_layers):
        c = tf.nn.rnn_cell.GRUCell(model_size)
        c = tf.nn.rnn_cell.DropoutWrapper(c, input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
        cells.append(c)
    # I cant figure out how to use tuple version.
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    projection_layer = Dense(
        units=vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))

    # deocder in seq2seq model. For this case we don't have an encoder.
    def decode(helper, scope, output_max_length, reuse=None):
        # Build one attention decoder branch; `reuse=True` shares variables
        # between the training and prediction branches.
        with tf.variable_scope(scope, reuse=reuse):
            # Attend over the forward encoder outputs only
            # (encoder_outputs[0]).
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=model_size, memory=encoder_outputs[0],
                memory_sequence_length=question_lengths)
            #cell = tf.contrib.rnn.GRUCell(num_units=model_size)
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism, attention_layer_size=model_size)
            #out_cell = tf.contrib.rnn.OutputProjectionWrapper(
            #    attn_cell, vocab_size, reuse=reuse
            #)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attn_cell, helper=helper,
                initial_state=attn_cell.zero_state(dtype=tf.float32,
                                                   batch_size=batch_size),
                #initial_state=encoder_final_state,
                output_layer=projection_layer)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True,
                maximum_iterations=output_max_length)
            return outputs[0]

    train_outputs = decode(train_helper, 'decode', 3000)
    pred_outputs = decode(pred_helper, 'decode', 300, reuse=True)
    # Targets are the answer shifted left by one (drop <START>).
    targets = answer_sequence[:, 1:]
    probs = tf.nn.softmax(pred_outputs.rnn_output, name="probs")
    # in case in prediction mode return
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions={
            "probs": probs,
            "syms": pred_outputs.sample_id})
    # mask the PADs
    mask = tf.to_float(
        tf.not_equal(answer_sequence[:, :-1], self.vocabs["<PAD>"]))
    #tf.identity(mask[0], name='mask')
    #tf.identity(targets[0], name='targets')
    #tf.identity(train_outputs.rnn_output[0,output_lengths[0]-2:output_lengths[0],:], name='rnn_out')
    # Loss function
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output[:, :-1, :], targets, mask)
    tf.summary.scalar("loss", loss)
    # Optimizer — exponential decay from 0.001, step 100, rate 0.99.
    learning_rate = tf.Variable(0.0, trainable=False)
    initial_learning_rate = tf.constant(0.001)
    learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                               tf.train.get_global_step(),
                                               100, 0.99)
    tf.summary.scalar("learning_rate", learning_rate)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 5.0)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Visualise gradients
    vis_grads = [0 if i is None else i for i in grads]
    for g in vis_grads:
        tf.summary.histogram("gradients_" + str(g), g)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=tf.train.get_global_step())
    # Named identities for logging hooks.
    tf.identity(question_sequence[0], name="train_input")
    tf.identity(train_outputs.sample_id[0], name='train_pred')
    tf.identity(pred_outputs.sample_id[0], name='predictions')
    return tf.estimator.EstimatorSpec(mode=mode, predictions=None,
                                      loss=loss, train_op=train_op)
def __init__(self, encoder_vocab_size, decoder_vocab_size, rnn_size=128,
             layer_size=2, embedding_dim=200, grad_clip=5,
             is_inference=False):
    """Build an LSTM encoder-decoder graph.

    FIX: the original signature placed the required `encoder_vocab_size` /
    `decoder_vocab_size` after defaulted parameters, which is a SyntaxError
    in Python — they are moved to the front. The Python-2 `print`
    statements were replaced with `print()` calls, and the inference-branch
    softmax now reads `logits.rnn_output` (dynamic_decode returns a
    BasicDecoderOutput namedtuple, not a tensor).

    Parameters
    ----------
    encoder_vocab_size : int
        Source vocabulary size (embedding table rows).
    decoder_vocab_size : int
        Target vocabulary size (embedding rows and output projection).
    rnn_size : int, optional
        Hidden units per LSTM layer.
    layer_size : int, optional
        Number of stacked LSTM layers.
    embedding_dim : int, optional
        Embedding dimensionality for both vocabularies.
    grad_clip : float, optional
        Global-norm gradient clipping threshold (training mode only).
    is_inference : bool, optional
        Build the greedy-decoding graph instead of the training graph.
    """
    # define inputs
    self.input_x = tf.placeholder(tf.int32, shape=[None, None],
                                  name='input_ids')

    # define embedding layer
    with tf.variable_scope('embedding'):
        encoder_embedding = tf.Variable(
            tf.truncated_normal(shape=[encoder_vocab_size, embedding_dim],
                                stddev=0.1),
            name='encoder_embedding')
        decoder_embedding = tf.Variable(
            tf.truncated_normal(shape=[decoder_vocab_size, embedding_dim],
                                stddev=0.1),
            name='decoder_embedding')

    # define encoder
    with tf.variable_scope('encoder'):
        encoder = self._get_simple_lstm(rnn_size, layer_size)
        # Embedding lookup pinned to CPU.
        with tf.device('/cpu:0'):
            input_x_embedded = tf.nn.embedding_lookup(encoder_embedding,
                                                      self.input_x)
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder, input_x_embedded, dtype=tf.float32)

    # define helper for decoder
    if is_inference:
        # Greedy decoding from fed start tokens until the end token.
        self.start_tokens = tf.placeholder(tf.int32, shape=[None],
                                           name='start_tokens')
        self.end_token = tf.placeholder(tf.int32, name='end_token')
        helper = GreedyEmbeddingHelper(decoder_embedding,
                                       self.start_tokens, self.end_token)
    else:
        # Teacher forcing on the embedded target sequence.
        self.target_ids = tf.placeholder(tf.int32, shape=[None, None],
                                         name='target_ids')
        self.decoder_seq_length = tf.placeholder(tf.int32, shape=[None],
                                                 name='batch_seq_length')
        with tf.device('/cpu:0'):
            target_embeddeds = tf.nn.embedding_lookup(decoder_embedding,
                                                      self.target_ids)
        helper = TrainingHelper(target_embeddeds, self.decoder_seq_length)

    with tf.variable_scope('decoder'):
        fc_layer = Dense(decoder_vocab_size)
        decoder_cell = self._get_simple_lstm(rnn_size, layer_size)
        decoder = BasicDecoder(decoder_cell, helper, encoder_state,
                               fc_layer)

    logits, final_state, final_sequence_lengths = dynamic_decode(decoder)

    if not is_inference:
        targets = tf.reshape(self.target_ids, [-1])
        logits_flat = tf.reshape(logits.rnn_output,
                                 [-1, decoder_vocab_size])
        print('shape logits_flat:{}'.format(logits_flat.shape))
        print('shape logits:{}'.format(logits.rnn_output.shape))
        self.cost = tf.losses.sparse_softmax_cross_entropy(targets,
                                                           logits_flat)
        # define train op
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          grad_clip)
        optimizer = tf.train.AdamOptimizer(1e-3)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    else:
        # FIX: softmax over the rnn_output tensor — `logits` itself is a
        # BasicDecoderOutput namedtuple.
        self.prob = tf.nn.softmax(logits.rnn_output)
tf.contrib.rnn.LSTMStateTuple(c=encoder_last_state_c, h=encoder_last_state_h)) encoder_last_state = tuple(encoder_last_state) #batch_size = batch_size * beam_width ######################################################### ends building encoder # building training decoder, no beam search with tf.variable_scope('shared_attention_mechanism'): attention_mechanism = seq2seq.BahdanauAttention( num_units=hidden_dim * 2, memory=encoder_outputs, memory_sequence_length=encoder_inputs_length) global_decoder_cell = tf.contrib.rnn.MultiRNNCell([ tf.nn.rnn_cell.BasicLSTMCell(hidden_dim * 2) for _ in range(num_layers) ]) projection_layer = Dense(label_dim) decoder_cell = seq2seq.AttentionWrapper( cell=global_decoder_cell, #tf.nn.rnn_cell.BasicLSTMCell(hidden_dim*2), attention_mechanism=attention_mechanism, attention_layer_size=hidden_dim * 2) #input_vectors = tf.nn.embedding_lookup(tgt_w, decoder_inputs) print(decoder_inputs.shape, decoder_inputs.shape) #decoder training training_helper = seq2seq.TrainingHelper( inputs=decoder_inputs_train, sequence_length=tf.tile(tf.constant([15], dtype=tf.int32), [batch_size]), #decoder_inputs_length_train, time_major=False) #print(decoder_cell.zero_state(batch_size,tf.float32))
def __init__(self, data, args, embed):
    """Build the full TF1 graph of a sentence VAE: GRU encoder, Gaussian
    recognition network with reparameterization, GRU decoder (teacher-forced
    branch for the loss + greedy branch for generation), ELBO loss with KL
    annealing, a momentum training op, and checkpoint savers.

    data : dataset wrapper; this code reads `vocab_size`, `go_id`, `eos_id`.
    args : hyper-parameter namespace; this code reads `embedding_size`, `lr`,
        `lr_decay`, `eh_size`, `dh_size`, `z_dim`, `full_kl_step`, `min_kl`,
        `momentum`, `grad_clip`, `name`, `checkpoint_max_to_keep`.
    embed : pre-trained embedding matrix used as initializer, or None to
        learn the embedding table from scratch.
    """
    with tf.variable_scope("input"):
        with tf.variable_scope("embedding"):
            # build the embedding table and embedding input
            if embed is None:
                # initialize the embedding randomly
                self.embed = tf.get_variable(
                    'embed', [data.vocab_size, args.embedding_size], tf.float32)
            else:
                # initialize the embedding by pre-trained word vectors
                self.embed = tf.get_variable(
                    'embed', dtype=tf.float32, initializer=embed)

        self.sentence = tf.placeholder(tf.int32, (None, None), 'sen_inps')  # batch*len
        self.sentence_length = tf.placeholder(tf.int32, (None,), 'sen_lens')  # batch
        # run-time switch: sample the latent from the prior (generation)
        # or from the recognition network (training/reconstruction)
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        batch_size, batch_len = tf.shape(self.sentence)[0], tf.shape(self.sentence)[1]
        self.decoder_max_len = batch_len - 1

        self.encoder_input = tf.nn.embedding_lookup(self.embed, self.sentence)  # batch*len*unit
        self.encoder_len = self.sentence_length

        # One-step-shifted LM setup: decoder input drops the last token
        # (no eos), target drops the first token (no go).
        decoder_input = tf.split(self.sentence, [self.decoder_max_len, 1], 1)[0]  # no eos_id
        self.decoder_input = tf.nn.embedding_lookup(self.embed, decoder_input)  # batch*(len-1)*unit
        self.decoder_target = tf.split(self.sentence, [1, self.decoder_max_len], 1)[1]  # no go_id, batch*(len-1)
        self.decoder_len = self.sentence_length - 1
        self.decoder_mask = tf.sequence_mask(
            self.decoder_len, self.decoder_max_len, dtype=tf.float32)  # batch*(len-1)

    # initialize the training process
    self.learning_rate = tf.Variable(float(args.lr), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)

    # build rnn_cell
    cell_enc = tf.nn.rnn_cell.GRUCell(args.eh_size)
    cell_dec = tf.nn.rnn_cell.GRUCell(args.dh_size)

    # build encoder
    with tf.variable_scope('encoder'):
        encoder_output, encoder_state = dynamic_rnn(
            cell_enc, self.encoder_input, self.encoder_len,
            dtype=tf.float32, scope="encoder_rnn")

    with tf.variable_scope('recognition_net'):
        recog_input = encoder_state
        # diagonal-Gaussian posterior q(z|x): mu and log-variance heads
        self.recog_mu = tf.layers.dense(
            inputs=recog_input, units=args.z_dim, activation=None, name='recog_mu')
        self.recog_logvar = tf.layers.dense(
            inputs=recog_input, units=args.z_dim, activation=None, name='recog_logvar')
        # reparameterization trick: z = mu + sigma * eps
        epsilon = tf.random_normal(tf.shape(self.recog_logvar), name="epsilon")
        std = tf.exp(0.5 * self.recog_logvar)
        self.recog_z = tf.add(self.recog_mu, tf.multiply(std, epsilon), name='recog_z')
        # closed-form KL( q(z|x) || N(0, I) ), mean over the batch
        self.kld = tf.reduce_mean(
            0.5 * tf.reduce_sum(
                tf.exp(self.recog_logvar) + self.recog_mu * self.recog_mu
                - self.recog_logvar - 1,
                axis=-1))
        self.prior_z = tf.random_normal(tf.shape(self.recog_logvar), name="prior_z")
        latent_sample = tf.cond(self.use_prior, lambda: self.prior_z,
                                lambda: self.recog_z, name='latent_sample')
        # map the latent sample to the decoder's initial hidden state
        dec_init_state = tf.layers.dense(
            inputs=latent_sample, units=args.dh_size, activation=None)

    with tf.variable_scope("output_layer", initializer=tf.orthogonal_initializer()):
        self.output_layer = Dense(
            data.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
            use_bias=True)

    # teacher-forced decoding for the reconstruction loss
    with tf.variable_scope("decode", initializer=tf.orthogonal_initializer()):
        train_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=self.decoder_input,
            sequence_length=self.decoder_len
        )
        train_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell_dec,
            helper=train_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer
        )
        train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=train_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True
        )
        logits = train_output.rnn_output
        # masked token-level cross-entropy
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.decoder_target, logits=logits)
        crossent = tf.reduce_sum(crossent * self.decoder_mask)
        self.sen_loss = crossent / tf.to_float(batch_size)  # per-sentence average
        self.ppl_loss = crossent / tf.reduce_sum(self.decoder_mask)  # per-token (for PPL)
        self.decoder_distribution_teacher = tf.nn.log_softmax(logits)

    # greedy decoding for generation, sharing the decoder variables
    with tf.variable_scope("decode", reuse=True):
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embed, tf.fill([batch_size], data.go_id), data.eos_id)
        infer_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell_dec,
            helper=infer_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer
        )
        infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=infer_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True
        )
        self.decoder_distribution = infer_output.rnn_output
        # argmax over the vocab with the first two ids dropped (presumably
        # pad/unk — confirm against the vocab layout), shifted back by +2
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, data.vocab_size - 2], 2)[1],
            2) + 2  # for removing UNK

    # KL annealing: weight ramps linearly to 1.0 over full_kl_step steps
    self.kl_weights = tf.minimum(
        tf.to_float(self.global_step) / args.full_kl_step, 1.0)
    self.kl_loss = self.kl_weights * tf.maximum(self.kld, args.min_kl)
    self.loss = self.sen_loss + self.kl_loss

    # calculate the gradient of parameters and update
    self.params = [k for k in tf.trainable_variables() if args.name in k.name]
    opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                     momentum=args.momentum)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, args.grad_clip)
    self.update = opt.apply_gradients(
        zip(clipped_gradients, self.params), global_step=self.global_step)

    # save checkpoint
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=1,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    # create summary for tensorboard
    self.create_summary(args)
def __init__(self, n_cond, n_pred, hidden_dim, n_layers=2, input_dim=1,
             learning_rate=0.001, output_dim=1, cell_type='GRU',
             batch_size=100, optimizer='Adam', teacher_forcing_ratio=0.5,
             use_scheduled_sampling=True):
    """Build a TF1 graph for multi-step time-series forecasting with an
    encoder-decoder RNN.

    The decoder is driven either by a training helper (teacher forcing or
    scheduled output sampling) or, for free-running inference, by a
    CustomHelper that feeds each step's projected output back in as the next
    input; the branch is chosen at run time via the `teacher_force`
    placeholder.

    Parameters
    ----------
    n_cond : length of the conditioning (input) window.
    n_pred : length of the prediction window.
    hidden_dim : hidden units per RNN layer.
    n_layers : number of stacked RNN layers.
    input_dim : feature dimension of each input step.
    learning_rate : optimizer learning rate.
    output_dim : feature dimension of each predicted step.
    cell_type : 'GRU' or 'LSTM'.
    batch_size : fixed batch size baked into the inference helper.
    optimizer : 'Adam' or 'RMSProp'.
    teacher_forcing_ratio, use_scheduled_sampling : forwarded to the base
        class; `use_scheduled_sampling` also selects the training helper.

    Fixes vs. the previous revision:
    * when `use_scheduled_sampling` is False a plain TrainingHelper is used
      (before, the code unconditionally referenced
      `self.sampling_probability`, which is only created in the
      scheduled-sampling branch -> AttributeError);
    * the output projection uses `output_dim` and the scheduled-sampling
      next-input projection uses `input_dim` instead of hard-coded 1
      (identical behavior at the default dims).
    """
    super().__init__(n_cond, n_pred,
                     teacher_forcing_ratio=teacher_forcing_ratio,
                     use_scheduled_sampling=use_scheduled_sampling)
    self.graph = tf.Graph()
    with self.graph.as_default():
        if use_scheduled_sampling:
            # probability of sampling from the outputs instead of reading
            # directly from the inputs
            self.sampling_probability = tf.placeholder(tf.float32, shape=())
        self.teacher_force = tf.placeholder(tf.bool)

        # stacked dropout-wrapped recurrent cells
        cells = []
        self.keep_prob = tf.placeholder(tf.float32)
        for i in range(n_layers):
            with tf.variable_scope('RNN_{}'.format(i)):
                if cell_type == 'GRU':
                    cells.append(
                        DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_dim),
                                       output_keep_prob=self.keep_prob))
                elif cell_type == 'LSTM':
                    cells.append(
                        DropoutWrapper(
                            tf.nn.rnn_cell.BasicLSTMCell(hidden_dim),
                            output_keep_prob=self.keep_prob))
        cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        self.inputs = tf.placeholder(tf.float32, shape=(None, n_cond, input_dim))
        self.go_sym = tf.placeholder(tf.float32, shape=(None, 1, input_dim))
        self.targets = tf.placeholder(tf.float32, shape=(None, n_pred, input_dim))
        # decoder inputs: <go> symbol followed by the targets shifted right
        dec_input = tf.concat([self.go_sym, self.targets[:, :-1, :]], 1)

        enc_outputs, enc_state = tf.nn.dynamic_rnn(
            cell, self.inputs, dtype=tf.float32)  # returns outputs, state

        # All sequences in a batch share the same constant decode length;
        # TrainingHelper still requires the lengths explicitly.
        sequence_lengths = tf.constant(n_pred, shape=(batch_size, ))
        output_layer = Dense(output_dim, activation=None)

        if not use_scheduled_sampling:
            # plain teacher forcing: always read ground-truth decoder inputs
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                dec_input, sequence_lengths)
        else:
            # with probability sampling_probability, feed the model's own
            # (projected) output as the next input instead of the target
            train_helper = tf.contrib.seq2seq.ScheduledOutputTrainingHelper(
                dec_input,
                sequence_lengths,
                self.sampling_probability,
                next_input_layer=Dense(input_dim, activation=None))

        def sampler(time, outputs, state):
            # Sample ids are unused downstream; cast the argmax only to
            # satisfy the CustomHelper type contract.
            sample_ids = math_ops.cast(
                math_ops.argmax(outputs, axis=-1), tf.int32)
            return sample_ids

        def looper(time, outputs, state, sample_ids):
            # next_inputs_fn: `(time, outputs, state, sample_ids)` ->
            # `(finished, next_inputs, next_state)`.  The projected outputs
            # are fed straight back as next inputs (assumes
            # input_dim == output_dim — TODO confirm for other dims).
            next_time = time + 1
            finished = next_time >= sequence_lengths
            return (finished, outputs, state)

        inf_helper = tf.contrib.seq2seq.CustomHelper(
            lambda: (array_ops.tile([False], [batch_size]),
                     tf.reshape(self.go_sym, (batch_size, input_dim))),
            sampler, looper)  # initialize fn, sample fn, next_inputs fn

        train_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell,
            helper=train_helper,
            initial_state=enc_state,
            output_layer=output_layer)
        inf_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell,
            helper=inf_helper,
            initial_state=enc_state,
            output_layer=output_layer)

        # choose teacher-forced vs free-running decoding at run time
        outputs, states, sequence_lengths = tf.cond(
            self.teacher_force,
            lambda: tf.contrib.seq2seq.dynamic_decode(decoder=train_decoder),
            lambda: tf.contrib.seq2seq.dynamic_decode(decoder=inf_decoder))

        # `outputs` is a BasicDecoderOutput with attrs rnn_output, sample_ids
        self.preds = outputs.rnn_output
        # mean absolute error over all predicted steps
        self.loss = tf.reduce_mean(tf.abs(self.preds - self.targets))
        if optimizer == 'Adam':
            self.optimizer = tf.train.AdamOptimizer(learning_rate)
        elif optimizer == 'RMSProp':
            self.optimizer = tf.train.RMSPropOptimizer(
                learning_rate, decay=0.92, momentum=0.5)
        self.train_op = self.optimizer.minimize(self.loss)

        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()
def _init_decoder(self):
    """Build the attention decoder: a teacher-forced branch for training and
    a weight-sharing greedy-embedding branch for inference.

    Reads: self.data_y, self.vocab_to_int_y, self.batch_size,
    self.vocab_size_y, self.embedding_size, self.cell_size,
    self.dec_num_layers, self.dec_keep_prob, self.enc_outputs,
    self.enc_states, self.x_length, self.y_length, self.max_length.
    Defines: self.dec_embeddings, self.decoder_train (training logits),
    self.decoder_inference (greedy sample ids); both are also exported under
    the graph names 'decoder_train' / 'decoder_inference' via tf.identity.

    NOTE(review): seq2seq.DynamicAttentionWrapper(State) only exists in
    early TF 1.x (later renamed AttentionWrapper) — this code is pinned to
    that API version.
    """
    # presumably shifts targets into decoder-input form (<GO>-prefixed) —
    # confirm in process_decoding_input
    data_y = process_decoding_input(self.data_y, self.vocab_to_int_y,
                                    self.batch_size)
    self.dec_embeddings = tf.Variable(tf.random_uniform(
        [self.vocab_size_y, self.embedding_size], -1.0, 1.0),
        dtype=tf.float32)
    dec_embedded = tf.nn.embedding_lookup(self.dec_embeddings, data_y)
    with tf.variable_scope("decoder"):
        dec_cell = rnn_cell(self.cell_size, self.dec_num_layers,
                            self.dec_keep_prob)
        # projection from cell outputs to target-vocabulary logits
        out_layer = Dense(self.vocab_size_y,
                          kernel_initializer=tf.truncated_normal_initializer(
                              mean=0.0, stddev=0.1))
        att_mechanism = seq2seq.BahdanauAttention(self.cell_size,
                                                  self.enc_outputs,
                                                  self.x_length,
                                                  normalize=False)
        dec_cell = seq2seq.DynamicAttentionWrapper(
            dec_cell, att_mechanism, attention_size=self.cell_size)
        # initial state: first encoder state plus zeroed attention
        init_state = seq2seq.DynamicAttentionWrapperState(
            cell_state=self.enc_states[0],
            attention=_zero_state_tensors(self.cell_size, self.batch_size,
                                          tf.float32))
    with tf.variable_scope("decoding"):
        # training branch: feed ground-truth target embeddings
        train_helper = seq2seq.TrainingHelper(
            dec_embedded, sequence_length=self.y_length, time_major=False)
        train_decoder = seq2seq.BasicDecoder(dec_cell, train_helper,
                                             init_state, out_layer)
        train_out, _ = seq2seq.dynamic_decode(
            train_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_length,
            swap_memory=True)
        self.decoder_train = train_out.rnn_output
    with tf.variable_scope("decoding", reuse=True):
        # inference branch: greedy argmax feeding, reuses the same weights
        start_tokens = tf.tile(
            tf.constant([self.vocab_to_int_y[START]], dtype=tf.int32),
            [self.batch_size])
        infer_helper = seq2seq.GreedyEmbeddingHelper(
            embedding=self.dec_embeddings,
            start_tokens=start_tokens,
            end_token=self.vocab_to_int_y[STOP])
        infer_decoder = seq2seq.BasicDecoder(dec_cell, infer_helper,
                                             init_state, out_layer)
        infer_out, _ = seq2seq.dynamic_decode(
            infer_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=self.max_length)
        self.decoder_inference = infer_out.sample_id
    # export stable graph names for later lookup
    tf.identity(self.decoder_train, 'decoder_train')
    tf.identity(self.decoder_inference, 'decoder_inference')
def build_graph(self):
    """Build the full seq2seq graph: input placeholders, an embedded
    multi-layer LSTM encoder, a training (teacher-forced) or predicting
    (greedy) decoder depending on self.mode, a masked sequence loss, and a
    clipped-gradient Adam training op.
    """
    with tf.variable_scope('input'):
        self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
        self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        self.target_sequence_length = tf.placeholder(
            tf.int32, (None, ), name='target_sequence_length')
        self.max_target_sequence_length = tf.reduce_max(
            self.target_sequence_length, name='max_target_length')
        self.source_sequence_length = tf.placeholder(
            tf.int32, (None, ), name='source_sequence_length')
    with tf.variable_scope('encoder'):
        # embed_sequence creates the source embedding table internally
        encoder_embed_input = tf.contrib.layers.embed_sequence(
            self.inputs, len(self.source_letter_to_int),
            self.config.encoding_embedding_size)
        encoder_cell = tf.contrib.rnn.MultiRNNCell([
            self.get_lstm_cell(self.config.rnn_size)
            for _ in range(self.config.rnn_layers)
        ])
        encoder_output, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell,
            encoder_embed_input,
            sequence_length=self.source_sequence_length,
            dtype=tf.float32)
    with tf.variable_scope('decoder'):
        # 1. embedding
        decoder_input = self.process_decoder_input(
            self.targets, self.target_letter_to_int, self.config.batch_size)
        target_vocab_size = len(self.target_letter_to_int)
        decoder_embeddings = tf.Variable(
            tf.random_uniform(
                [target_vocab_size, self.config.decoding_embedding_size]))
        decoder_embed_input = tf.nn.embedding_lookup(
            decoder_embeddings, decoder_input)
        # decoder_embed_input = tf.contrib.layers.embed_sequence(decoder_input, target_vocab_size, self.config.decoding_embedding_size)
        # 2. construct the rnn
        decoder_cell = tf.contrib.rnn.MultiRNNCell([
            self.get_lstm_cell(self.config.rnn_size)
            for _ in range(self.config.rnn_layers)
        ])
        # 3. output fully connected
        output_layer = Dense(
            target_vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))
        if self.mode == 'train':
            # teacher forcing: feed ground-truth target embeddings
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_embed_input,
                sequence_length=self.target_sequence_length,
                time_major=False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, training_helper, encoder_state, output_layer)
            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
        else:
            # greedy decoding from <GO> until <EOS>
            start_tokens = tf.tile(
                tf.constant([self.target_letter_to_int['<GO>']],
                            dtype=tf.int32), [self.config.batch_size],
                name='start_tokens')
            predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                decoder_embeddings, start_tokens,
                self.target_letter_to_int['<EOS>'])
            predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, predicting_helper, encoder_state, output_layer)
            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                predicting_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
    with tf.variable_scope('loss'):
        training_logits = tf.identity(decoder_output.rnn_output, 'logits')
        predicting_logits = tf.identity(
            decoder_output.sample_id, name='predictions')  # used for predict
        # mask padded target positions out of the loss
        masks = tf.sequence_mask(self.target_sequence_length,
                                 self.max_target_sequence_length,
                                 dtype=tf.float32,
                                 name='masks')
        # NOTE(review): in predict mode this loss is built from greedy
        # decoder outputs whose time dimension can differ from self.targets;
        # presumably the loss is only evaluated in train mode — confirm.
        self.loss = tf.contrib.seq2seq.sequence_loss(
            training_logits, self.targets, masks)
    with tf.name_scope('optimize'):
        # optimizer = tf.train.AdamOptimizer(lr)
        # gradients = optimizer.compute_gradients(cost)
        # capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        # train_op = optimizer.apply_gradients(capped_gradients)
        training_variables = tf.trainable_variables()
        # global-norm clipping at 5 before applying the Adam update
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, training_variables), 5)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(
            grads, training_variables), name='train_op')
def _build_model_op(self):
    """Build the classification head: optional attention fusion of the three
    modalities, BiGRU encoding, dense layers, softmax predictions, masked
    accuracy, and masked cross-entropy loss.

    NOTE(review): formatting reconstructed from a flattened source — the
    nesting of the three Dense(200) layers under the multimodal `else`
    branch is inferred; confirm against the original layout.
    """
    # self attention
    if self.unimodal:
        input = self.input
    else:
        if self.attn_fusion:
            # fuse audio/video/text via self-attention, then re-mask padding
            input = self.self_attention(self.a_input, self.v_input,
                                        self.t_input, '')
            input = input * tf.expand_dims(self.mask, axis=-1)
        else:
            # no fusion: plain feature concatenation
            input = tf.concat([self.a_input, self.v_input, self.t_input],
                              axis=-1)
    # input = tf.nn.dropout(input, 1-self.lstm_inp_dropout)
    self.gru_output = self.BiGRU(input, 100, 'gru', 1 - self.lstm_dropout)
    self.inter = tf.nn.dropout(self.gru_output, 1 - self.dropout_lstm_out)
    # self.inter = self.gru_output
    if self.attn_2:
        self.inter = self.self_attention_2(self.inter, '')
    init = tf.glorot_uniform_initializer(seed=self.seed, dtype=tf.float32)
    if self.unimodal:
        self.inter1 = Dense(
            100,
            activation=tf.nn.tanh,
            kernel_initializer=init,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001))(
                self.inter)
    else:
        # deeper head for the fused multimodal features; padding re-masked
        # after every layer
        self.inter1 = Dense(
            200,
            activation=tf.nn.relu,
            kernel_initializer=init,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001))(
                self.inter)
        self.inter1 = self.inter1 * tf.expand_dims(self.mask, axis=-1)
        self.inter1 = Dense(
            200,
            activation=tf.nn.relu,
            kernel_initializer=init,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001))(
                self.inter1)
        self.inter1 = self.inter1 * tf.expand_dims(self.mask, axis=-1)
        self.inter1 = Dense(
            200,
            activation=tf.nn.relu,
            kernel_initializer=init,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001))(
                self.inter1)
        self.inter1 = self.inter1 * tf.expand_dims(self.mask, axis=-1)
    self.inter1 = tf.nn.dropout(self.inter1, 1 - self.dropout)
    self.output = Dense(
        self.emotions,
        kernel_initializer=init,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001))(
            self.inter1)
    # print('self.output', self.output.get_shape())
    self.preds = tf.nn.softmax(self.output)
    # To calculate the number correct, we want to count padded steps as incorrect
    correct = tf.cast(
        tf.equal(tf.argmax(self.preds, -1, output_type=tf.int32),
                 tf.argmax(self.y, -1, output_type=tf.int32)),
        tf.int32) * tf.cast(self.mask, tf.int32)
    # To calculate accuracy we want to divide by the number of non-padded time-steps,
    # rather than taking the mean
    self.accuracy = tf.reduce_sum(tf.cast(
        correct, tf.float32)) / tf.reduce_sum(
            tf.cast(self.seq_len, tf.float32))
    # y = tf.argmax(self.y, -1)
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.output,
                                                      labels=self.y)
    # zero the loss on padded steps, then average over real steps only
    loss = loss * self.mask
    self.loss = tf.reduce_sum(loss) / tf.reduce_sum(self.mask)
def build_latent_space(self):
    """Project the final encoder state `self.h_N` onto the latent space,
    storing the result in `self.z_tilda`.
    """
    with tf.name_scope("latent_space"):
        projection = Dense(self.latent_dim, name='z_tilda')
        # shape: [batch_size x latent_dim]
        self.z_tilda = projection(self.h_N)
def build_decoder(self):
    """Build the decoder on top of self.build_decoder_cell().

    In 'train' mode: teacher-forced decoding plus a masked sequence loss —
    optionally class-weighted when self.loss_type == 'weighted' — and the
    optimizer graph.  In 'decode' mode: greedy or beam-search inference.

    NOTE(review): `print 'using weighted loss!'` below is Python-2-only
    syntax, so this module requires Python 2.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
        )
        # Initialize decoder embeddings to have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3,
                                                    dtype=self.dtype)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.embedding_size],
            initializer=initializer,
            dtype=self.dtype)
        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        input_layer = Dense(self.hidden_units, dtype=self.dtype,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)
            # Embedded inputs having gone through input projection layer
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)
            # Helper to feed inputs for training: read inputs from dense ground truth vectors
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)  #output_layer=None)
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)
            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
            #     [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                 decoder=training_decoder,
                 output_time_major=False,
                 impute_finished=True,
                 maximum_iterations=max_decoder_length))
            # More efficient to do the projection on the batch-time-concatenated tensor
            # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=self.dtype,
                name='masks')

            def class_weighted_loss(labels, logits):
                # Per-class weight table indexed by label id; values are
                # presumably precomputed inverse class frequencies —
                # confirm their provenance before reuse.
                class_weights = tf.constant([
                    0.00017234778799135608, 0.00017234778799135608,
                    0.00017234778799135608, 1.6821366229319637e-05,
                    4.898869308918329e-05, 7.106575604186823e-05,
                    7.126891354944498e-05, 7.514392550863835e-05,
                    7.719102618435312e-05, 8.89973910758995e-05,
                    0.00010430076292140834, 0.00010567508046918493,
                    0.00011254233356378444, 0.00013745981039146453,
                    0.00015365550520395147, 0.00016343173716428013,
                    0.00016623641703291143, 0.00018462654135821253,
                    0.0001873476479039208, 0.00018800477750021655,
                    0.00020981274294876723, 0.00021602805964389768,
                    0.00024354484846033354, 0.00024936107032012903,
                    0.0002495739348066665, 0.000319111899575184,
                    0.00033594586064125193, 0.0003818581956683335,
                    0.0003838636576651593, 0.0005417806138677063,
                    0.0006711205600832021, 0.0006750650134170244,
                    0.0006953534538202605, 0.0007032603813511271,
                    0.0007207552048226591, 0.0007264535179396215,
                    0.0007633538390502503, 0.000891602363160162,
                    0.0009813883808113227, 0.0010641991144668115,
                    0.0011028839931134101, 0.0012656472742694626,
                    0.0013067898106130453, 0.0013988733031399323,
                    0.0016671901108961662, 0.0017748398034871436,
                    0.0022286969673726295, 0.0022647955802244397,
                    0.0022727983914619817, 0.002481488984505173,
                    0.002566647824356508, 0.0026578592759658715,
                    0.002682243306020604, 0.002818588715090889,
                    0.002964064261676225, 0.0029888566207422903,
                    0.0030339714376591553, 0.0032127969269917125,
                    0.0032616731479905726, 0.0033361096721148385,
                    0.00424275689171333, 0.004594299605598149,
                    0.004750383639466329, 0.005306946739139776,
                    0.005497452519519153, 0.005911782580732912,
                    0.007162605175765489, 0.007194652626216341,
                    0.007496526162980663, 0.007960420108709664,
                    0.007960420108709664, 0.008691918172753256,
                    0.009110509132914177, 0.011323977901122198,
                    0.011652209144632988, 0.012711500885054168,
                    0.013180367720978298, 0.015169857188295775,
                    0.016242473353124773, 0.022971498027990745,
                    0.024000072566557496, 0.024549692548997745,
                    0.029504676366226647, 0.035733441376874495,
                    0.03828583004665124, 0.03874710510745427,
                    0.058472904071249165, 0.0630590141944844,
                    0.08040024309796762, 0.3573344137687449
                ])
                weights = tf.gather(class_weights, labels)
                unweighted_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
                # scale each token's loss by its class weight
                return unweighted_losses * weights

            # Computes per word average cross-entropy over a batch
            # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            if self.loss_type == 'weighted':
                print 'using weighted loss!'
                self.loss = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                    softmax_loss_function=class_weighted_loss,
                )
            else:
                self.loss = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )
            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)
            # Contruct graphs for minimizing loss
            self.init_optimizer()
        elif self.mode == 'decode':
            # Start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            def embed_and_input_proj(inputs):
                # embedding lookup followed by the shared input projection
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,
                )
            # For GreedyDecoder, return
            # decoder_outputs_decode: BasicDecoderOutput instance
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] if output_time_major=False
            #     [max_time_step, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32 if output_time_major=False
            #     [max_time_step, batch_size], tf.int32 if output_time_major=True
            # For BeamSearchDecoder, return
            # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
            #     namedtuple(predicted_ids, beam_search_decoder_output)
            # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
            #     [max_time_step, batch_size, beam_width] if output_time_major=True
            # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
            #     namedtuple(scores, predicted_ids, parent_ids)
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (
                 seq2seq.dynamic_decode(
                     decoder=inference_decoder,
                     output_time_major=False,
                     #impute_finished=True,  # error occurs
                     maximum_iterations=self.max_decode_step))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                #                                      axis=-1, name='decoder_pred_decode')
                # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
            else:
                # Use beam search to approximately find the most likely translation
                # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def build_decoder(self):
    """Build the decoder on top of self.build_decoder_cell().

    In 'train' mode: teacher-forced decoding plus a masked sequence loss and
    the optimizer graph.  In 'decode' mode: greedy or beam-search inference.

    NOTE(review): near-duplicate of the earlier build_decoder in this file
    (that variant adds an optional class-weighted loss); consider
    consolidating.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
        )
        # Initialize decoder embeddings to have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3,
                                                    dtype=self.dtype)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.embedding_size],
            initializer=initializer,
            dtype=self.dtype)
        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        input_layer = Dense(self.hidden_units, dtype=self.dtype,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')
        if self.mode == 'train':
            # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)
            # Embedded inputs having gone through input projection layer
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)
            # Helper to feed inputs for training: read inputs from dense ground truth vectors
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)  #output_layer=None)
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)
            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
            #     [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                 decoder=training_decoder,
                 output_time_major=False,
                 impute_finished=True,
                 maximum_iterations=max_decoder_length))
            # More efficient to do the projection on the batch-time-concatenated tensor
            # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=self.dtype,
                name='masks')
            # Computes per word average cross-entropy over a batch
            # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)
            # Contruct graphs for minimizing loss
            self.init_optimizer()
        elif self.mode == 'decode':
            # Start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            def embed_and_input_proj(inputs):
                # embedding lookup followed by the shared input projection
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,
                )
            # For GreedyDecoder, return
            # decoder_outputs_decode: BasicDecoderOutput instance
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] if output_time_major=False
            #     [max_time_step, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32 if output_time_major=False
            #     [max_time_step, batch_size], tf.int32 if output_time_major=True
            # For BeamSearchDecoder, return
            # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
            #     namedtuple(predicted_ids, beam_search_decoder_output)
            # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
            #     [max_time_step, batch_size, beam_width] if output_time_major=True
            # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
            #     namedtuple(scores, predicted_ids, parent_ids)
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (
                 seq2seq.dynamic_decode(
                     decoder=inference_decoder,
                     output_time_major=False,
                     #impute_finished=True,  # error occurs
                     maximum_iterations=self.max_decode_step))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                #                                      axis=-1, name='decoder_pred_decode')
                # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
            else:
                # Use beam search to approximately find the most likely translation
                # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def _inference(self):
    """Build the full graph: a bidirectional LSTM encoder, an "intention"
    RNN that summarises the encoder's final state, and an LSTM decoder.

    When ``self.TRAINABLE`` is set the decoder is teacher-forced via a
    TrainingHelper; otherwise it decodes greedily and the logits are fed to
    CTC beam search for the final predictions.

    Reads instance config: VOL_SIZE, EMBEDDING_SIZE, HIDDEN_UNIT, N_LAYER,
    batch_size, turn_index, TRAINABLE, data_config, beam_width, paths, and
    the placeholder tensors encoder_inputs(_length) / decoder_inputs(_length).
    NOTE(review): this file appears to target Python 2 (``xrange``) and
    TensorFlow 1.x contrib APIs.
    """
    # Shared token embedding used by both encoder and decoder lookups.
    self.embedding = tf.get_variable("embedding",
                                     [self.VOL_SIZE, self.EMBEDDING_SIZE],
                                     dtype=tf.float32)
    num_classes = self.VOL_SIZE
    # use softmax to map decoder_output to number(0-5,EOS)
    self.softmax_w = self.variable(name="softmax_w",
                                   shape=[self.HIDDEN_UNIT, num_classes])
    self.softmax_b = self.variable(name="softmax_b", shape=[num_classes])
    # prepare to compute c_i = \sum a_{ij}h_j, encoder_states are h_js
    hidden_states = []
    # Additive-attention parameters.
    # NOTE(review): v_a is shaped with EMBEDDING_SIZE while W_a/U_a use
    # HIDDEN_UNIT -- confirm that is intentional; none of these attention
    # parameters are actually consumed in the live code below.
    self.W_a = self.variable(name="attention_w_a",
                             shape=[self.HIDDEN_UNIT, self.HIDDEN_UNIT])
    self.U_a = self.variable(name="attention_u_a",
                             shape=[self.HIDDEN_UNIT, self.HIDDEN_UNIT])
    self.v_a = self.variable(name="attention_v_a",
                             shape=[1, self.EMBEDDING_SIZE])
    # connect intention with decoder
    # connect intention with intention
    self.I_E = self.variable(name="intention_e",
                             shape=[self.HIDDEN_UNIT, self.HIDDEN_UNIT])
    self.encoder_to_intention_b = self.variable(
        name="encoder_intention_b", shape=[self.HIDDEN_UNIT])
    self.I_I = self.variable(name="intention_i",
                             shape=[self.HIDDEN_UNIT, self.HIDDEN_UNIT])
    self.intention_to_decoder_b = self.variable(
        name="intention_decoder_b", shape=[self.HIDDEN_UNIT])
    # self.C = self.variable(name="attention_C", shape=[self.HIDDEN_UNIT, self.HIDDEN_UNIT])
    # encoder_params = rnn_encoder.StackBidirectionalRNNEncoder.default_params()
    # encoder_params["rnn_cell"]["cell_params"]["num_units"] = self.HIDDEN_UNIT
    # encoder_params["rnn_cell"]["cell_class"] = "BasicLSTMCell"
    # encoder_params["rnn_cell"]["num_layers"] = self.N_LAYER
    with tf.variable_scope("encoder") as scope:
        encoder_embedding_vectors = tf.nn.embedding_lookup(
            self.embedding, self.encoder_inputs)
        # Separate stacked cells for the forward and backward directions,
        # each with externally held (variable-backed) initial states.
        encoder_fw_cell = self.stacked_rnn(self.HIDDEN_UNIT)
        encoder_bw_cell = self.stacked_rnn(self.HIDDEN_UNIT)
        self.encoder_initial_fw_state = self.get_state_variables(
            self.batch_size, encoder_fw_cell)
        self.encoder_initial_bw_state = self.get_state_variables(
            self.batch_size, encoder_bw_cell)
        ((outputs_fw, outputs_bw), (state_fw, state_bw)) = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_fw_cell,
                cell_bw=encoder_bw_cell,
                inputs=encoder_embedding_vectors,
                sequence_length=self.encoder_inputs_length,
                initial_state_fw=self.encoder_initial_fw_state,
                initial_state_bw=self.encoder_initial_bw_state,
                dtype=tf.float32)
        # Concatenate the top-layer fw/bw LSTM states into a single
        # (c, h) pair of width 2 * HIDDEN_UNIT.
        encoder_final_state_c = tf.concat(
            (state_fw[self.N_LAYER - 1][0], state_bw[self.N_LAYER - 1][0]), 1)
        encoder_final_state_h = tf.concat(
            (state_fw[self.N_LAYER - 1][1], state_bw[self.N_LAYER - 1][1]), 1)
        encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(
            c=encoder_final_state_c, h=encoder_final_state_h)
        # Flatten h to [batch, 2 * HIDDEN_UNIT] as the intention-RNN input.
        hidden_state = tf.reshape(encoder_final_state[1],
                                  shape=(-1, self.HIDDEN_UNIT * 2))
    # compute U_a*h_j quote:"this vector can be pre-computed.. U_a is R^n * n, h_j is R^n"
    # U_ah = []
    # for h in hidden_states:
    #     ## h.shape is BATCH, HIDDEN_UNIT
    #     u_ahj = tf.matmul(h, self.U_a)
    #     U_ah.append(u_ahj)
    # hidden_states = tf.stack(hidden_states)
    self.decoder_outputs = []
    # self.internal = []
    # with tf.variable_scope("decoder") as scope:
    self.decoder_cell = self.stacked_rnn(self.HIDDEN_UNIT)
    self.decoder_state = self.get_state_variables(self.batch_size,
                                                  self.decoder_cell)
    # building intention network
    with tf.variable_scope("intention") as scope:
        self.intention_cell = self.stacked_rnn(self.HIDDEN_UNIT)
        self.intention_state = self.get_state_variables(self.batch_size,
                                                        self.intention_cell)
        # Reuse intention variables after the first dialogue turn.
        if self.turn_index > 0:
            tf.get_variable_scope().reuse_variables()
        # for encoder_step_hidden_state in hidden_states:
        # One intention step over the encoder's final hidden state.
        intention_output, intention_state = self.intention_cell(
            hidden_state, self.intention_state)
        # cT_encoder = self._concat_hidden(encoder_state)
        # Map each intention layer's h through I_I (tanh) to form the
        # decoder's per-layer initial LSTM states; c is passed through.
        initial_decoder_state = []
        for i in xrange(len(intention_state)):
            b = intention_state[i]
            c = b[0]
            h = b[1]
            Dh = tf.tanh(tf.matmul(h, self.I_I))
            initial_decoder_state.append(
                tf.contrib.rnn.LSTMStateTuple(c, Dh))
        # print(len(initial_decoder_state))
        initial_decoder_state = tuple(initial_decoder_state)
        print(initial_decoder_state)
        # NOTE(review): `intention_states` and `intention_hidden_state` are
        # not defined anywhere in this method -- this statement looks like a
        # leftover from the commented-out experiment below and would raise
        # NameError if executed; confirm whether it should be removed or
        # commented out.
        intention_states.append(intention_hidden_state)
        # intention_state = self.intention_state
        # for encoder_step_hidden_state in hidden_states:
        #     intention_output, intention_state = self.intention_cell(encoder_step_hidden_state, intention_state)
        #
        # intention_state = self.intention_state
        # self.modified = []
        # for layer in xrange(len(encoder_state)):
        #     layer_intention_state = encoder_state[layer]
        #     layer_last_encoder_state = self.encoder_state[layer]
        #     h = layer_intention_state[1]
        #     c = layer_intention_state[0]
        #     eh = layer_last_encoder_state[1]
        #     ec = layer_last_encoder_state[0]
        #     self.kernel_i = tf.add(tf.matmul(h, self.I_I), self.intention_to_decoder_b)
        #     self.kernel_e = tf.add(tf.matmul(eh, self.I_E), self.encoder_to_intention_b)
        #     self.h_ = tf.concat([self.kernel_e, self.kernel_i], axis=1)
        #     cc = tf.concat([c, ec], axis=1)
        #     layer = tf.contrib.rnn.LSTMStateTuple(cc, self.h_)
        #     self.modified.append(layer)
    # *****************************************mark************************************************************
    # with tf.variable_scope("decoder") as scope:
    #     if self.TRAINABLE:
    #         decoder_embedding_vectors = tf.nn.embedding_lookup(
    #             self.embedding, self.decoder_inputs)
    #         self.decoder_outputs, decoder_state = tf.nn.dynamic_rnn(
    #             cell=self.decoder_cell,
    #             inputs=decoder_embedding_vectors,
    #             sequence_length=self.decoder_inputs_length,
    #             initial_state=initial_decoder_state,
    #             dtype=tf.float32)
    #         self.intention_state_update_op = self.get_state_update_op(
    #             self.intention_state, intention_state)
    #         self.encoder_state_update_op = self.get_state_update_op(
    #             self.encoder_initial_fw_state, decoder_state)
    # *****************************************mark end********************
    # ***************try another way to decode*********************
    with tf.variable_scope("decoder") as scope:
        if self.TRAINABLE:
            # Teacher-forced training decoder.
            decoder_embedding_vectors = tf.nn.embedding_lookup(
                self.embedding, self.decoder_inputs)
            output_layer = Dense(
                self.VOL_SIZE,
                kernel_initializer=tf.truncated_normal_initializer(
                    mean=0.0, stddev=0.1))
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_inputs_length, name='max_target_len')
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_embedding_vectors,
                sequence_length=self.decoder_inputs_length,
                time_major=False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=initial_decoder_state,
                output_layer=output_layer)
            self.decoder_output, decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)
            # Persist the intention state across turns.
            self.intention_state_update_op = self.get_state_update_op(
                self.intention_state, intention_state)
            # NOTE(review): this writes the *decoder* final state into the
            # encoder's forward-state variables -- confirm that is the
            # intended state hand-off.
            self.encoder_state_update_op = self.get_state_update_op(
                self.encoder_initial_fw_state, decoder_state)
        else:
            # https://github.com/tensorflow/tensorflow/issues/11598
            # PREDICTING_DECODER
            ## METHOD 1
            output_layer = Dense(
                self.VOL_SIZE,
                kernel_initializer=tf.truncated_normal_initializer(
                    mean=0.0, stddev=0.1))
            # Greedy decoding from GO_ until EOS_ (or 100 steps).
            greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=tf.tile(
                    tf.constant([self.data_config.GO_], dtype=tf.int32),
                    [self.batch_size]),
                end_token=self.data_config.EOS_)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=greedy_helper,
                initial_state=initial_decoder_state,
                output_layer=output_layer)
            self.decoder_output, decoder_state, final_sequence_lengths = \
                tf.contrib.seq2seq.dynamic_decode(
                    infer_decoder,
                    impute_finished=True,
                    maximum_iterations=100)
            logits = tf.identity(self.decoder_output.rnn_output, 'logits')
            # self.predictions_ = tf.argmax(logits, axis=2)
            # ctc_beam_search_decoder expects time-major input.
            time_major = tf.transpose(logits, [1, 0, 2])
            print(time_major)
            # NOTE(review): running CTC beam search over seq2seq softmax
            # logits is unusual (CTC assumes a blank label and CTC-trained
            # outputs) -- confirm this post-processing is intended.
            (self.predictions_,
             self.log_probabilities) = tf.nn.ctc_beam_search_decoder(
                inputs=time_major,
                sequence_length=final_sequence_lengths,
                beam_width=self.beam_width,
                top_paths=self.paths,
                merge_repeated=True)
def build_decoder(self):
    """Build the decoding half of the seq2seq graph.

    In "train" mode: teacher-forced decoding with a TrainingHelper, a
    padding-masked sequence loss, and a gradient-clipped Adagrad train op.
    In "test" mode: beam-search decoding with a BeamSearchDecoder.

    Reads: self.mode, self.batch_size, self.state_size,
    self.decoder_vocab_size, self.embedding_matrix, self.decoder_input,
    self.decoder_train_len, self.decoder_len, self.decoder_target, self.lr,
    self.global_step, self.beam_depth, self.max_iter.
    Produces: self.decoder_logits_train / self.loss / self.train_op (train)
    or self.decoder_pred_test (test).
    """
    with tf.variable_scope("decoder"):
        decoder_cell, decoder_initial_state = self.build_decoder_cell()

        # start tokens : [batch_size], which is fed to BeamsearchDecoder
        # during inference
        start_tokens = tf.ones([self.batch_size],
                               dtype=tf.int32) * data_util.ID_GO
        end_token = data_util.ID_EOS

        # input_layer projects token embeddings to the decoder cell's input
        # width; output_layer projects cell outputs to vocabulary logits.
        input_layer = Dense(self.state_size * 2,
                            dtype=tf.float32,
                            name="input_layer")
        output_layer = Dense(self.decoder_vocab_size,
                             name="output_projection")

        if self.mode == "train":
            # Teacher forcing: feed the ground-truth token at every step.
            decoder_input_lookup = tf.nn.embedding_lookup(
                self.embedding_matrix, self.decoder_input)
            decoder_input_lookup = input_layer(decoder_input_lookup)

            training_helper = seq2seq.TrainingHelper(
                inputs=decoder_input_lookup,
                sequence_length=self.decoder_train_len,
                name="training_helper")
            training_decoder = seq2seq.BasicDecoder(
                cell=decoder_cell,
                initial_state=decoder_initial_state,
                helper=training_helper,
                output_layer=output_layer)

            # decoder_outputs_train.rnn_output:
            #   [batch_size, max_time_step + 1, num_decoder_symbols]
            #   (output_time_major=False)
            max_decoder_len = tf.reduce_max(self.decoder_train_len)
            decoder_outputs_train, final_state, _ = seq2seq.dynamic_decode(
                training_decoder,
                impute_finished=True,
                swap_memory=True,
                maximum_iterations=max_decoder_len)

            self.decoder_logits_train = tf.identity(
                decoder_outputs_train.rnn_output)

            # Sequence mask so zero padding contributes no loss.
            weights = tf.sequence_mask(self.decoder_len,
                                       maxlen=max_decoder_len,
                                       dtype=tf.float32)
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_target,
                weights=weights,
                average_across_batch=True,
                average_across_timesteps=True)
            tf.summary.scalar("loss", self.loss)

            # FIX: the original used
            #   with tf.variable_scope("train_optimizer") and tf.device(...):
            # which evaluates the `and` expression first and therefore enters
            # ONLY the device context -- the "train_optimizer" variable scope
            # was silently dropped. Enter both context managers instead.
            with tf.variable_scope("train_optimizer"), \
                    tf.device("/device:GPU:1"):
                # Adagrad with global-norm gradient clipping (max norm 5.0)
                # and an exponentially decayed learning rate; global_step
                # counts every iteration.
                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                learning_rate = tf.train.exponential_decay(
                    self.lr, self.global_step, 10000, 0.96)
                opt = tf.train.AdagradOptimizer(learning_rate)
                self.train_op = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        elif self.mode == "test":

            def embedding_proj(inputs):
                # Embed the candidate tokens, then apply the same input
                # projection used during training.
                return input_layer(
                    tf.nn.embedding_lookup(self.embedding_matrix, inputs))

            inference_decoder = seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embedding_proj,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=self.beam_depth,
                output_layer=output_layer)

            # decoder_outputs.predicted_ids:
            #   [batch_size, max_time_step, beam_width]
            #   (output_time_major=False)
            with tf.device("/device:GPU:1"):
                decoder_outputs, decoder_last_state, decoder_output_length = \
                    seq2seq.dynamic_decode(decoder=inference_decoder,
                                           output_time_major=False,
                                           swap_memory=True,
                                           maximum_iterations=self.max_iter)
            self.decoder_pred_test = decoder_outputs.predicted_ids
def __init__(self, params):
    """Build an attention seq2seq graph for dialogue generation.

    params is a config object providing: batch_size, gen_length, turn_num,
    utc_length, vocab_size, embed_size, state_size, layer_num, grad_clip.

    NOTE(review): `infer` is not defined in this method -- it is presumably
    a module-level flag (1 = inference mode); confirm before refactoring.
    NOTE(review): `DynamicAttentionWrapper` / `DynamicAttentionWrapperState`
    are the pre-TF-1.2 contrib names (later renamed AttentionWrapper); this
    code requires that old TF release.
    """
    # Input variable
    batch_size = params.batch_size
    gen_length = params.gen_length
    if infer == 1:
        # Inference decodes one token for one example at a time.
        batch_size = 1
        gen_length = 1
    # Defaults: no dropout (keep prob 1.0), learning rate 0.01.
    self.dropout_keep = tf.placeholder_with_default(tf.constant(1.0),
                                                    shape=None)
    self.lr = tf.placeholder_with_default(tf.constant(0.01), shape=None)
    # x_word: flattened dialogue history; x_api: 3 auxiliary API features.
    self.x_word = tf.placeholder(
        tf.int32, shape=(None, params.turn_num * params.utc_length),
        name='x_word')
    self.x_api = tf.placeholder(tf.float32, shape=(None, 3), name='x_api')
    # NOTE(review): y_word_in and y_word_out share the name 'y_word' in the
    # graph -- TF will uniquify the second; confirm nothing looks them up
    # by name.
    self.y_word_in = tf.placeholder(tf.int32, shape=(None, gen_length),
                                    name='y_word')
    self.y_word_out = tf.placeholder(tf.int32, shape=(None, gen_length),
                                     name='y_word')
    self.y_len = tf.placeholder(tf.int32, shape=(None, ))

    # Word embedding (separate tables for source and target).
    x_embedding = tf.get_variable(
        name='x_embedding', shape=[params.vocab_size, params.embed_size])
    x_word_embedded = tf.nn.embedding_lookup(x_embedding, self.x_word)
    y_embedding = tf.get_variable(
        name='y_embedding', shape=[params.vocab_size, params.embed_size])
    y_word_embedded = tf.nn.embedding_lookup(y_embedding, self.y_word_in)

    # Extend x_api to concat with y_word_embedded (tiled along time).
    x_api = tf.expand_dims(self.x_api, 1)
    x_api_extend = x_api
    for i in range(gen_length - 1):
        x_api_extend = tf.concat([x_api_extend, x_api], 1)
    # y_word_embedded = tf.concat([y_word_embedded, x_api_extend], 2)

    def single_cell(state_size):
        # define the cell of LSTM
        return tf.contrib.rnn.BasicLSTMCell(state_size)

    # Encoder
    with tf.variable_scope('encoder'):
        self.encoder_multi_cell = tf.contrib.rnn.MultiRNNCell([
            single_cell(params.state_size)
            for _ in range(params.layer_num)
        ])  # multi-layer
        # NOTE(review): sequence_length uses params.batch_size even when
        # `infer` shrank the local batch_size to 1 -- confirm intended.
        self.encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
            self.encoder_multi_cell,
            x_word_embedded,
            sequence_length=[params.utc_length] * params.batch_size,
            dtype=tf.float32,
            scope='encoder')

    # Decoder with Bahdanau attention over the encoder outputs.
    with tf.variable_scope('decoder'):
        self.decoder_multi_cell = tf.contrib.rnn.MultiRNNCell([
            single_cell(params.state_size)
            for _ in range(params.layer_num)
        ])  # multi-layer
        attn_mech = tf.contrib.seq2seq.BahdanauAttention(
            num_units=params.state_size,  # LuongAttention
            memory=self.encoder_outputs,
            name='attention_mechanic')
        attn_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
            self.decoder_multi_cell,
            attention_mechanism=attn_mech,
            attention_size=128,
            name="attention_wrapper")
        attn_zero = attn_cell.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)
        # NOTE(review): zero_state() returns a full wrapper *state*, yet it
        # is passed as the `attention` field here -- the old API may accept
        # this, but it looks suspicious; confirm against that TF version.
        init_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
            cell_state=self.encoder_state, attention=attn_zero)
        # Teacher-forced decoding over the ground-truth target tokens.
        train_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=y_word_embedded,
            sequence_length=self.y_len,
            time_major=False)
        projection_layer = Dense(params.vocab_size)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=attn_cell,  # attn_cell,
            helper=train_helper,  # A Helper instance
            initial_state=init_state,  # initial state of decoder
            output_layer=projection_layer
        )  # instance of tf.layers.Layer, like Dense
        # Perform dynamic decoding with decoder
        self.decoder_outputs, self.decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(decoder=decoder)

    # Extra softmax on top of the (already projected) decoder outputs.
    # NOTE(review): projection_layer already maps to vocab_size, so this
    # second vocab_size x vocab_size matmul is an unusual double
    # projection -- confirm intended.
    self.w = tf.get_variable(
        "softmax_w",
        [params.vocab_size, params.vocab_size])  # weights for output
    self.b = tf.get_variable("softmax_b", [params.vocab_size])
    outputs = self.decoder_outputs[0]  # rnn_output of the BasicDecoderOutput

    # Loss
    output = tf.reshape(outputs, [-1, params.vocab_size])
    self.logits = tf.matmul(output, self.w) + self.b
    self.probs = tf.nn.softmax(self.logits)
    targets = tf.reshape(self.y_word_out, [-1])
    # Uniform weights: padding positions are NOT masked out of the loss.
    weights = tf.ones_like(targets, dtype=tf.float32)
    # print outputs
    # print self.logits
    # print targets
    # print weights
    loss = tf.contrib.legacy_seq2seq.sequence_loss([self.logits], [targets],
                                                   [weights])
    self.cost = tf.reduce_sum(loss) / batch_size
    # Adam with global-norm gradient clipping.
    optimizer = tf.train.AdamOptimizer(self.lr)
    tvars = tf.trainable_variables()
    grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(grads, params.grad_clip)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def initialize_model(self):
    """Reset the default graph and build a single-layer LSTM seq2seq model
    with a training decoder (teacher forcing) and a greedy inference
    decoder sharing the same "decode" variable scope.

    Reads: self.src_dictionary (keys are (token, x, y) triples, including
    ('<s>', 'None', 'None') / ('</s>', 'None', 'None')), self.embedding_matrix,
    self.RNN_STATE_DIM, self.BATCH_SIZE, self.max_length, self.LEARNING_RATE.
    Produces placeholders plus self.cost, self.train_pred, self.train_op.
    """
    # encoder_emb_layer = self.embedding_matrix
    # decoder_emb_layer = self.embedding_matrix
    # NOTE(review): INPUT_NUM_VOCAB is never used below; both sizes come
    # from the *source* dictionary -- confirm target vocab is the same.
    INPUT_NUM_VOCAB = len(self.src_dictionary)
    OUTPUT_NUM_VOCAB = len(self.src_dictionary)

    tf.reset_default_graph()

    # Encoder placeholders: [batch, time] token ids + per-example lengths.
    self.encoder_input_seq = tf.placeholder(tf.int32, [None, None],
                                            name='encoder_input_seq')
    self.encoder_seq_len = tf.placeholder(tf.int32, (None,),
                                          name='encoder_seq_len')

    # Decoder placeholders
    self.decoder_output_seq = tf.placeholder(tf.int32, [None, None],
                                             name='decoder_output_seq')
    self.decoder_seq_len = tf.placeholder(tf.int32, (None,),
                                          name='decoder_seq_len')
    # NOTE(review): max_decoder_seq_len is computed but never used -- the
    # code below uses the static self.max_length everywhere instead.
    max_decoder_seq_len = tf.reduce_max(self.decoder_seq_len,
                                        name='max_decoder_seq_len')

    # Encoder: single BasicLSTMCell over embedded inputs (float64 graph).
    encoder_input_embedded = tf.nn.embedding_lookup(self.embedding_matrix,
                                                    self.encoder_input_seq)
    encoder_multi_cell = tf.nn.rnn_cell.BasicLSTMCell(self.RNN_STATE_DIM)
    self.encoder_output, encoder_state = tf.nn.dynamic_rnn(
        encoder_multi_cell,
        encoder_input_embedded,
        sequence_length=self.encoder_seq_len,
        dtype=tf.float64)

    # Build decoder inputs: drop the last target token and prepend <s>.
    decoder_raw_seq = self.decoder_output_seq[:, :-1]
    go_prefixes = tf.fill([self.BATCH_SIZE, 1],
                          self.src_dictionary[('<s>', 'None', 'None')])
    decoder_input_seq = tf.concat([go_prefixes, decoder_raw_seq], 1)
    decoder_input_embedded = tf.nn.embedding_lookup(self.embedding_matrix,
                                                    decoder_input_seq)
    decoder_multi_cell = tf.nn.rnn_cell.BasicLSTMCell(self.RNN_STATE_DIM)

    # Output projection to the vocabulary.
    output_layer_kernel_initializer = tf.truncated_normal_initializer(
        mean=0.0, stddev=0.1)
    output_layer = Dense(
        OUTPUT_NUM_VOCAB,
        kernel_initializer=output_layer_kernel_initializer
    )

    # Training decoder (teacher forcing).
    with tf.variable_scope("decode"):
        # NOTE(review): sequence_length is the constant max_length for
        # every example rather than self.decoder_seq_len, so
        # impute_finished never masks per-example padding here -- confirm
        # this is intended.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=decoder_input_embedded,
            sequence_length=[self.max_length
                             for x in range(self.BATCH_SIZE)],
            time_major=False
        )
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_multi_cell,
            training_helper,
            encoder_state,
            output_layer
        )
        training_decoder_output_seq, _, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_length
            )

    # Inference decoder -- reuses the training parameters.
    with tf.variable_scope("decode", reuse=True):
        start_tokens = tf.tile(
            tf.constant([self.src_dictionary[('<s>', 'None', 'None')]],
                        dtype=tf.int32),
            [self.BATCH_SIZE],
            name='start_tokens')
        # Helper for the inference process.
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=self.embedding_matrix,
            start_tokens=start_tokens,
            end_token=self.src_dictionary[('</s>', 'None', 'None')]
        )
        # Basic decoder
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_multi_cell,
            inference_helper,
            encoder_state,
            output_layer
        )
        # Perform dynamic decoding using the decoder
        inference_decoder_output_seq, _, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                inference_decoder,
                impute_finished=True,
                maximum_iterations=self.max_length
            )

    training_logits = tf.identity(training_decoder_output_seq.rnn_output,
                                  name='logits')
    inference_logits = tf.identity(inference_decoder_output_seq.sample_id,
                                   name='predictions')

    # Create the weights for sequence_loss: mask out padding by length.
    masks = tf.sequence_mask(
        self.decoder_seq_len,
        self.max_length,
        dtype=tf.float64,
        name='masks'
    )

    self.cost = tf.contrib.seq2seq.sequence_loss(
        training_logits,
        self.decoder_output_seq,
        masks
    )

    # Adam with per-element gradient value clipping to [-5, 5].
    optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
    self.train_pred = training_decoder_output_seq.sample_id
    gradients = optimizer.compute_gradients(self.cost)
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                        for grad, var in gradients if grad is not None]
    self.train_op = optimizer.apply_gradients(capped_gradients)
def build_latent_space(self):
    """Project the encoder summary state into the VAE latent space.

    Creates two dense heads over ``self.h_N`` -- the latent mean and the
    latent log-sigma -- then draws ``self.z_vector`` from the resulting
    Gaussian via ``self.sample_gaussian()``.
    """
    with tf.name_scope("latent_space"):
        # Build the two projection heads first, then apply them.
        mean_head = Dense(self.latent_dim, name='z_mean')
        log_sigma_head = Dense(self.latent_dim, name='z_log_sigma')
        self.z_mean = mean_head(self.h_N)
        self.z_log_sigma = log_sigma_head(self.h_N)
        # Reparameterised sample, exposed under a stable tensor name.
        self.z_vector = tf.identity(self.sample_gaussian(), name='z_vector')
[ 4.7, 3.2, 1.3, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5 ], [ 4.6, 3.1, 1.5, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5 ], ]) labels = np.array([0, 1, 0, 1]) print('------A. Training------') model = Sequential() layer1 = Dense(200, activation='relu', input_dim=4) model.add(layer1) layer2 = Dense(200, activation='relu') model.add(layer2) layer3 = Dense(3, activation='softmax') model.add(layer3) model.compile('adam', loss='categorical_crossentropy', metrics=['accuracy']) # predefined multiclass dataset train_output = model.fit(data, labels, batch_size=20, epochs=5) print('---------------------') print(train_output.history) print('------B. Evaluation------') # predefined eval dataset
# Build the attention decoder and its teacher-forced training branch.
# NOTE(review): cell_decode, keep_prob, attn_luong, state_enc, embed_dec,
# dec_pad_len, padded_kor_len and batch_size are all defined earlier in
# this script, outside the visible region.
for a in range(SIZE_RNN_LAYER):
    # One dropout-wrapped LSTM layer per stacked decoder layer.
    cell = rnn.BasicLSTMCell(SIZE_RNN_STATE)
    cell = rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    cell_decode.append(cell)
multi_rnn_decode = rnn.MultiRNNCell(cell_decode, state_is_tuple=True)
# Wrap the stack with Luong attention over the encoder outputs.
dec_cell = tf.contrib.seq2seq.AttentionWrapper(
    cell=multi_rnn_decode,
    attention_mechanism=attn_luong,
    attention_layer_size=SIZE_ATTN,
    name="attention_wrapper")
# Start from the wrapper's zero state, seeded with the encoder final state.
initial_state = dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
initial_state = initial_state.clone(cell_state=state_enc)
# Projection from decoder outputs to Korean vocabulary logits.
output_layer = Dense(voc_size_kor,
                     kernel_initializer=tf.truncated_normal_initializer(
                         mean=0.0, stddev=0.1))
# train mode
with tf.variable_scope("decoder_layer"):
    # Teacher forcing over the (padded) target embeddings.
    train_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=embed_dec,
        sequence_length=dec_pad_len,
        time_major=False)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, train_helper, initial_state, output_layer)
    output_train_dec, state_train_dec, len_train_dec = \
        tf.contrib.seq2seq.dynamic_decode(
            decoder=train_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=padded_kor_len)
# predict mode
def __init__(self, lstm_size, lstm_layers, source_vocab_size,
             enc_embedding_size, tgt_word_to_int, dec_embedding_size,
             tgt_max_length):
    """Build an inference-only encoder/decoder graph.

    Parameters
    ----------
    lstm_size : number of hidden units per LSTM cell
    lstm_layers : number of stacked LSTM layers (encoder and decoder)
    source_vocab_size : size of the source vocabulary
    enc_embedding_size : encoder embedding width
    tgt_word_to_int : target token -> id mapping; must contain '<s>'/'</s>'
    dec_embedding_size : decoder embedding width
    tgt_max_length : maximum number of decoding steps

    Note: only a greedy *inference* decoder is built here -- there is no
    training branch in this constructor.
    """
    #-----------------------------------------------------------------------
    # Placeholders
    #-----------------------------------------------------------------------
    self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
    self.batch_size = tf.placeholder(tf.int32, [], name='batch_size')
    self.tgt_seq_length = tf.placeholder(tf.int32, [None],
                                         name='tgt_seq_length')
    self.src_seq_length = tf.placeholder(tf.int32, [None],
                                         name='src_seq_length')

    #-----------------------------------------------------------------------
    # Encoder
    #-----------------------------------------------------------------------
    with tf.variable_scope('encoder'):
        with tf.variable_scope('embedding'):
            enc_embed = tf.contrib.layers.embed_sequence(
                self.inputs, source_vocab_size, enc_embedding_size)
        with tf.variable_scope('rnn'):
            enc_cell = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size) \
                    for _ in range(lstm_layers)])
            self.initial_state = enc_cell.zero_state(self.batch_size,
                                                     tf.float32)
            # Only the final state is kept; it seeds the decoder.
            _, self.enc_state = tf.nn.dynamic_rnn(
                enc_cell,
                enc_embed,
                sequence_length=self.src_seq_length,
                initial_state=self.initial_state)

    #-----------------------------------------------------------------------
    # Decoder
    #-----------------------------------------------------------------------
    target_vocab_size = len(tgt_word_to_int)
    with tf.variable_scope('decoder'):
        #-------------------------------------------------------------------
        # Embedding
        #-------------------------------------------------------------------
        with tf.variable_scope('embedding'):
            self.dec_embed = tf.Variable(
                tf.random_uniform([target_vocab_size, dec_embedding_size]))
        #-------------------------------------------------------------------
        # Final classifier
        #-------------------------------------------------------------------
        with tf.variable_scope('classifier') as classifier_scope:
            self.output_layer = Dense(target_vocab_size,
                                      kernel_initializer = \
                                        tf.truncated_normal_initializer(
                                            mean = 0.0, stddev=0.1))
        #-------------------------------------------------------------------
        # RNN
        #-------------------------------------------------------------------
        with tf.variable_scope('rnn'):
            self.dec_cell = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size) \
                    for _ in range(lstm_layers)])
        #-------------------------------------------------------------------
        # Inference decoder
        #-------------------------------------------------------------------
        with tf.variable_scope('decoder'):
            # Greedy decoding from '<s>' until '</s>' or tgt_max_length.
            start_tokens = tf.tile([tgt_word_to_int['<s>']],
                                   [self.batch_size])
            helper = seq2seq.GreedyEmbeddingHelper(self.dec_embed,
                                                   start_tokens,
                                                   tgt_word_to_int['</s>'])
            decoder = seq2seq.BasicDecoder(self.dec_cell, helper,
                                           self.enc_state,
                                           self.output_layer)
            outputs, _, _ = seq2seq.dynamic_decode(decoder,
                                                   impute_finished=\
                                                   True,
                                                   maximum_iterations=\
                                                   tgt_max_length)
            # Predicted token ids: [batch_size, time].
            self.outputs = tf.identity(outputs.sample_id, 'predictions')
def decoding_layer(phonem_dict, decoding_embedding_size, num_layers,
                   rnn_size, target_sequence_length,
                   max_target_sequence_length, encoder_state, decoder_input):
    '''
    Build the decoder layer (training + greedy predicting decoders).

    Parameters:
    - phonem_dict: mapping table for the target (phoneme) data; must
      contain the '_sos_' and '_eos_' tokens
    - decoding_embedding_size: size of the embedding vectors
    - num_layers: number of stacked RNN cells
    - rnn_size: number of hidden units per RNN cell
    - target_sequence_length: lengths of the target sequences
    - max_target_sequence_length: maximum target sequence length
    - encoder_state: encoded state vector from the encoder
    - decoder_input: decoder-side input token ids

    Returns (training_decoder_output, predicting_decoder_output).
    '''
    # 1. Embedding
    target_vocab_size = len(phonem_dict)
    decoder_embeddings = tf.Variable(
        tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,
                                                 decoder_input)

    # Build the RNN cell used inside the decoder.
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell

    cell = tf.contrib.rnn.MultiRNNCell(
        [get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # Fully connected output layer; target_vocab_size defines its width.
    # NOTE(review): kernel initializer mean is 0.1 here while similar
    # decoders in this file use mean=0.0 -- confirm this is intentional.
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(
                             mean=0.1, stddev=0.1))

    # 4. Training decoder (teacher forcing over the embedded targets).
    with tf.variable_scope("decode"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=decoder_embed_input,
            sequence_length=target_sequence_length,
            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, training_helper, encoder_state, output_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)

    # 5. Predicting decoder -- shares the parameters trained above.
    with tf.variable_scope("decode", reuse=True):
        # Create a constant start-token tensor tiled to the batch size
        # (batch size is derived from target_sequence_length's shape).
        start_tokens = tf.tile(tf.constant([phonem_dict['_sos_']],
                                           dtype=tf.int32),
                               [tf.shape(target_sequence_length)[0]],
                               name='start_token')
        # Greedy decoding from '_sos_' until '_eos_'.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            decoder_embeddings, start_tokens, phonem_dict['_eos_'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, predicting_helper, encoder_state, output_layer)
        predicting_decoder_output, _, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                predicting_decoder,
                impute_finished=True,
                maximum_iterations=max_target_sequence_length)

    return training_decoder_output, predicting_decoder_output
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers,
                   rnn_size, target_sequence_length,
                   max_target_sequence_length, enc_state, dec_input):
    """Build the training and inference decoders for a seq2seq model.

    Parameters
    ----------
    target_letter_to_int : target token -> id mapping; must contain
        '<GO>' and '<EOS>'
    decoding_embedding_size : decoder embedding width
    num_layers : number of stacked LSTM layers
    rnn_size : hidden units per LSTM cell
    target_sequence_length : [batch] int tensor of target lengths
    max_target_sequence_length : maximum number of decoding steps
    enc_state : final encoder state used to seed the decoder
    dec_input : [batch, time] target token ids (teacher-forcing input)

    Returns
    -------
    (training_decoder_output, inference_decoder_output) : the
    BasicDecoderOutput of the teacher-forced and greedy decoders.
    """
    # 1. Decoder Embedding
    target_vocab_size = len(target_letter_to_int)
    dec_embeddings = tf.Variable(
        tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the decoder cell
    def make_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

    dec_cell = tf.contrib.rnn.MultiRNNCell(
        [make_cell(rnn_size) for _ in range(num_layers)])

    # 3. Dense layer to translate the decoder's output at each time
    # step into a choice from the target vocabulary
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(
                             mean=0.0, stddev=0.1))

    # 4. Training decoder (teacher forcing).
    with tf.variable_scope("decode"):
        # Helper for the training process. Used by BasicDecoder to read
        # inputs.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=dec_embed_input,
            sequence_length=target_sequence_length,
            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            dec_cell, training_helper, enc_state, output_layer)
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    # 5. Inference Decoder -- reuses the parameters trained above.
    with tf.variable_scope("decode", reuse=True):
        # FIX: the original tiled the start tokens by a bare `batch_size`
        # name that is not a parameter of this function (an implicit
        # module-level global, NameError if absent). Derive the batch size
        # from target_sequence_length instead, matching the sibling
        # decoding_layer in this file.
        batch_size = tf.shape(target_sequence_length)[0]
        start_tokens = tf.tile(
            tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32),
            [batch_size],
            name='start_tokens')
        # Greedy decoding from '<GO>' until '<EOS>'.
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            dec_embeddings, start_tokens, target_letter_to_int['<EOS>'])
        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
            dec_cell, inference_helper, enc_state, output_layer)
        inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            inference_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    return training_decoder_output, inference_decoder_output
def model(self):
    """Build the full encoder/attention-decoder translation graph.

    Constructs placeholders, embeddings, a stacked bidirectional-GRU
    encoder, a Bahdanau-attention GRU decoder, the sequence loss (plus L2
    regularisation terms collected in tf.GraphKeys.LOSSES), and the SGD
    training op with gradient clipping and exponential LR decay.

    Returns
    -------
    - inference mode (FLAGS.is_inference): the greedy sample ids tensor.
    - training mode (FLAGS.is_train):
      [global_step, accuracy, losses, apply_grads_op, summary_op].
    - otherwise (evaluation): [global_step, accuracy, losses].
    """
    # TODO: replace the placeholders with a record input pipeline in the
    # future (translated from the original Chinese comment).
    inputs = tf.placeholder(dtype=tf.int32,
                            shape=(FLAGS.batch_size, FLAGS.en_max_length))
    targets = tf.placeholder(dtype=tf.int32,
                             shape=(FLAGS.batch_size, FLAGS.zh_max_length))
    # GreedyEmbeddingHelper requires start_tokens to be an int32 VECTOR of
    # shape [batch_size]; the previous scalar tf.constant(0) made graph
    # construction fail in inference mode.  end_token stays a scalar.
    # NOTE(review): id 0 is used as both GO and EOS here — confirm against
    # the vocabulary files.
    start_tokens = tf.fill([FLAGS.batch_size], 0)
    end_token = tf.constant(0, dtype=tf.int32)
    en_len_sequence = tf.placeholder(dtype=tf.int32, shape=FLAGS.batch_size)
    zh_len_sequence = tf.placeholder(dtype=tf.int32, shape=FLAGS.batch_size,
                                     name='batch_seq_length')

    # Source / target embedding matrices; their L2 norms are added to the
    # LOSSES collection below as regularisers.
    en_embedding_matrix = tf.get_variable(
        name='embedding_matrix',
        shape=(FLAGS.en_vocab_size, FLAGS.en_embedded_size),
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.01))
    zh_embedding_matrix = tf.get_variable(
        name='zh_embedding_matrix',
        shape=(FLAGS.zh_vocab_size, FLAGS.zh_embedded_size),
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.01))
    tf.add_to_collection(tf.GraphKeys.LOSSES,
                         tf.nn.l2_loss(en_embedding_matrix))
    tf.add_to_collection(tf.GraphKeys.LOSSES,
                         tf.nn.l2_loss(zh_embedding_matrix))
    tf.summary.histogram('zh_embedding_matrix', zh_embedding_matrix)
    tf.summary.histogram('en_embedding_matrix', en_embedding_matrix)

    # Embedding lookups are pinned to the CPU.
    with tf.device('/cpu:0'):
        embedded = tf.nn.embedding_lookup(en_embedding_matrix, inputs)
        target_embedded = tf.nn.embedding_lookup(zh_embedding_matrix,
                                                 targets)

    with tf.name_scope("encoder"):
        cells_fw = [
            tf.contrib.rnn.GRUCell(num) for num in config.encoder_fw_units
        ]
        cells_bw = [
            tf.contrib.rnn.GRUCell(num) for num in config.encoder_bw_units
        ]
        outputs, states_fw, states_bw = \
            tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                cells_fw,
                cells_bw,
                embedded,
                dtype=tf.float32,
                sequence_length=en_len_sequence)
        # Concatenate all layers' final states (backward first, then
        # forward) into one flat vector per example.
        dense_fw = tf.concat(states_fw, axis=1)
        dense_bw = tf.concat(states_bw, axis=1)
        states = tf.concat([dense_bw, dense_fw], axis=1)
        tf.summary.histogram('encoder_state', states)

    '''
    external memory will add here
    '''
    with tf.name_scope("decoder"):
        attention_m = tf.contrib.seq2seq.BahdanauAttention(
            FLAGS.attention_size, outputs, en_len_sequence)
        cell_out = [
            tf.contrib.rnn.GRUCell(num) for num in config.out_cell_units
        ]
        # Every layer gets its own AttentionWrapper over the same
        # attention mechanism, then the wrapped cells are stacked.
        cell_attention = [
            tf.contrib.seq2seq.AttentionWrapper(cell_out[i], attention_m)
            for i in range(len(config.out_cell_units))
        ]
        cells = tf.contrib.rnn.MultiRNNCell(cell_attention)
        initial_state = cells.zero_state(dtype=tf.float32,
                                         batch_size=FLAGS.batch_size)
        # Seed only the bottom layer with the encoder summary state;
        # AttentionWrapperState is immutable, so clone() is required.
        initial_state = list(initial_state)
        initial_state[0] = initial_state[0].clone(cell_state=states)
        initial_state = tuple(initial_state)
        if FLAGS.is_inference:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                zh_embedding_matrix, start_tokens, end_token)
        else:
            helper = tf.contrib.seq2seq.TrainingHelper(
                target_embedded, zh_len_sequence)
        dense = Dense(FLAGS.zh_vocab_size)
        decoder = tf.contrib.seq2seq.BasicDecoder(cells, helper,
                                                  initial_state, dense)
        # NOTE(review): no maximum_iterations is set, so inference decoding
        # is unbounded until end_token; also logits.rnn_output has time
        # length max(zh_len_sequence) while targets is fixed at
        # FLAGS.zh_max_length — sequence_loss will fail when they differ.
        # Confirm zh_len_sequence always equals zh_max_length upstream.
        logits, final_states, final_sequence_lengths = \
            tf.contrib.seq2seq.dynamic_decode(decoder)

    # Unweighted (all-ones) token weights for the sequence loss.
    weights = tf.constant(1.0,
                          shape=[FLAGS.batch_size, FLAGS.zh_max_length])
    inference_losses = tf.contrib.seq2seq.sequence_loss(
        logits.rnn_output, targets, weights)
    tf.summary.scalar('inference_loss', inference_losses)
    tf.add_to_collection(tf.GraphKeys.LOSSES, inference_losses)
    # Total loss = sequence loss + embedding L2 regularisers.
    losses = tf.add_n(tf.get_collection(tf.GraphKeys.LOSSES))
    tf.summary.scalar('losses', losses)
    # Renamed from `eval` to avoid shadowing the builtin.
    accuracy = sequence_equal(logits.sample_id, targets)
    tf.summary.scalar('eval', accuracy)

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                               global_step,
                                               FLAGS.decay_step,
                                               FLAGS.decay_rate)
    tf.summary.scalar('learning_rate', learning_rate)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = opt.compute_gradients(losses)
    clipped_grads_and_vars = tf.contrib.training.clip_gradient_norms(
        grads_and_vars, FLAGS.max_gradient)
    apply_grads_op = opt.apply_gradients(clipped_grads_and_vars,
                                         global_step)
    summary_op = tf.summary.merge_all()

    if FLAGS.is_inference:
        return logits.sample_id
    elif FLAGS.is_train:
        return [global_step, accuracy, losses, apply_grads_op, summary_op]
    else:
        return [global_step, accuracy, losses]