def testDynamicAttentionDecoderStateIsTuple(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      cell_fn = lambda: rnn_cell.MultiRNNCell(  # pylint: disable=g-long-lambda
          cells=[rnn_cell.BasicLSTMCell(2) for _ in range(2)])
      cell = cell_fn()
      inp = [constant_op.constant(0.5, shape=[2, 2])] * 2
      enc_outputs, enc_state = rnn.static_rnn(cell, inp, dtype=dtypes.float32)
      attn_states = array_ops.concat([
          array_ops.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs
      ], 1)
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3

      # Use a new cell instance since the attention decoder uses a
      # different variable scope.
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell_fn(), output_size=4)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual(2, len(res[0]))
      self.assertEqual((2, 2), res[0][0].c.shape)
      self.assertEqual((2, 2), res[0][0].h.shape)
      self.assertEqual((2, 2), res[0][1].c.shape)
      self.assertEqual((2, 2), res[0][1].h.shape)
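# For reference, the shape contract of the legacy attention_decoder exercised
# by the tests in this file (shapes are the ones used above):
#   dec_inp:     list of [batch, input_size] tensors, one per decoder step
#   enc_state:   final encoder state (a nested tuple for state_is_tuple cells)
#   attn_states: [batch, attn_length, attn_size]
#   returns:     (list of [batch, output_size] outputs, final decoder state)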
def testDynamicAttentionDecoderStateIsTuple(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      single_cell = lambda: core_rnn_cell_impl.BasicLSTMCell(  # pylint: disable=g-long-lambda
          2, state_is_tuple=True)
      cell = core_rnn_cell_impl.MultiRNNCell(
          cells=[single_cell() for _ in range(2)], state_is_tuple=True)
      # static_rnn expects a list of 2-D tensors, not a single 3-D tensor.
      inp = [constant_op.constant(0.5, shape=[2, 2])] * 2
      enc_outputs, enc_state = core_rnn.static_rnn(
          cell, inp, dtype=dtypes.float32)
      attn_states = array_ops.concat([
          array_ops.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs
      ], 1)
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell, output_size=4)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual(2, len(res[0]))
      self.assertEqual((2, 2), res[0][0].c.shape)
      self.assertEqual((2, 2), res[0][0].h.shape)
      self.assertEqual((2, 2), res[0][1].c.shape)
      self.assertEqual((2, 2), res[0][1].h.shape)
def testAttentionDecoder2(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      cell_fn = lambda: core_rnn_cell_impl.GRUCell(2)
      cell = cell_fn()
      inp = [constant_op.constant(0.5, shape=[2, 2])] * 2
      enc_outputs, enc_state = core_rnn.static_rnn(
          cell, inp, dtype=dtypes.float32)
      attn_states = array_ops.concat([
          array_ops.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs
      ], 1)
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3

      # Use a new cell instance since the attention decoder uses a
      # different variable scope.
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell_fn(),
          output_size=4, num_heads=2)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual((2, 2), res[0].shape)
def testDynamicAttentionDecoderStateIsTuple(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      cell_fn = lambda: core_rnn_cell_impl.MultiRNNCell(  # pylint: disable=g-long-lambda
          cells=[core_rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)])
      cell = cell_fn()
      # static_rnn expects a list of 2-D tensors, not a single 3-D tensor.
      inp = [constant_op.constant(0.5, shape=[2, 2])] * 2
      enc_outputs, enc_state = core_rnn.static_rnn(
          cell, inp, dtype=dtypes.float32)
      attn_states = array_ops.concat([
          array_ops.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs
      ], 1)
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3

      # Use a new cell instance since the attention decoder uses a
      # different variable scope.
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell_fn(), output_size=4)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual(2, len(res[0]))
      self.assertEqual((2, 2), res[0][0].c.shape)
      self.assertEqual((2, 2), res[0][0].h.shape)
      self.assertEqual((2, 2), res[0][1].c.shape)
      self.assertEqual((2, 2), res[0][1].h.shape)
def RNN(encoder_input, decoder_input, weights, biases,
        encoder_attention_states, n_input_encoder, n_steps_encoder,
        n_hidden_encoder, n_input_decoder, n_steps_decoder, n_hidden_decoder):
    # Prepare data shape to match `rnn` function requirements.
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)

    # Prepare data for the encoder.
    # Permute batch_size and n_steps.
    encoder_input = tf.transpose(encoder_input, [1, 0, 2])
    # Reshape to (n_steps * batch_size, n_input).
    encoder_input = tf.reshape(encoder_input, [-1, n_input_encoder])
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input).
    encoder_input = tf.split(encoder_input, n_steps_encoder, 0)

    # Prepare data for the decoder in the same way.
    decoder_input = tf.transpose(decoder_input, [1, 0, 2])
    decoder_input = tf.reshape(decoder_input, [-1, n_input_decoder])
    decoder_input = tf.split(decoder_input, n_steps_decoder, 0)

    # Encoder.
    with tf.variable_scope('encoder'):
        encoder_cell = rnn_cell.BasicLSTMCell(n_hidden_encoder,
                                              forget_bias=1.0)
        encoder_outputs, encoder_state, attn_weights = \
            attention_encoder.attention_encoder(
                encoder_input, encoder_attention_states, encoder_cell)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [
        tf.reshape(e, [-1, 1, encoder_cell.output_size])
        for e in encoder_outputs
    ]
    attention_states = tf.concat(top_states, 1)

    # Decoder.
    with tf.variable_scope('decoder'):
        decoder_cell = rnn_cell.BasicLSTMCell(n_hidden_decoder,
                                              forget_bias=1.0)
        outputs, states = seq2seq.attention_decoder(
            decoder_input, encoder_state, attention_states, decoder_cell)

    return tf.matmul(outputs[-1], weights['out1']) + biases['out1'], \
        attn_weights
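# A minimal usage sketch for RNN() above; the placeholder names, the shape of
# the attention states, and the single-output regression head are illustrative
# assumptions, not part of the original code.
encoder_input_ph = tf.placeholder(
    tf.float32, [None, n_steps_encoder, n_input_encoder])
decoder_input_ph = tf.placeholder(
    tf.float32, [None, n_steps_decoder, n_input_decoder])
encoder_attention_states_ph = tf.placeholder(  # shape assumed
    tf.float32, [None, n_input_encoder, n_steps_encoder])
weights = {'out1': tf.Variable(tf.random_normal([n_hidden_decoder, 1]))}
biases = {'out1': tf.Variable(tf.random_normal([1]))}
pred, attn_weights = RNN(
    encoder_input_ph, decoder_input_ph, weights, biases,
    encoder_attention_states_ph, n_input_encoder, n_steps_encoder,
    n_hidden_encoder, n_input_decoder, n_steps_decoder, n_hidden_decoder)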
def testDynamicAttentionDecoder2(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      cell = core_rnn_cell_impl.GRUCell(2)
      inp = constant_op.constant(0.5, shape=[2, 2, 2])
      enc_outputs, enc_state = rnn.dynamic_rnn(cell, inp, dtype=dtypes.float32)
      # dynamic_rnn already returns outputs as [batch, time, size], so they
      # can be used directly as the attention states.
      attn_states = enc_outputs
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell, output_size=4, num_heads=2)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual((2, 2), res[0].shape)
def attention_rnn_seq2seq(encoder_inputs,
                          decoder_inputs,
                          cell,
                          dtype=dtypes.float32,
                          scope=None,
                          loop_function=None):
  """RNN sequence-to-sequence model with attention.

  This model first runs an RNN to encode encoder_inputs into a state vector,
  then runs an attention decoder, initialized with the last encoder state, on
  decoder_inputs. Encoder and decoder use the same RNN cell type, but don't
  share parameters.

  Args:
    encoder_inputs: A list of 2D Tensors [batch_size x input_size].
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    cell: tf.nn.rnn_cell.RNNCell defining the cell function and size.
    dtype: The dtype of the initial state of the RNN cell (default:
      tf.float32).
    scope: VariableScope for the created subgraph; default:
      "attention_rnn_seq2seq".
    loop_function: If not None, this function is applied to the i-th decoder
      output to generate the (i+1)-st decoder input; see attention_decoder.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      state: The state of each decoder cell in the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
  """
  with variable_scope.variable_scope(scope or "attention_rnn_seq2seq"):
    # Deep-copy the cell so encoder and decoder don't share parameters.
    enc_cell = seq2seq.copy.deepcopy(cell)
    encoder_outputs, enc_state = seq2seq.rnn.static_rnn(
        enc_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [
        seq2seq.array_ops.reshape(e, [-1, 1, cell.output_size])
        for e in encoder_outputs
    ]
    attention_states = seq2seq.array_ops.concat(top_states, 1)

    return seq2seq.attention_decoder(
        decoder_inputs,
        enc_state,
        attention_states,
        cell,
        loop_function=loop_function)
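# A minimal call sketch for attention_rnn_seq2seq; all sizes and the tf.*
# aliases are illustrative assumptions.
import tensorflow as tf

batch_size, input_size, num_units, steps = 4, 8, 16, 3
cell = tf.contrib.rnn.GRUCell(num_units)
encoder_inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
                  for _ in range(steps)]
decoder_inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
                  for _ in range(steps)]
# outputs: steps * [batch_size, num_units]; state: [batch_size, state_size]
outputs, state = attention_rnn_seq2seq(encoder_inputs, decoder_inputs, cell)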
def testDynamicAttentionDecoder1(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      cell_fn = lambda: rnn_cell.GRUCell(2)
      cell = cell_fn()
      inp = constant_op.constant(0.5, shape=[2, 2, 2])
      enc_outputs, enc_state = rnn.dynamic_rnn(cell, inp, dtype=dtypes.float32)
      attn_states = enc_outputs
      dec_inp = [constant_op.constant(0.4, shape=[2, 2])] * 3

      # Use a new cell instance since the attention decoder uses a
      # different variable scope.
      dec, mem = seq2seq_lib.attention_decoder(
          dec_inp, enc_state, attn_states, cell_fn(), output_size=4)
      sess.run([variables.global_variables_initializer()])
      res = sess.run(dec)
      self.assertEqual(3, len(res))
      self.assertEqual((2, 4), res[0].shape)

      res = sess.run([mem])
      self.assertEqual((2, 2), res[0].shape)
def _seq2seq(self):
    hps = self._hps
    vocab_size = self._vocab.count
    with tf.variable_scope("SumModel"):
        article_lens = self._article_lens
        # sequence_loss expects seq_len * [batch_size] inputs.
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))

        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                "embedding", [vocab_size, hps.emb_dim], dtype=tf.float32)
            # [batch, seq_len, emb_dim]
            emb_encoder_inputs = tf.nn.embedding_lookup(
                embedding, self._articles)
            emb_decoder_inputs = tf.nn.embedding_lookup(
                embedding, self._abstracts)

        with tf.variable_scope("encoder"):
            cell_fw = LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                state_is_tuple=False)
            cell_bw = LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)
            # outputs: (output_fw, output_bw), where output_fw is
            #   [batch_size, max_time, cell_fw.output_size].
            # output_states: a tuple (output_state_fw, output_state_bw).
            encoder_outputs, encoder_output_states = \
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw,
                    inputs=emb_encoder_inputs,
                    dtype=tf.float32,
                    sequence_length=article_lens)
            # encoder_outputs: [batch_size, max_time, 2 * output_size]
            self._enc_outputs = tf.concat(encoder_outputs, axis=2)
            # [batch_size, 2 * output_size]
            encoder_state_fw, _ = encoder_output_states

        with tf.variable_scope("output_projection"):
            w = tf.get_variable(
                "w", [hps.num_hidden, vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            v = tf.get_variable(
                "b", [vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope("decoder"):
            loop_function = None
            if hps.mode == "test":
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)
            decoder_cell = LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)
            # Convert the inputs to the format attention_decoder expects:
            # [seq_len, batch, emb_dim] => seq_len * [batch, emb_dim]
            emb_decoder_inputs = tf.unstack(
                tf.transpose(emb_decoder_inputs, perm=[1, 0, 2]))
            # [batch, cell_size]
            self._dec_in_state = encoder_state_fw
            initial_state_attention = (hps.mode == 'test')
            # decoder_outputs: seq_len * [batch, hidden_size]
            # self._dec_out_state: [batch, state_size] = [batch, 2 * cell_size]
            decoder_outputs, self._dec_out_state = attention_decoder(
                decoder_inputs=emb_decoder_inputs,
                initial_state=self._dec_in_state,
                attention_states=self._enc_outputs,
                cell=decoder_cell,
                num_heads=1,
                loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope("output"):
            # [batch * seq_len, vsize]
            output = tf.reshape(
                tf.stack(values=decoder_outputs, axis=1),
                [-1, hps.num_hidden])
            logits = tf.matmul(output, w) + v
            # seq_len * [batch, vsize]
            model_outputs = tf.unstack(
                tf.reshape(logits, [-1, hps.dec_timesteps, vocab_size]),
                axis=1)
            # This can also be written with a shared output layer:
            # model_outputs = []
            # for i in range(len(decoder_outputs)):
            #     if i > 0:
            #         tf.get_variable_scope().reuse_variables()
            #     model_outputs.append(
            #         tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        with tf.variable_scope("loss"):
            # logits: seq_len * [batch_size, vsize]
            # targets: seq_len * [batch_size]
            # weights: seq_len * [batch_size]; the weights act as a mask.
            # 1. sequence_loss first calls sequence_loss_by_example to get a
            #    [batch_size]-dim loss, then divides by batch_size.
            # 2. sequence_loss_by_example uses the weights as a mask to get
            #    the average loss per actual time step. Because sentences in
            #    a batch differ in length, the weights are initialized to
            #    zeros and then filled with ones up to each sentence's length.
            self._loss = sequence_loss(
                logits=model_outputs, targets=targets, weights=loss_weights)

        if hps.mode == "test":
            with tf.variable_scope("decode_output"):
                # seq_len * [batch, vsize] => seq_len * [batch, 1]
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                # [batch, seq_len]
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [-1, 1]) for x in best_outputs])
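# The `_extract_argmax_and_embed` helper referenced in the decoder scope above
# is not shown in this snippet. A sketch of the conventional implementation,
# following the loop_function contract of the legacy attention_decoder (the
# actual helper in this codebase may differ):
def _extract_argmax_and_embed(embedding, output_projection=None,
                              update_embedding=True):
    def loop_function(prev, _):
        if output_projection is not None:
            w, b = output_projection
            # Project the raw decoder output to vocabulary logits.
            prev = tf.nn.xw_plus_b(prev, w, b)
        # Greedily pick the most likely token and feed its embedding back in.
        prev_symbol = tf.argmax(prev, 1)
        emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
        if not update_embedding:
            # Stop gradients so decoding-time inputs don't train the embedding.
            emb_prev = tf.stop_gradient(emb_prev)
        return emb_prev
    return loop_function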
def inference(self):
    self.logger.debug('Inference')
    self.logger.debug('Imported seq2seq model from TF')
    with tf.variable_scope("encoder"):
        enc_cell_fw = self.build_multirnn_block(self.rnn_size,
                                                self.enc_rnn_layers,
                                                self.cell_type)
        # enc_cell_bw = self.build_multirnn_block(self.rnn_size,
        #                                         self.enc_rnn_layers,
        #                                         self.cell_type)
        self.enc_zero_fw = enc_cell_fw.zero_state(self.batch_size, tf.float32)
        # self.enc_zero_bw = enc_cell_bw.zero_state(self.batch_size,
        #                                           tf.float32)
        self.logger.debug('Initialize encoder')
        # inputs = batch_norm(self.encoder_inputs, is_training=self.infer)
        # inputs = []
        # for tensor in self.encoder_inputs:
        #     inputs.append(batch_norm(tensor, is_training=self.infer))
        # enc_out_orig, enc_state_fw, enc_state_bw = static_bidirectional_rnn(
        #     cell_fw=enc_cell_fw, cell_bw=enc_cell_bw, inputs=inputs,
        #     initial_state_fw=self.enc_zero_fw,
        #     initial_state_bw=self.enc_zero_bw,
        #     sequence_length=self.seq_length
        # )
        enc_out, enc_state_fw = static_rnn(cell=enc_cell_fw,
                                           inputs=self.encoder_inputs,
                                           initial_state=self.enc_zero_fw,
                                           sequence_length=self.seq_length)
        # enc_out = []
        # for tensor in enc_out_orig:
        #     enc_out.append(batch_norm(tensor, is_training=self.infer))

    # This op is created to visualize the thought vectors.
    self.enc_state_fw = enc_state_fw
    # self.enc_state_bw = enc_state_bw
    self.logger.info('enc out (len {}) tensors shape: {}'.format(
        len(enc_out), enc_out[0].get_shape()))
    self.encoder_state_summaries_fw = histogram_summary(
        'encoder_state_fw', enc_state_fw)
    # self.encoder_state_summaries_bw = histogram_summary(
    #     'encoder_state_bw', enc_state_bw)

    dec_cell = self.build_multirnn_block(self.rnn_size,
                                         self.dec_rnn_layers,
                                         self.cell_type)
    if self.dropout > 0:
        self.logger.info('Applying dropout {} to decoder'.format(
            self.dropout))
        dec_cell = tf.contrib.rnn.DropoutWrapper(
            dec_cell, input_keep_prob=self.keep_prob)
    dec_cell = tf.contrib.rnn.OutputProjectionWrapper(
        dec_cell, self.parameters_length)

    if self.infer:
        # At inference time, feed each decoder output back as the next input.
        def loop_function(prev, _):
            return prev
    else:
        loop_function = None

    # First calculate a concatenation of encoder outputs to put attention on.
    # assert enc_cell_fw.output_size == enc_cell_bw.output_size
    # top_states = [
    #     tf.reshape(e, [-1, 1, enc_cell_fw.output_size +
    #                    enc_cell_bw.output_size])
    #     for e in enc_out
    # ]
    top_states = [
        tf.reshape(e, [-1, 1, enc_cell_fw.output_size]) for e in enc_out
    ]
    attention_states = tf.concat(top_states, 1)

    self.logger.debug('Initialize decoder')
    ##########################################################################
    # Code from renzhe0009 @ StackOverflow
    # http://stackoverflow.com/q/42703140/7390416
    # License: MIT
    # Because published after March 2016 - meta.stackexchange.com/q/272956
    # enc_state_c = tf.concat(values=(enc_state_fw.c, enc_state_bw.c), axis=1)
    # enc_state_h = tf.concat(values=(enc_state_fw.h, enc_state_bw.h), axis=1)
    # enc_state_c = enc_state_fw.c + enc_state_bw.c
    # enc_state_h = enc_state_fw.h + enc_state_bw.h
    # enc_state = LSTMStateTuple(c=enc_state_c, h=enc_state_h)
    ##########################################################################
    dec_out, dec_state = attention_decoder(
        self.decoder_inputs, enc_state_fw,
        cell=dec_cell,
        attention_states=attention_states,
        loop_function=loop_function)

    # Apply sigmoid activation to the decoder outputs.
    dec_out = tf.sigmoid(dec_out)

    # Merge outputs into a tensor and transpose to be [B, seq_length, out_dim].
    dec_outputs = tf.transpose(tf.stack(dec_out), [1, 0, 2])
    self.logger.info('dec outputs shape: {}'.format(dec_outputs.get_shape()))

    # Decoder outputs summaries.
    split_dec_out = tf.split(dec_outputs, self.parameters_length,
                             axis=2, name='decoder_parameter')
    for split_tensor in split_dec_out:
        self.decoder_outputs_summaries.append(
            histogram_summary(split_tensor.name, split_tensor))

    # Separate decoder output into parameters and flags
    # params_in, flags_in = tf.split(dec_outputs, [42, 2], axis=2)
    self.encoder_vars = {}
    for tvar in tf.trainable_variables():
        if 'char_embedding' in tvar.name or 'encoder' in tvar.name:
            self.encoder_vars[tvar.name] = tvar
            print('tvar: ', tvar.name)
    return dec_outputs
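# `build_multirnn_block` is referenced above but defined elsewhere in this
# class. A plausible sketch, assuming `cell_type` names a tf.contrib.rnn cell
# class such as 'LSTMCell' or 'GRUCell' (the real implementation may differ):
def build_multirnn_block(self, rnn_size, rnn_layers, cell_type):
    # Build one fresh cell per layer.
    cells = [getattr(tf.contrib.rnn, cell_type)(rnn_size)
             for _ in range(rnn_layers)]
    if rnn_layers == 1:
        return cells[0]
    return tf.contrib.rnn.MultiRNNCell(cells)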