def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs,
                                cell, num_encoder_symbols, num_decoder_symbols,
                                embedding_size, beam_size, num_layers=1,
                                num_heads=1, feed_previous=False,
                                dtype=dtypes.float32, scope=None,
                                initial_state_attention=True):
  """Embedding sequence-to-sequence model with attention.

  Args:
    encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    encoder_mask: The mask of input sentences denoting padding positions.
    decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: Integer; number of symbols on the encoder side.
    num_decoder_symbols: Integer; number of symbols on the decoder side.
    embedding_size: Integer, the length of the embedding vector for each
      symbol.
    beam_size: Integer, the beam size used in beam search.
    num_layers: Integer, number of layers; passed through to the attention
      decoder.
    num_heads: Number of attention heads that read from attention_states.
    feed_previous: Boolean; if True, only the first of decoder_inputs will be
      used (the "GO" symbol).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.

  Returns:
    A tuple of the form (outputs, state, symbols), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size].
      state: The state of each decoder cell at the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
      symbols: A list of target word ids, the best results returned by beam
        search.
  """
  with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
    # Word embeddings of source words, shared by both directions of the
    # bidirectional encoder.
    embedding = variable_scope.get_variable(
        "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))

    # Wrap the encoder cell with the embedding lookup.
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, embedding_classes=num_encoder_symbols,
        embedding_size=embedding_size, embedding=embedding)

    # Actual source sentence lengths, derived from the padding mask.
    encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

    # Encode the source sentences with a bidirectional RNN.
    encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
        encoder_cell, encoder_cell, encoder_inputs,
        sequence_length=encoder_lens, dtype=dtype)
    assert encoder_cell._embedding is embedding

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                  for e in encoder_outputs]
    attention_states = array_ops.concat(1, top_states)

    # Decoder.
    output_size = None
    return embedding_attention_decoder(
        encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
        num_decoder_symbols, embedding_size, beam_size=beam_size,
        num_heads=num_heads, output_size=output_size, num_layers=num_layers,
        feed_previous=feed_previous,
        initial_state_attention=initial_state_attention)
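# ---------------------------------------------------------------------------
# Example usage (a sketch, not part of the original file). The names
# `enc_ids`, `enc_mask`, and `dec_ids` are hypothetical placeholders the
# caller would build, and the vocabulary / cell sizes are illustrative only.
#
#   cell = rnn_cell.GRUCell(512)
#   outputs, state, symbols = embedding_attention_seq2seq(
#       enc_ids,             # list of [batch_size] int32 token-id tensors
#       enc_mask,            # [batch_size x max_src_len] 0/1 padding mask
#       dec_ids,             # list of [batch_size] int32 token-id tensors
#       cell,
#       num_encoder_symbols=30000,
#       num_decoder_symbols=30000,
#       embedding_size=512,
#       beam_size=10,
#       feed_previous=True)  # beam-search decoding instead of teacher forcing
# ---------------------------------------------------------------------------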
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=tf.float32,
                                scope=None):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of
  shape [num_encoder_symbols x cell.input_size]). Then it runs an RNN to
  encode embedded encoder_inputs into a state vector. It keeps the outputs of
  this RNN at every step to use for attention later. Next, it embeds
  decoder_inputs by another newly created embedding (of shape
  [num_decoder_symbols x cell.input_size]). Then it runs the attention
  decoder, initialized with the last encoder state, on embedded
  decoder_inputs, attending to encoder outputs.

  Args:
    encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    num_heads: number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_decoder_symbols] and B has
      shape [num_decoder_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and B will be added.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in
      embedding_rnn_decoder). If False, decoder_inputs are used as given (the
      standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated
      outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step. Each
      item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  with tf.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder.
    encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
    encoder_outputs, encoder_states = rnn.rnn(
        encoder_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [tf.reshape(e, [-1, 1, cell.output_size])
                  for e in encoder_outputs]
    attention_states = tf.concat(1, top_states)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols

    if isinstance(feed_previous, bool):
      return embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection,
          feed_previous)
    else:
      # If feed_previous is a Tensor, we construct 2 graphs and use cond.
      outputs1, states1 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection,
          True)
      tf.get_variable_scope().reuse_variables()
      outputs2, states2 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection,
          False)
      outputs = tf.control_flow_ops.cond(feed_previous,
                                         lambda: outputs1, lambda: outputs2)
      states = tf.control_flow_ops.cond(feed_previous,
                                        lambda: states1, lambda: states2)
      return outputs, states
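# ---------------------------------------------------------------------------
# Example usage (a sketch, not from the original file): switching between
# teacher forcing and self-feeding at run time by passing feed_previous as a
# scalar boolean placeholder. `enc_in` and `dec_in` are hypothetical lists of
# embedded [batch_size x cell.input_size] tensors prepared by the caller.
#
#   feed_prev = tf.placeholder(tf.bool, shape=[], name="feed_previous")
#   cell = rnn_cell.GRUCell(256)
#   outputs, states = embedding_attention_seq2seq(
#       enc_in, dec_in, cell,
#       num_encoder_symbols=40000, num_decoder_symbols=40000,
#       feed_previous=feed_prev)
# ---------------------------------------------------------------------------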
def embedding_attention_seq2seq(encoder_inputs_1, encoder_inputs_2,
                                encoder_mask_1, encoder_mask_2,
                                decoder_inputs, cell,
                                num_encoder_symbols_1,
                                num_encoder_symbols_2,
                                num_decoder_symbols,  # added by al
                                embedding_size,
                                beam_size,  # added by shiyue
                                constant_emb_en,  # added by al
                                constant_emb_fr,  # added by al
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=dtypes.float32,
                                scope=None,
                                # initial_state_attention=False  # annotated by yfeng
                                initial_state_attention=True  # added by yfeng
                                ):
  """Embedding sequence-to-sequence model with attention and two encoders.

  This model first embeds encoder_inputs_1 and encoder_inputs_2 by two newly
  created embeddings (of shapes [num_encoder_symbols_1 x embedding_size] and
  [num_encoder_symbols_2 x embedding_size]). Then it runs a bidirectional RNN
  over each embedded input sequence to encode it into a state vector, keeping
  the outputs of both RNNs at every step to use for attention later. Next, it
  embeds decoder_inputs by another newly created embedding (of shape
  [num_decoder_symbols x embedding_size]). Then it runs the attention
  decoder, initialized with a combination of the two final encoder states, on
  embedded decoder_inputs, attending to both sets of encoder outputs.

  Args:
    encoder_inputs_1: A list of 1D int32 Tensors of shape [batch_size] for
      the first source sentence.
    encoder_inputs_2: A list of 1D int32 Tensors of shape [batch_size] for
      the second source sentence.
    encoder_mask_1: The mask of the first source sentence denoting padding
      positions.
    encoder_mask_2: The mask of the second source sentence denoting padding
      positions.
    decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols_1: Integer; number of symbols on the first encoder
      side.
    num_encoder_symbols_2: Integer; number of symbols on the second encoder
      side.
    num_decoder_symbols: Integer; number of symbols on the decoder side.
    embedding_size: Integer, the length of the embedding vector for each
      symbol.
    beam_size: Integer, the beam size used in beam search.
    constant_emb_en: Pre-trained embedding matrix used as the constant
      (non-trainable) initializer of the first encoder embedding.
    constant_emb_fr: Pre-trained embedding matrix used as the constant
      (non-trainable) initializer of the second encoder embedding.
    num_heads: Number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [output_size x num_decoder_symbols] and B has shape
      [num_decoder_symbols]; if provided and feed_previous=True, each fed
      previous output will first be multiplied by W and B will be added.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in
      embedding_rnn_decoder). If False, decoder_inputs are used as given (the
      standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x num_decoder_symbols] containing the generated
        outputs.
      state: The state of each decoder cell at the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
  """
  with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder.
    # annotated by yfeng
    """
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, embedding_classes=num_encoder_symbols,
        embedding_size=embedding_size)
    encoder_outputs, encoder_state = rnn.rnn(
        encoder_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])
                  for e in encoder_outputs]
    """
    # start by yfeng
    # sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
    '''
    embedding_1 = variable_scope.get_variable(
        "embedding_1", [num_encoder_symbols_1, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        # annotated by yfeng
    embedding_2 = variable_scope.get_variable(  # added by al
        "embedding_2", [num_encoder_symbols_2, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        # annotated by yfeng
    '''
    # Constant (non-trainable) source embeddings initialized from pre-trained
    # vectors.
    embedding_1 = variable_scope.get_variable(
        "embedding_1", [num_encoder_symbols_1, embedding_size], dtype=dtype,
        trainable=False,
        initializer=init_ops.constant_initializer(constant_emb_en))
    embedding_2 = variable_scope.get_variable(  # added by al
        "embedding_2", [num_encoder_symbols_2, embedding_size], dtype=dtype,
        trainable=False,
        initializer=init_ops.constant_initializer(constant_emb_fr))
    # initializer=init_ops.random_normal_initializer(0, 0.01, seed=1.0))
    # change from uniform to normal by yfeng

    # Actual source sentence lengths, derived from the padding masks.
    encoder_lens_1 = math_ops.reduce_sum(encoder_mask_1, [1])
    encoder_lens_2 = math_ops.reduce_sum(encoder_mask_2, [1])

    with variable_scope.variable_scope("encoder_1"):
      encoder_cell_1 = rnn_cell.EmbeddingWrapper(
          cell, embedding_classes=num_encoder_symbols_1,
          embedding_size=embedding_size, embedding=embedding_1)
      encoder_outputs_1, _, encoder_state_1 = rnn.bidirectional_rnn(
          encoder_cell_1, encoder_cell_1, encoder_inputs_1,
          sequence_length=encoder_lens_1, dtype=dtype)
    with variable_scope.variable_scope("encoder_2"):
      encoder_cell_2 = rnn_cell.EmbeddingWrapper(
          cell, embedding_classes=num_encoder_symbols_2,
          embedding_size=embedding_size, embedding=embedding_2)
      encoder_outputs_2, _, encoder_state_2 = rnn.bidirectional_rnn(
          encoder_cell_2, encoder_cell_2, encoder_inputs_2,
          sequence_length=encoder_lens_2, dtype=dtype)

    # Combine the two final encoder states. The mixing weights were left
    # undefined in the original code ("this can be changed"); an equal
    # average is used here as a placeholder.
    alpha, beta = 0.5, 0.5
    encoder_state = alpha * encoder_state_1 + beta * encoder_state_2

    assert encoder_cell_1._embedding is embedding_1
    assert encoder_cell_2._embedding is embedding_2

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states_1 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                    for e in encoder_outputs_1]
    top_states_2 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                    for e in encoder_outputs_2]
    # end by yfeng
    attention_states_1 = array_ops.concat(1, top_states_1)
    attention_states_2 = array_ops.concat(1, top_states_2)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols

    if isinstance(feed_previous, bool):
      return embedding_attention_decoder(
          encoder_mask_1, encoder_mask_2, decoder_inputs, encoder_state,
          attention_states_1, attention_states_2, cell, num_decoder_symbols,
          embedding_size,
          beam_size=beam_size,  # added by shiyue
          constant_emb_fr=constant_emb_fr,  # added by al
          num_heads=num_heads, output_size=output_size,
          output_projection=output_projection, feed_previous=feed_previous,
          initial_state_attention=initial_state_attention)

    # If feed_previous is a Tensor, we construct 2 graphs and use cond.
    def decoder(feed_previous_bool):
      reuse = None if feed_previous_bool else True
      with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                         reuse=reuse):
        outputs, state, _ = embedding_attention_decoder(
            encoder_mask_1, encoder_mask_2,  # modified by shiyue
            decoder_inputs, encoder_state, attention_states_1,
            attention_states_2, cell, num_decoder_symbols, embedding_size,
            beam_size=beam_size,  # added by shiyue
            constant_emb_fr=constant_emb_fr,  # added by al
            num_heads=num_heads, output_size=output_size,
            output_projection=output_projection,
            feed_previous=feed_previous_bool,
            update_embedding_for_previous=False,
            initial_state_attention=initial_state_attention)
        state_list = [state]
        if nest.is_sequence(state):
          state_list = nest.flatten(state)
        return outputs + state_list

    outputs_and_state = control_flow_ops.cond(feed_previous,
                                              lambda: decoder(True),
                                              lambda: decoder(False))
    outputs_len = len(decoder_inputs)  # Outputs length same as decoder inputs.
    state_list = outputs_and_state[outputs_len:]
    state = state_list[0]
    if nest.is_sequence(encoder_state):
      state = nest.pack_sequence_as(structure=encoder_state,
                                    flat_sequence=state_list)
    return outputs_and_state[:outputs_len], state
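# ---------------------------------------------------------------------------
# Example usage (a sketch, not from the original file): the two-encoder
# variant consumes two source sentences plus their padding masks and two
# pre-trained embedding matrices. All names below (`en_ids`, `fr_ids`,
# `en_mask`, `fr_mask`, `emb_en`, `emb_fr`, `dec_ids`) are hypothetical
# placeholders the caller would construct, e.g. emb_en / emb_fr as numpy
# arrays of shape [vocab_size, embedding_size].
#
#   cell = rnn_cell.GRUCell(512)
#   outputs, state = embedding_attention_seq2seq(
#       en_ids, fr_ids, en_mask, fr_mask, dec_ids, cell,
#       num_encoder_symbols_1=30000, num_encoder_symbols_2=30000,
#       num_decoder_symbols=30000, embedding_size=512, beam_size=12,
#       constant_emb_en=emb_en, constant_emb_fr=emb_fr,
#       feed_previous=False)
# ---------------------------------------------------------------------------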
def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs,
                                cell, num_encoder_symbols, num_decoder_symbols,
                                embedding_size, beam_size,
                                output_projection=None, num_layers=1,
                                feed_previous=False, dtype=dtypes.float32,
                                scope=None, initial_state_attention=True):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of
  shape [num_encoder_symbols x input_size]). Then it runs a bidirectional RNN
  to encode embedded encoder_inputs into a state vector. It keeps the outputs
  of this bidirectional RNN at every step to use for attention later. Next,
  it embeds decoder_inputs by another newly created embedding (of shape
  [num_decoder_symbols x input_size]). Then it runs the attention decoder,
  initialized with the last encoder state, on embedded decoder_inputs,
  attending to encoder outputs.

  Args:
    encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    encoder_mask: The mask of encoder inputs that labels the PAD positions.
    decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: Integer; number of symbols on the encoder side.
    num_decoder_symbols: Integer; number of symbols on the decoder side.
    embedding_size: Integer, the length of the embedding vector for each
      symbol.
    beam_size: Integer, the beam size used in beam search.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [output_size x num_decoder_symbols] and B has shape
      [num_decoder_symbols]; if provided and feed_previous=True, each fed
      previous output will first be multiplied by W and B will be added.
    num_layers: Integer, number of layers; passed through to the attention
      decoder.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in
      embedding_rnn_decoder). If False, decoder_inputs are used as given (the
      standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.

  Returns:
    A tuple of the form (outputs, state, symbols), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x num_decoder_symbols] containing the generated
        outputs.
      state: The state of each decoder cell at the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
      symbols: When training, it is []; when decoding, it is the best
        translation generated by beam search.
  """
  with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
    # Word embeddings of source words.
    embedding = variable_scope.get_variable(
        "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))

    # Wrap encoder cell with embedding.
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, embedding_classes=num_encoder_symbols,
        embedding_size=embedding_size, embedding=embedding)

    # Get the sentence lengths of source sentences.
    encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

    # Encode source sentences with a bidirectional_rnn encoder.
    encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
        encoder_cell, encoder_cell, encoder_inputs,
        sequence_length=encoder_lens, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                  for e in encoder_outputs]
    attention_states = array_ops.concat(top_states, 1)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols

    return embedding_attention_decoder(
        encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
        num_decoder_symbols, embedding_size, beam_size=beam_size,
        output_size=output_size, output_projection=output_projection,
        num_layers=num_layers, feed_previous=feed_previous,
        initial_state_attention=initial_state_attention)
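# ---------------------------------------------------------------------------
# Example usage (a sketch, not from the original file): decoding with beam
# search and an explicit output projection. `src_ids`, `src_mask`, `tgt_ids`,
# `proj_w`, and `proj_b` are hypothetical placeholders/variables the caller
# would create; when output_projection is given, the decoder cell is not
# wrapped with OutputProjectionWrapper, so its raw outputs are projected to
# the vocabulary outside the model.
#
#   cell = rnn_cell.GRUCell(1000)
#   outputs, state, symbols = embedding_attention_seq2seq(
#       src_ids, src_mask, tgt_ids, cell,
#       num_encoder_symbols=30000, num_decoder_symbols=30000,
#       embedding_size=620, beam_size=10,
#       output_projection=(proj_w, proj_b),
#       feed_previous=True)   # `symbols` then holds the beam-search result
# ---------------------------------------------------------------------------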