Example #1
def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols, embedding_size,
                                beam_size, num_layers=1, num_heads=1, feed_previous=False, dtype=dtypes.float32,
                                scope=None, initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    Args:
        encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        encoder_mask: The mask of input sentences denoting padding positions.
        decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        num_encoder_symbols: Integer; number of symbols on the encoder side.
        num_decoder_symbols: Integer; number of symbols on the decoder side.
        embedding_size: Integer, the length of the embedding vector for each symbol.
        beam_size: Integer, the beam size used in beam search.
        num_layers: Integer, the number of RNN layers (passed through to the decoder).
        num_heads: Number of attention heads that read from attention_states.
        feed_previous: Boolean, if True, only the first of decoder_inputs will be used (the "GO" symbol).
        dtype: The dtype of the initial RNN state (default: tf.float32).
        scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq".
        initial_state_attention: If False, initial attentions are zero. If True
            (the default here), initialize the attentions from the initial state
            and attention states.

    Returns:
        A tuple of the form (outputs, state, symbols), where:
            outputs: A list of the same length as decoder_inputs of 2D Tensors of
                  shape [batch_size x output_size].
            state: The state of each decoder cell at the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].
            symbols: A list of target word ids, the best result returned by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        embedding = variable_scope.get_variable(
                "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
                initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        encoder_cell = rnn_cell.EmbeddingWrapper(cell, embedding_classes=num_encoder_symbols,
                embedding_size=embedding_size, embedding=embedding)

        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
                encoder_cell, encoder_cell, encoder_inputs, sequence_length=encoder_lens, dtype=dtype)
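        # With sequence_length set above, the RNN stops updating each example at
        # its true length: PAD steps emit zero outputs and do not affect the
        # final encoder state.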

        assert encoder_cell._embedding is embedding

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size]) for e in encoder_outputs]
        attention_states = array_ops.concat(1, top_states)
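        # attention_states has shape [batch_size, src_len, 2 * cell.output_size]:
        # one forward+backward vector per source position for attention to read.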

        # Decoder.
        output_size = None

        return embedding_attention_decoder(encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
                                           num_decoder_symbols, embedding_size, beam_size=beam_size,
                                           num_heads=num_heads, output_size=output_size, num_layers=num_layers,
                                           feed_previous=feed_previous,
                                           initial_state_attention=initial_state_attention)
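
As a point of reference for the inputs above, here is a small self-contained sketch (numpy only; the ids, sizes, and the PAD/GO/EOS ids are made up for illustration) of the data layout this function expects: a 0/1 mask over source positions plus time-major lists of id vectors, one per time-step.

import numpy as np

src_len, tgt_len = 5, 4
# Two source sentences with true lengths 3 and 2, padded with id 0.
src_ids = np.array([[4, 7, 9, 0, 0],
                    [5, 8, 0, 0, 0]], dtype=np.int32)
encoder_mask = (src_ids != 0).astype(np.int32)            # [batch_size, src_len]
encoder_inputs = [src_ids[:, t] for t in range(src_len)]  # src_len vectors of shape [batch_size]
encoder_lens = encoder_mask.sum(axis=1)                   # [3, 2], as reduce_sum(mask, [1]) above

tgt_ids = np.array([[1, 6, 3, 2],                         # 1 = GO, 2 = EOS (made-up ids)
                    [1, 7, 2, 0]], dtype=np.int32)
decoder_inputs = [tgt_ids[:, t] for t in range(tgt_len)]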
Example #2
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=tf.float32,
                                scope=None):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  RNN at every step to use for attention later. Next, it embeds decoder_inputs
  by another newly created embedding (of shape [num_decoder_symbols x
  cell.input_size]). Then it runs attention decoder, initialized with the last
  encoder state, on embedded decoder_inputs and attending to encoder outputs.

  Args:
    encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    num_heads: number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_decoder_symbols] and B has
      shape [num_decoder_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  with tf.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder.
    encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
    encoder_outputs, encoder_states = rnn.rnn(
        encoder_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [tf.reshape(e, [-1, 1, cell.output_size])
                  for e in encoder_outputs]
    attention_states = tf.concat(1, top_states)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols
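      # With this wrapper, decoder outputs are already logits over the decoder
      # vocabulary, so no external (W, B) projection is needed to feed previous
      # outputs back in.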

    if isinstance(feed_previous, bool):
      return embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection,
          feed_previous)
    else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
      outputs1, states1 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection, True)
      tf.get_variable_scope().reuse_variables()
      outputs2, states2 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection, False)

      outputs = tf.control_flow_ops.cond(feed_previous,
                                         lambda: outputs1, lambda: outputs2)
      states = tf.control_flow_ops.cond(feed_previous,
                                        lambda: states1, lambda: states2)
      return outputs, states
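
To make the output_projection argument concrete, here is a small numpy sketch (toy sizes, not taken from the code above) of what the pair (W, B) does to a fed-previous decoder output before it is embedded as the next input.

import numpy as np

batch_size, output_size, num_decoder_symbols = 2, 4, 10
prev_output = np.random.randn(batch_size, output_size)   # raw cell output
W = np.random.randn(output_size, num_decoder_symbols)    # projection weights
B = np.random.randn(num_decoder_symbols)                 # projection biases
logits = prev_output.dot(W) + B                          # [batch_size, num_decoder_symbols]
next_ids = logits.argmax(axis=1)                         # ids embedded as the next decoder input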
Example #3
def embedding_attention_seq2seq(encoder_inputs_1, encoder_inputs_2, encoder_mask_1, encoder_mask_2, decoder_inputs, cell,
                                num_encoder_symbols_1, num_encoder_symbols_2, num_decoder_symbols, # added by al
                                embedding_size,
                                beam_size,  # added by shiyue
                                constant_emb_en, # added by al
                                constant_emb_fr, # added by al
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=True  # default changed from False to True by yfeng
                                ):
    """Embedding sequence-to-sequence model with attention.

    This model embeds encoder_inputs_1 and encoder_inputs_2 with two fixed
    (non-trainable) embeddings of shape [num_encoder_symbols_i x embedding_size],
    runs a bidirectional RNN over each embedded input sequence, and keeps the
    outputs of both encoders for attention. It then embeds decoder_inputs and
    runs an attention decoder, initialized with a weighted combination of the two
    final encoder states, attending to both sets of encoder outputs.

    Args:
      encoder_inputs_1, encoder_inputs_2: Lists of 1D int32 Tensors of shape
        [batch_size], one list per encoder.
      encoder_mask_1, encoder_mask_2: Masks of the two input sequences, marking
        padding positions.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols_1, num_encoder_symbols_2: Integers; vocabulary sizes of
        the two encoder sides.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      beam_size: Integer, the beam size used in beam search.
      constant_emb_en, constant_emb_fr: Fixed embedding matrices used to initialize
        the non-trainable embeddings (constant_emb_fr is also passed to the decoder).
      num_heads: Number of attention heads that read from attention_states.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states.

    Returns:
      A tuple of the form (outputs, state) (when feed_previous is a Python bool,
      the decoder's beam-search symbols are also returned as a third element), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder: both source-side embeddings are fixed (trainable=False) and
        # initialized from the constant embedding matrices passed in, replacing
        # the original randomly initialized, trainable embeddings.
        embedding_1 = variable_scope.get_variable(
                "embedding_1", [num_encoder_symbols_1, embedding_size],
                dtype=dtype,
                trainable=False,
                initializer=init_ops.constant_initializer(constant_emb_en))  # for constant embedding
        embedding_2 = variable_scope.get_variable( # added by al
                "embedding_2", [num_encoder_symbols_2, embedding_size],
                dtype=dtype,
                trainable=False,
                initializer=init_ops.constant_initializer(constant_emb_fr))  # for constant embedding

        encoder_lens_1 = math_ops.reduce_sum(encoder_mask_1, [1])
        encoder_lens_2 = math_ops.reduce_sum(encoder_mask_2, [1])
        
        with variable_scope.variable_scope("encoder_1"):
            encoder_cell_1 = rnn_cell.EmbeddingWrapper(
                    cell, embedding_classes=num_encoder_symbols_1,
                    embedding_size=embedding_size, embedding=embedding_1)
            encoder_outputs_1, _, encoder_state_1 = rnn.bidirectional_rnn(
                    encoder_cell_1, encoder_cell_1, encoder_inputs_1, sequence_length=encoder_lens_1, dtype=dtype)

        
        with variable_scope.variable_scope("encoder_2"):
            encoder_cell_2 = rnn_cell.EmbeddingWrapper(
                    cell, embedding_classes=num_encoder_symbols_2,
                    embedding_size=embedding_size, embedding=embedding_2)
            encoder_outputs_2, _, encoder_state_2 = rnn.bidirectional_rnn(
                    encoder_cell_2, encoder_cell_2, encoder_inputs_2, sequence_length=encoder_lens_2, dtype=dtype)

        # Combine the two final encoder states with mixing weights alpha and beta
        # (0.5 each here as a simple default; other combinations are possible).
        alpha, beta = 0.5, 0.5
        encoder_state = alpha * encoder_state_1 + beta * encoder_state_2

        assert encoder_cell_1._embedding is embedding_1
        assert encoder_cell_2._embedding is embedding_2

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states_1 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                        for e in encoder_outputs_1]
        top_states_2 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                        for e in encoder_outputs_2]
        attention_states_1 = array_ops.concat(1, top_states_1)
        attention_states_2 = array_ops.concat(1, top_states_2)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(encoder_mask_1, encoder_mask_2, 
                                               decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell,
                                               num_decoder_symbols, embedding_size,
                                               beam_size=beam_size,  # added by shiyue
                                               constant_emb_fr=constant_emb_fr, # added by al
                                               num_heads=num_heads,
                                               output_size=output_size, output_projection=output_projection,
                                               feed_previous=feed_previous,
                                               initial_state_attention=initial_state_attention)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
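        # The True branch is built first and creates the decoder variables
        # (reuse=None); the False branch is then built with reuse=True, so the
        # two cond branches share one set of parameters.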
        def decoder(feed_previous_bool):
            reuse = None if feed_previous_bool else True
            with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                               reuse=reuse):
                outputs, state, _ = embedding_attention_decoder(encoder_mask_1, encoder_mask_2,   # modified by shiyue
                                                                decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell,
                                                                num_decoder_symbols, embedding_size,
                                                                beam_size=beam_size,  # added by shiyue
                                                                constant_emb_fr=constant_emb_fr, # added by al
                                                                num_heads=num_heads,
                                                                output_size=output_size,
                                                                output_projection=output_projection,
                                                                feed_previous=feed_previous_bool,
                                                                update_embedding_for_previous=False,
                                                                initial_state_attention=initial_state_attention)
                state_list = [state]
                if nest.is_sequence(state):
                    state_list = nest.flatten(state)
                return outputs + state_list

        outputs_and_state = control_flow_ops.cond(feed_previous,
                                                  lambda: decoder(True),
                                                  lambda: decoder(False))
        outputs_len = len(decoder_inputs)  # Outputs length same as decoder inputs.
        state_list = outputs_and_state[outputs_len:]
        state = state_list[0]
        if nest.is_sequence(encoder_state):
            state = nest.pack_sequence_as(structure=encoder_state,
                                          flat_sequence=state_list)
        return outputs_and_state[:outputs_len], state
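
The bundling of outputs with a flattened state above exists because cond must return the same flat structure from both branches. Here is a plain-Python stand-in (no TensorFlow; the nested state shown is made up) for the slice-and-repack step:

decoder_inputs = ["d0", "d1", "d2"]                 # three decoder time-steps
outputs = ["o0", "o1", "o2"]                        # one output per time-step
state = (("c1", "h1"), ("c2", "h2"))                # nested final state
flat_state = ["c1", "h1", "c2", "h2"]               # nest.flatten(state)
bundled = outputs + flat_state                      # what each cond branch returns

outputs_len = len(decoder_inputs)
recovered_outputs = bundled[:outputs_len]
recovered_flat = bundled[outputs_len:]
recovered_state = (tuple(recovered_flat[0:2]),
                   tuple(recovered_flat[2:4]))      # nest.pack_sequence_as(state, ...)
assert recovered_outputs == outputs and recovered_state == state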
Example #4
def embedding_attention_seq2seq(encoder_inputs,
                                encoder_mask,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                beam_size,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x input_size]). Then it runs a bidirectional RNN to encode
    the embedded encoder_inputs into a state vector. It keeps the outputs of this
    bidirectional RNN at every step to use for attention later. Next, it embeds
    decoder_inputs by another newly created embedding (of shape
    [num_decoder_symbols x input_size]). Then it runs an attention decoder,
    initialized with the last encoder state, on the embedded decoder_inputs,
    attending to the encoder outputs.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      encoder_mask: The mask of encoder inputs, marking which positions are PADs.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      beam_size: Integer, the beam size used in beam search.
      num_layers: Integer, the number of RNN layers (passed through to the decoder).
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False, initial attentions are zero. If True
        (the default here), initialize the attentions from the initial state and
        attention states.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # word embeddings of source words
        embedding = variable_scope.get_variable(
            "embedding", [num_encoder_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        # wrap encoder cell with embedding
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell,
            embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size,
            embedding=embedding)

        # get the sentence lengths of source sentences
        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

        # encode source sentences with a bidirectional_rnn encoder
        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
            encoder_cell,
            encoder_cell,
            encoder_inputs,
            sequence_length=encoder_lens,
            dtype=dtype)
        # First calculate a concatenation of encoder outputs.
        top_states = [
            array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        return embedding_attention_decoder(
            encoder_mask,
            decoder_inputs,
            encoder_state,
            attention_states,
            cell,
            num_decoder_symbols,
            embedding_size,
            beam_size=beam_size,
            output_size=output_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=feed_previous,
            initial_state_attention=initial_state_attention)
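
A minimal usage sketch for the function above, assuming the surrounding module's imports (rnn_cell, etc.) plus import tensorflow as tf for the placeholders; the cell size, sentence lengths, vocabulary sizes, and beam width are illustrative values, not taken from the original project.

import tensorflow as tf

src_len, tgt_len = 40, 50
num_src_words, num_tgt_words = 30000, 30000

encoder_inputs = [tf.placeholder(tf.int32, [None], name="enc%d" % t)
                  for t in range(src_len)]
decoder_inputs = [tf.placeholder(tf.int32, [None], name="dec%d" % t)
                  for t in range(tgt_len)]
encoder_mask = tf.placeholder(tf.int32, [None, src_len], name="enc_mask")
cell = rnn_cell.GRUCell(512)  # any rnn_cell.RNNCell works here

# feed_previous=True builds the decoding graph; symbols holds the beam-search result.
outputs, state, symbols = embedding_attention_seq2seq(
    encoder_inputs, encoder_mask, decoder_inputs, cell,
    num_encoder_symbols=num_src_words, num_decoder_symbols=num_tgt_words,
    embedding_size=512, beam_size=5, feed_previous=True)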