Example #1
 def attention(query):
     """Put attention masks on hidden using hidden_features and query."""
     ds = []  # Results of attention reads will be stored here.
     for a in xrange(num_heads):
         with tf.variable_scope("Attention_%d" % a):
             y = rnn_cell.linear(query, attention_vec_size, True)
             y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y),
                               [2, 3])
             a = tf.nn.softmax(s)
             # Now calculate the attention-weighted vector d.
             d = tf.reduce_sum(
                 tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                 [1, 2])
             ds.append(tf.reshape(d, [-1, attn_size]))
     return ds
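
In math terms, each head a of this closure computes a Bahdanau-style additive attention read over the encoder states. A sketch of what the code implements, where h_t denotes the rows of hidden, W_a the 1-by-1 convolution kernel behind hidden_features[a], U_a q the linear projection of query, and v_a the vector v[a] (notation is mine, not the source's):

\[
e^{a}_{t} = v_a^{\top}\tanh(W_a h_t + U_a q),\qquad
\alpha^{a} = \operatorname{softmax}(e^{a}),\qquad
d^{a} = \sum_{t=1}^{\text{attn\_length}} \alpha^{a}_{t}\,h_t
\]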
Example #2
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   ds = []  # Results of attention reads will be stored here.
   for a in xrange(num_heads):
     with tf.variable_scope("Attention_%d" % a):
       y = rnn_cell.linear(query, attention_vec_size, True)
       y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = tf.reduce_sum(
           v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
       a = tf.nn.softmax(s)
       # Now calculate the attention-weighted vector d.
       d = tf.reduce_sum(
           tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(tf.reshape(d, [-1, attn_size]))
   return ds
Example #3
    def _build_recurrent_model(self, input, state, num_units, **kwargs):
        from rnn_cell import linear
        c, h = array_ops.split(state, 2, 1)
        c = tf.identity(c, name='LSTMCell/c_state')
        h = tf.identity(h, name='LSTMCell/h_state')

        i, j, f, o = array_ops.split(linear([input, h], 4 * num_units, True),
                                     4, 1)

        j = tf.tanh(j, name='LSTMCell/j_input')
        i = tf.sigmoid(i, name='LSTMCell/i_gate')
        f = tf.sigmoid(f, name='LSTMCell/f_gate')
        o = tf.sigmoid(o, name='LSTMCell/o_gate')
        c_ = i * j + f * c
        h_ = o * tf.tanh(c_)

        return h_, array_ops.concat([c_, h_], 1, name='LSTMCell/c_h_states')
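
For reference, the update this method implements is the standard LSTM cell (with no extra forget-gate bias), where the four pre-activation blocks i, j, f, o come from a single affine map over the concatenated input and previous hidden state; the notation below is mine:

\[
\begin{aligned}
[\,i;\,j;\,f;\,o\,] &= W\,[x_t;\,h_{t-1}] + b,\\
c_t &= \sigma(i)\odot\tanh(j) + \sigma(f)\odot c_{t-1},\\
h_t &= \sigma(o)\odot\tanh(c_t).
\end{aligned}
\]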
Example #4
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=tf.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with tf.variable_scope(scope or "attention_decoder"):
    batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = tf.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = tf.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
      hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

    states = [initial_state]

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % a):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = tf.reduce_sum(
              v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
          a = tf.nn.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = tf.reduce_sum(
              tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(tf.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = tf.pack([batch_size, attn_size])
    attns = [tf.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)
    for i in xrange(len(decoder_inputs)):
      if i > 0:
        tf.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with tf.variable_scope("loop_function", reuse=True):
          inp = tf.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, new_state = cell(x, states[-1])
      states.append(new_state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          attns = attention(new_state)
      else:
        attns = attention(new_state)

      with tf.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = tf.stop_gradient(output)
      outputs.append(output)

  return outputs, states
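
A minimal sketch of how this attention_decoder might be wired up, assuming a TF 0.x-era tf.nn.rnn_cell.GRUCell (which still exposes the input_size property the decoder relies on) and made-up sizes; the placeholder standing in for attention_states would normally be the encoder outputs stacked over time:

# Hypothetical sizes and inputs -- not taken from any of the projects above.
num_steps, attn_length, attn_size = 10, 20, 128

cell = tf.nn.rnn_cell.GRUCell(attn_size)
decoder_inputs = [tf.placeholder(tf.float32, [None, cell.input_size])
                  for _ in xrange(num_steps)]
initial_state = tf.placeholder(tf.float32, [None, cell.state_size])
attention_states = tf.placeholder(tf.float32, [None, attn_length, attn_size])

outputs, states = attention_decoder(decoder_inputs, initial_state,
                                    attention_states, cell, num_heads=2)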
Example #5
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=tf.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with tf.variable_scope(scope or "attention_decoder"):
        batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = tf.get_variable("AttnW_%d" % a,
                                [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

        states = [initial_state]

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            for a in xrange(num_heads):
                with tf.variable_scope("Attention_%d" % a):
                    y = rnn_cell.linear(query, attention_vec_size, True)
                    y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y),
                                      [2, 3])
                    a = tf.nn.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = tf.reduce_sum(
                        tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(tf.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = tf.pack([batch_size, attn_size])
        attns = [
            tf.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)
        ]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
        for i in xrange(len(decoder_inputs)):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    inp = tf.stop_gradient(loop_function(prev, i))
            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell.linear([inp] + attns, cell.input_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    attns = attention(new_state)
            else:
                attns = attention(new_state)

            with tf.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear([cell_output] + attns, output_size,
                                         True)
            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = tf.stop_gradient(output)
            outputs.append(output)

    return outputs, states
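
The loop_function hook described in the docstring can be used for greedy decoding. A hypothetical sketch that follows the documented signature loop_function(prev, i) = next, assuming an output projection (W_out, b_out) to vocabulary logits and an embedding matrix for feeding the predicted token back in; those three names are assumptions, not part of the example:

def greedy_loop_function(prev, i):
    # prev is the previous decoder output, [batch_size x cell.output_size].
    logits = tf.nn.xw_plus_b(prev, W_out, b_out)   # project to vocabulary logits
    symbol = tf.argmax(logits, 1)                  # most likely token id per batch row
    # Look up its embedding and feed it back as the next decoder input.
    return tf.nn.embedding_lookup(embedding, symbol)

Passing loop_function=greedy_loop_function to attention_decoder then makes each step consume the previous step's prediction instead of decoder_inputs, except for the first "GO" input.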
Example #6
    def __init__(self, is_training, config):
        self.xs = tf.placeholder(tf.int32, [None, config.num_steps])
        self.ys = tf.placeholder(tf.int32, [None, config.num_steps])

        embedding = tf.get_variable("embedding",
                                    [config.vocab_size, config.hidden_size],
                                    dtype=tf.float32)

        if config.cell_type == 'rnn':
            print 'rnn'
            cell = rnn_cell.LegacyRNNCell(config.hidden_size)
        elif config.cell_type == 'lstm':
            print 'lstm'
            cell = rnn_cell.LegacyLSTMCell(config.hidden_size)
        else:
            print 'gru'
            cell = rnn_cell.LegacyGRUCell(config.hidden_size)

        inputs = tf.nn.embedding_lookup(embedding, self.xs)

        if is_training:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        init_h = tf.zeros([tf.shape(self.xs)[0], config.hidden_size],
                          tf.float32)
        init_c = tf.zeros([tf.shape(self.xs)[0], config.hidden_size],
                          tf.float32)

        input_ta = tf.TensorArray(tf.float32,
                                  config.num_steps,
                                  tensor_array_name='input_array')
        output_ta = tf.TensorArray(tf.float32,
                                   config.num_steps,
                                   tensor_array_name='output_array')
        input_ta = input_ta.unstack(tf.transpose(inputs, [1, 0, 2]))

        def loop_func(t, out_ta, h, c):
            inp_t = input_ta.read(t)
            cell_output, new_h, new_c = cell(inp_t, h, c)
            out_ta = out_ta.write(t, cell_output)
            return t + 1, out_ta, new_h, new_c

        time = tf.constant(0, dtype=tf.int32, name='time')
        loop_vars = (time, output_ta, init_h, init_c)

        result = tf.while_loop(lambda t, *_: t < config.num_steps, loop_func,
                               loop_vars)

        outputs = result[1].stack()
        outputs = tf.transpose(outputs, [1, 0, 2])

        outputs = tf.reshape(outputs, [-1, config.hidden_size])
        logits = rnn_cell.linear(outputs, config.vocab_size, 'logits')
        logits = tf.reshape(
            logits,
            [tf.shape(self.xs)[0], config.num_steps, config.vocab_size])

        loss = tf.contrib.seq2seq.sequence_loss(
            logits, self.ys,
            tf.ones([tf.shape(self.xs)[0], config.num_steps],
                    dtype=tf.float32))

        self.cost = loss

        optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)
        #optimizer = tf.train.AdamOptimizer()
        if not config.clip:
            self.train_op = optimizer.minimize(loss)
        else:
            trainable_variables = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, trainable_variables), 5)

            self.train_op = optimizer.apply_gradients(
                zip(grads, trainable_variables))
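
A rough sketch of driving this model in a session, assuming the constructor above belongs to a class called Model and that config provides num_steps, vocab_size and the other fields used in __init__; the class name, the batch size of 8 and the random integer data are stand-ins, not part of the example:

import numpy as np

model = Model(is_training=True, config=config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Random token ids shaped [batch_size, num_steps], just to exercise the graph.
    xs = np.random.randint(0, config.vocab_size, size=(8, config.num_steps))
    ys = np.random.randint(0, config.vocab_size, size=(8, config.num_steps))
    cost, _ = sess.run([model.cost, model.train_op],
                       feed_dict={model.xs: xs, model.ys: ys})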
Example #7
 def _build_recurrent_model(self, input, state, num_units, **kwargs):
     from rnn_cell import linear
     hidden_state = tf.tanh(
         linear([input, state], num_units, True, scope='BasicRNN'),
         'BasicRNN/hidden_state')
     return hidden_state, hidden_state
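
This is the vanilla RNN recurrence: linear concatenates the current input with the previous state before a single affine map, and the same tanh activation is returned both as the cell output and as the next state (notation mine):

\[
h_t = \tanh\big(W\,[x_t;\,h_{t-1}] + b\big)
\]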