Example #1
  def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
      prev = tf.nn.xw_plus_b(
          prev, output_projection[0], output_projection[1])
    # prev= prev.get_shape().with_rank(2)[1]

    # Log-probabilities over the vocabulary for the current step.
    probs = tf.log(tf.nn.softmax(prev))

    if i > 1:
        # After the first step, add the running beam scores so candidates are
        # ranked by full-hypothesis log-probability, then flatten all
        # beam_size * num_symbols continuations into one axis.
        probs = tf.reshape(probs + log_beam_probs[-1],
                           [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols # Which word in vocabulary.
    beam_parent = indices // num_symbols # Which hypothesis it came from.


    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.

    emb_prev = tf.nn.embedding_lookup(embedding, symbols)
    emb_prev = tf.reshape(emb_prev, [beam_size, embedding_size])
    if not update_embedding:
      emb_prev = tf.stop_gradient(emb_prev)
    return emb_prev
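
The bookkeeping above (add the running beam scores, flatten to beam_size * num_symbols candidates, take the top k, then recover the word with % and the parent hypothesis with //) can be seen in isolation in plain NumPy. The following is only an illustrative sketch with made-up sizes, not part of the original code:

import numpy as np

beam_size, num_symbols = 3, 5
# Per-beam log-probabilities for the next symbol, plus the running beam scores.
step_log_probs = np.log(np.random.dirichlet(np.ones(num_symbols), size=beam_size))
running_scores = np.array([-0.1, -0.7, -1.2]).reshape(-1, 1)

total = (step_log_probs + running_scores).reshape(-1)   # [beam_size * num_symbols]
top_idx = np.argsort(total)[::-1][:beam_size]           # analogue of tf.nn.top_k
symbols = top_idx % num_symbols                         # which word in the vocabulary
parents = top_idx // num_symbols                        # which hypothesis it extends
print(symbols, parents, total[top_idx])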
Example #2
 def extract_argmax_and_embed(prev, _):
   """Loop_function that extracts the symbol from prev and embeds it."""
   if output_projection is not None:
     prev = tf.nn.xw_plus_b(
         prev, output_projection[0], output_projection[1])
   prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
   emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
   return emb_prev
Example #3
 def extract_argmax_and_embed(prev, _):
     """Loop_function that extracts the symbol from prev and embeds it."""
     if output_projection is not None:
          prev = tf.nn.xw_plus_b(prev, output_projection[0],
                                 output_projection[1])
     prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
      emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
     return emb_prev
Example #4
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=10):
  """RNN decoder for the sequence-to-sequence model.
  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number,
        * log_beam_probs, beam_path, beam_symbols are lists used to record the
          beam-search bookkeeping (see Example #1),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
    output_projection: A pair (W, B) used to project cell outputs to vocabulary
      logits before the argmax that produces each output symbol.
    beam_size: Number of beam hypotheses kept at each decoding step.
  Returns:
    A tuple of the form (outputs, state, beam_path, beam_symbols), where:
      outputs: A list of the same length as decoder_inputs of 1D int Tensors
        containing the argmax symbol ids produced at each step.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
        (Note that in some cases, like basic RNN cell or GRU cell, outputs and
         states can be the same. They are different for LSTM cells though.)
      beam_path: 2D int Tensor of shape [-1, beam_size] recording, per step,
        which hypothesis each beam entry was extended from.
      beam_symbols: 2D int Tensor of shape [-1, beam_size] recording, per step,
        which vocabulary symbol each beam entry chose.
  """
  with tf.variable_scope(scope or "rnn_decoder"):
    state = initial_state
    outputs = []
    prev = None
    log_beam_probs, beam_path, beam_symbols = [], [], []
    state_size = int(initial_state.get_shape().with_rank(2)[1])

    for i, inp in enumerate(decoder_inputs):
      if loop_function is not None and prev is not None:
        with tf.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)
      if i > 0:
        tf.get_variable_scope().reuse_variables()

      input_size = inp.get_shape().with_rank(2)[1]
      print(input_size)
      x = inp
      output, state = cell(x, state)

      if loop_function is not None:
        prev = output
      if i == 0:
        # After the first time-step, replicate the state beam_size times so
        # that each beam hypothesis carries its own copy of the decoder state.
        states = []
        for kk in range(beam_size):
          states.append(state)
        state = tf.reshape(tf.concat(axis=0, values=states), [-1, state_size])

      outputs.append(tf.argmax(tf.nn.xw_plus_b(
          output, output_projection[0], output_projection[1]), axis=1))
  return (outputs, state,
          tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(axis=0, values=beam_symbols), [-1, beam_size]))
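
A hedged usage sketch follows. None of the variable names or sizes below come from the original code; they only illustrate how beam_rnn_decoder might be wired up with a GRU cell, an embedding, an output projection, and a beam-search loop_function like the one in Example #1 (assumed here to be produced by a hypothetical factory make_beam_loop_function):

import tensorflow as tf

vocab_size, emb_size, hidden_size, beam_size = 1000, 64, 128, 10

embedding = tf.get_variable("embedding", [vocab_size, emb_size])
proj_w = tf.get_variable("proj_w", [hidden_size, vocab_size])
proj_b = tf.get_variable("proj_b", [vocab_size])
cell = tf.nn.rnn_cell.GRUCell(hidden_size)

# A single GO symbol; the decoder tiles the state to beam_size after step 0.
go_emb = tf.nn.embedding_lookup(embedding, tf.fill([1], 1))
decoder_inputs = [go_emb] * 10      # with a loop_function, only the first is read
initial_state = cell.zero_state(1, tf.float32)

loop_fn = make_beam_loop_function(  # hypothetical factory wrapping Example #1
    embedding, (proj_w, proj_b), beam_size, vocab_size, emb_size)

outputs, state, beam_path, beam_symbols = beam_rnn_decoder(
    decoder_inputs, initial_state, cell, loop_function=loop_fn,
    output_projection=(proj_w, proj_b), beam_size=beam_size)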
Example #5
 def loop_function(prev, _):
   if output_projection is not None:
     prev = tf.nn.xw_plus_b(
         prev, output_projection[0], output_projection[1])
   prev_symbol = tf.argmax(prev, 1)
   # Note that gradients will not propagate through the second parameter of
   # embedding_lookup.
   emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
   if not update_embedding:
     emb_prev = tf.stop_gradient(emb_prev)
   return emb_prev
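
In Examples #2, #3, and #5 the names output_projection, embedding, and update_embedding are free variables, so in the surrounding code this loop_function is presumably returned by an enclosing factory that closes over them. A sketch of that pattern follows; the factory name is illustrative, not from the original code:

import tensorflow as tf

def make_argmax_loop_function(embedding, output_projection=None,
                              update_embedding=True):
  """Returns a loop_function that feeds back the embedding of the argmax symbol."""
  def loop_function(prev, _):
    if output_projection is not None:
      prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1])
    prev_symbol = tf.argmax(prev, 1)
    # Gradients do not propagate through the id argument of embedding_lookup.
    emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
    if not update_embedding:
      emb_prev = tf.stop_gradient(emb_prev)
    return emb_prev
  return loop_function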
Example #6
    def get_cd_update(self, x):
        # contrastive divergence algorithm

        # First, we get the samples of x and h from the probability distribution
        # The sample of x
        x_sample = self.gibbs_sample(x)

        # The sample of hidden nodes, starting from the visible state of x
        h = self.sample(tf.sigmoid(tf.nn.xw_plus_b(x, self.W, self.bh)))
        # The sample of the hidden nodes, starting from the visible state of x_sample
        h_sample = self.sample(
            tf.sigmoid(tf.nn.xw_plus_b(x_sample, self.W, self.bh)))

        # Next, we update the value of W, bh and bv, based on the differences between the samples
        # that we drew and original values
        lr = tf.constant(self.learning_rate,
                         tf.float32)  # the CD learning rate
        size_bt = tf.cast(tf.shape(x)[0], tf.float32)  # batch size
        W_ = tf.multiply(lr / size_bt,
                         tf.subtract(tf.matmul(x, h, transpose_a=True),
                                     tf.matmul(x_sample, h_sample,
                                               transpose_a=True)))
        bv_ = tf.multiply(
            lr / size_bt,
            tf.reduce_sum(tf.subtract(x, x_sample), axis=0, keep_dims=True))
        bh_ = tf.multiply(
            lr / size_bt,
            tf.reduce_sum(tf.subtract(h, h_sample), axis=0, keep_dims=True))

        # When we do sess.run(update), tf will run all 3 updates
        update = [
            self.W.assign_add(W_),
            self.bv.assign_add(bv_),
            self.bh.assign_add(bh_)
        ]

        return update
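
The snippet relies on helper methods self.sample and self.gibbs_sample that are not shown. A common way to implement the Bernoulli sampling step is sketched below as a method on the same class; this is an assumption about the missing helper, not necessarily the author's implementation:

import tensorflow as tf

def sample(self, probs):
    """Draws 0/1 samples; each unit is 1 with the given activation probability."""
    # floor(p + u) with u ~ Uniform(0, 1) equals 1 with probability exactly p.
    return tf.floor(probs + tf.random_uniform(tf.shape(probs), 0, 1))

In a session, running the returned ops (for example sess.run(rbm.get_cd_update(x), feed_dict={x: batch}), where rbm and x are the hypothetical model instance and its input placeholder) applies one contrastive-divergence step.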
Example #7
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                           output_size=None, num_heads=1, loop_function=None,
                           dtype=tf.float32, scope=None,
                           initial_state_attention=False, output_projection=None,
                           beam_size=10):
  """RNN decoder with attention for the sequence-to-sequence model.
  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.
  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number,
        * log_beam_probs, beam_path, beam_symbols are lists used to record the
          beam-search bookkeeping (see Example #1),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.
    output_projection: A pair (W, B) used to project cell outputs to vocabulary
      logits before the argmax that produces each output symbol.
    beam_size: Number of beam hypotheses kept at each decoding step.
  Returns:
    A tuple of the form (outputs, state, beam_path, beam_symbols), where:
      outputs: A list of the same length as decoder_inputs of 1D int Tensors
        containing the argmax symbol ids produced at each step.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
      beam_path: 2D int Tensor of shape [-1, beam_size] recording, per step,
        which hypothesis each beam entry was extended from.
      beam_symbols: 2D int Tensor of shape [-1, beam_size] recording, per step,
        which vocabulary symbol each beam entry chose.
  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with tf.variable_scope(scope or "attention_decoder"):
    batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = tf.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = tf.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(tf.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    print("Initial_state")

    state_size =  int(initial_state.get_shape().with_rank(2)[1])
    states =[]
    for kk in range(1):
        states.append(initial_state)
    state = tf.reshape(tf.concat(axis=0, values=states), [-1, state_size])
    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = tf.reduce_sum(
              v[a] * tf.nn.tanh(hidden_features[a] + y), [2, 3])
          a = tf.nn.softmax(s)
          # Now calculate the attention-weighted vector d. 
          d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,[1, 2])
          # for c in range(ct):
          ds.append(tf.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = tf.stack([batch_size, attn_size])
    attns = [tf.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    if initial_state_attention:
       attns = []
       attns.append(attention(initial_state))
       tmp = tf.reshape(tf.concat(axis=0, values=attns), [-1, attn_size])
       attns = []
       attns.append(tmp)

    log_beam_probs, beam_path, beam_symbols = [], [], []
    for i, inp in enumerate(decoder_inputs):

      if i > 0:
        tf.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None:
        with tf.variable_scope("loop_function", reuse=True):
          if prev is not None:
            inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)

      input_size = inp.get_shape().with_rank(2)[1]
      x = linear([inp] + attns, input_size, True)
      cell_output, state = cell(x, state)

      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with tf.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
        # After the first time-step, replicate the state beam_size times so
        # that each beam hypothesis carries its own copy of the decoder state,
        # and recompute the attention reads for the tiled state.
        states = []
        for kk in range(beam_size):
          states.append(state)
        state = tf.reshape(tf.concat(axis=0, values=states), [-1, state_size])
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          attns = attention(state)

      outputs.append(tf.argmax(tf.nn.xw_plus_b(
          output, output_projection[0], output_projection[1]), axis=1))

  return (outputs, state,
          tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(axis=0, values=beam_symbols), [-1, beam_size]))
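
Once the decoder has run, the returned beam_path and beam_symbols tensors (each reshaped to [-1, beam_size]) can be evaluated and walked backwards to recover a hypothesis. Below is a small NumPy sketch of that backtracking step, under the assumption that row t holds the bookkeeping for decoding step t; the helper is illustrative and not part of the original code:

import numpy as np

def backtrack(beam_path, beam_symbols, beam_index=0):
    """Follows parent pointers from the last step back to the first."""
    tokens = []
    idx = beam_index
    for t in range(beam_symbols.shape[0] - 1, -1, -1):
        tokens.append(int(beam_symbols[t, idx]))
        idx = int(beam_path[t, idx])
    return tokens[::-1]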