Example #1
0
    def build_encoder(self):
        """Build the inference network q(h|X).

        Two ReLU layers feed separate linear heads for ``mu`` and
        ``log_sigma_sq``. ``self.h`` is then sampled as
        ``mu + sigma * eps`` with ``eps`` drawn from a standard normal, and
        histogram summaries are registered for the resulting tensors.
        """
        with tf.variable_scope("encoder"):
            # A leading axis of size 1 is added to the input before layer 1.
            batched_x = tf.expand_dims(self.x, 0)
            self.l1_lin = linear(batched_x, self.embed_dim, bias=True, scope="l1")
            self.l1 = tf.nn.relu(self.l1_lin)

            self.l2_lin = linear(self.l1, self.embed_dim, bias=True, scope="l2")
            self.l2 = tf.nn.relu(self.l2_lin)

            # Linear heads producing the Gaussian parameters.
            self.mu = linear(self.l2, self.h_dim, bias=True, scope="mu")
            self.log_sigma_sq = linear(
                self.l2, self.h_dim, bias=True, scope="log_sigma_sq")

            noise_shape = (1, self.h_dim)
            self.eps = tf.random_normal(noise_shape, 0, 1, dtype=tf.float32)

            # sigma = sqrt(exp(log sigma^2))
            variance = tf.exp(self.log_sigma_sq)
            self.sigma = tf.sqrt(variance)

            # Reparameterised sample: h = mu + sigma * eps.
            scaled_noise = tf.mul(self.sigma, self.eps)
            self.h = tf.add(self.mu, scaled_noise)

            tf.histogram_summary("mu", self.mu)
            tf.histogram_summary("sigma", self.sigma)
            tf.histogram_summary("h", self.h)
            tf.histogram_summary("mu + sigma", self.mu + self.sigma)
 def attention(query):
     """Put attention masks on hidden using hidden_features and query.

     Relies on names bound in the enclosing scope: ``attention_states``,
     ``hidden``, ``attn_length``, ``attention_vec_size``, ``attn_size`` and
     ``env``.

     Args:
       query: decoder state. It is tiled ``attn_length`` times and reshaped
         to ``[-1, attn_length, attention_vec_size]`` so it can be multiplied
         element-wise with ``attention_states``.

     Returns:
       A pair ``(h_attn, attn_weight)``: the tanh-combined attended state and
       the softmax attention weights over the attention positions.
     """
     with vs.variable_scope("Attention"):
         # Attention mask is a softmax of h_in^T*decoder_hidden.
         # Replicate the query for element-wise multiplication.
         dec_hid = array_ops.tile(query, [1, attn_length])
         dec_hid = array_ops.reshape(
             dec_hid, [-1, attn_length, attention_vec_size])
         # Attention weights for every hidden state in the encoder.
         attn_weight = nn_ops.softmax(
             math_ops.reduce_sum(attention_states * dec_hid, [2]))
         # Now calculate the attention-weighted vector (context vector) cc.
         cc = math_ops.reduce_sum(
             array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) *
             hidden, [1, 2])
         # Attended hidden state.
         with vs.variable_scope("AttnW1"):
             term1 = rnn_cell.linear(query, attn_size, False)
         with vs.variable_scope("AttnW2"):
             term2 = rnn_cell.linear(cc, attn_size, False)
         # Environment representation: 2D Tensor of shape
         # [batch_size, env_size]. Compare against None explicitly because
         # TensorFlow does not allow a Tensor to be used as a Python bool.
         if env is not None:
             with vs.variable_scope("Environment"):
                 term3 = rnn_cell.linear(math_ops.to_float(env),
                                         attn_size, False)
             h_attn = math_ops.tanh(term1 + term2 + term3)
         else:
             h_attn = math_ops.tanh(term1 + term2)
     return h_attn, attn_weight
Example #3
0
 def __call__(self, inputs, state, scope=None):
   """Run the parent GRU step, then attend over the encoder outputs.

   The GRU output is projected, scored against ``self.phi_hs``, and the
   scores are normalised along axis 0 to weight ``self.hs``. The weighted
   context and the GRU output are combined through a ReLU layer. The
   state returned by the parent cell is discarded; the combined output is
   returned as both output and state. ``self.attn_map`` is set as a side
   effect.
   """
   gru_out, _ = super(GRUCellAttn, self).__call__(inputs, state, scope)
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn2"):
       gamma_h = tanh(rnn_cell.linear(gru_out, self._num_units, True, 1.0))

     # Raw scores: product with phi_hs summed over axis 2.
     scores = tf.reduce_sum(
         self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
     # Subtract the per-column maximum before exponentiating.
     shifted = scores - tf.reduce_max(
         scores, reduction_indices=0, keep_dims=True)
     exp_scores = tf.exp(shifted)
     # The 1e-6 keeps the denominator away from zero.
     weights = exp_scores / (1e-6 + tf.reduce_sum(
         exp_scores, reduction_indices=0, keep_dims=True))

     context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
     with vs.variable_scope("AttnConcat"):
       merged = rnn_cell.linear(
           [context, gru_out], self._num_units, True, 1.0)
       out = tf.nn.relu(merged)

     first_channel = tf.slice(weights, [0, 0, 0], [-1, -1, 1])
     self.attn_map = tf.squeeze(first_channel)
   return (out, out)
Example #4
0
	def __call__(self, inputs, state, episodic_gate, scope=None):
		"""Run one step of a GRU whose update gate is supplied by the caller.

		Only a reset gate is learned here; ``episodic_gate`` takes the place
		of the update gate when blending the candidate with the old state.
		The new hidden state is returned as both output and state.
		"""
		with vs.variable_scope("MGRUCell"):  # "GRUCell"
			with vs.variable_scope("Gates"):
				# Bias starts at 1.0 so the gate initially does not reset.
				reset_pre = rnn_cell.linear(
					[inputs, state], self._num_units, True, 1.0, scope=scope)
				reset = sigmoid(reset_pre)
			with vs.variable_scope("Candidate"):
				candidate_pre = rnn_cell.linear(
					[inputs, reset * state], self._num_units, True)
				candidate = tanh(candidate_pre)

			# new_h = gate * candidate + (1 - gate) * state
			taken = tf.mul(episodic_gate, candidate)
			kept = tf.mul((1 - episodic_gate), state)
			new_h = taken + kept
		return new_h, new_h
Example #5
0
def iterativeLSTM(inputs, state, num_units, forget_bias, iteration_activation, iteration_count, iteration_prob):
    """Apply one gated LSTM iteration.

    A standard LSTM update is computed ("BasicLSTM", with the four gate
    projections fused into one linear map) and then blended with the previous
    values using ``iteration_activation``, so a unit only exposes a new state
    where its iteration gate is active.

    ``iteration_count`` and ``iteration_prob`` are accepted but not read here.

    Returns:
      ``(new_output, new_state, new_iteration_activation)``.
    """
    c, h = array_ops.split(1, 2, state)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    gates = linear([inputs, h], 4 * num_units, True)
    i, j, f, o = array_ops.split(1, 4, gates)

    retained = c * sigmoid(f + forget_bias)
    written = sigmoid(i) * tanh(j)
    candidate_c = retained + written
    candidate_h = tanh(candidate_c) * sigmoid(o)

    # Where the gate is active the candidate (plus the old h) is exposed;
    # elsewhere the previous h / c pass through untouched.
    active_h = (candidate_h + h) * iteration_activation
    idle_h = h * (1 - iteration_activation)
    new_h = active_h + idle_h

    active_c = candidate_c * iteration_activation
    idle_c = c * (1 - iteration_activation)
    new_c = active_c + idle_c

    new_state = array_ops.concat(1, [new_c, new_h])

    active_out = new_h * iteration_activation
    idle_out = inputs * (1 - iteration_activation)
    new_output = active_out + idle_out

    # The iteration-gate update is driven by a tensor of ones shaped like
    # the inputs rather than by a learned projection.
    gate_evidence = tf.ones(tf.shape(inputs))
    new_iteration_activation = update_iteration_activations(
        iteration_activation, gate_evidence)

    return new_output, new_state, new_iteration_activation
Example #6
0
 def __init__(self, num_units, encoder_output, scope=None):
   """Store the encoder outputs and precompute their tanh projection.

   ``encoder_output`` is kept as ``self.hs``. It is flattened to rows of
   width ``num_units``, passed through a linear layer and tanh, and
   reshaped back to the shape of ``self.hs`` as ``self.phi_hs``. The
   parent constructor is called afterwards.
   """
   self.hs = encoder_output
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn1"):
       flat_hs = tf.reshape(self.hs, [-1, num_units])
       flat_phi = tanh(rnn_cell.linear(flat_hs, num_units, True, 1.0))
       original_shape = tf.shape(self.hs)
       self.phi_hs = tf.reshape(flat_phi, original_shape)
   super(GRUCellAttn, self).__init__(num_units)
Example #7
0
 def downscale(self, inp):
   """Project rows of width ``2 * self.size`` down to ``self.size``.

   The input has its first two axes swapped, is flattened to rows of width
   ``2 * self.size``, linearly projected to ``self.size``, reshaped with
   ``self.batch_size`` leading, swapped back, and passed through tanh.
   """
   with vs.variable_scope("Downscale"):
     swapped = tf.transpose(inp, perm=[1, 0, 2])
     paired = tf.reshape(swapped, [-1, 2 * self.size])
     projected = rnn_cell.linear(paired, self.size, True, 1.0)
     unflattened = tf.reshape(projected, [self.batch_size, -1, self.size])
     restored = tf.transpose(unflattened, perm=[1, 0, 2])
     return tanh(restored)
Example #8
0
 def __init__(self, num_units, encoder_output, scope=None):
     """Keep the encoder outputs and cache their projected form.

     ``self.hs`` holds ``encoder_output`` unchanged. ``self.phi_hs`` is the
     same tensor flattened to rows of width ``num_units``, run through a
     linear layer followed by tanh, and reshaped back to the shape of
     ``self.hs``. The parent constructor runs last.
     """
     self.hs = encoder_output
     cell_scope = scope or type(self).__name__
     with vs.variable_scope(cell_scope):
         with vs.variable_scope("Attn1"):
             rows = tf.reshape(self.hs, [-1, num_units])
             projected_rows = tanh(
                 rnn_cell.linear(rows, num_units, True, 1.0))
             self.phi_hs = tf.reshape(projected_rows, tf.shape(self.hs))
     super(GRUCellAttn, self).__init__(num_units)
Example #9
0
 def attention(query):
     """Score the attention positions against ``query``.

     Uses ``v``, ``hidden_features`` and ``attention_vec_size`` from the
     enclosing scope. The returned scores are ``v * tanh(hidden_features +
     W*query)`` summed over axes 2 and 3; no softmax is applied here.
     """
     with vs.variable_scope("Attention"):
         projected = rnn_cell.linear(query, attention_vec_size, True)
         projected = array_ops.reshape(
             projected, [-1, 1, 1, attention_vec_size])
         activated = math_ops.tanh(hidden_features + projected)
         scores = math_ops.reduce_sum(v * activated, [2, 3])
     return scores
 def attention(query):
     """Compute unnormalised pointer scores for ``query``.

     ``v``, ``hidden_features`` and ``attention_vec_size`` come from the
     enclosing scope. The query is linearly projected, broadcast against
     ``hidden_features``, squashed with tanh, weighted by ``v`` and summed
     over axes 2 and 3. The scores are returned without a softmax.
     """
     with vs.variable_scope("Attention"):
         query_proj = array_ops.reshape(
             rnn_cell.linear(query, attention_vec_size, True),
             [-1, 1, 1, attention_vec_size])
         weighted = v * math_ops.tanh(hidden_features + query_proj)
         return math_ops.reduce_sum(weighted, [2, 3])
Example #11
0
  def testLinear(self):
    """Check the output of `linear` and its variable-sharing rules."""
    with self.test_session() as sess:
      ones_init = tf.constant_initializer(1.0)
      with tf.variable_scope("root", initializer=ones_init):
        x = tf.zeros([1, 2])
        projected = linear([x], 2, False)
        sess.run([tf.initialize_all_variables()])
        fetched = sess.run([projected], {x.name: np.array([[1., 2.]])})
        # With all-ones weights every output is the sum of the fed inputs.
        self.assertAllClose(fetched[0], [[3.0, 3.0]])

        # A second call in the same scope must fail instead of silently
        # sharing variables.
        with self.assertRaises(ValueError):
          linear([x], 2, False)

        # A fresh scope creates new variables, and re-entering it with
        # reuse=True shares them.
        with tf.variable_scope("l1") as new_scope:
          first = linear([x], 2, False)
        with tf.variable_scope(new_scope, reuse=True):
          linear([first], 2, False)
        self.assertEqual(len(tf.trainable_variables()), 2)
Example #12
0
  def testLinear(self):
    """Exercise `linear`: numeric result, duplicate-creation guard, reuse."""
    with self.test_session() as sess:
      with tf.variable_scope("root", initializer=tf.constant_initializer(1.0)):
        x = tf.zeros([1, 2])
        out = linear([x], 2, False)
        init_op = tf.initialize_all_variables()
        sess.run([init_op])
        feed = {x.name: np.array([[1., 2.]])}
        result = sess.run([out], feed)
        self.assertAllClose(result[0], [[3.0, 3.0]])

        # Creating the same variables twice in one scope raises.
        with self.assertRaises(ValueError):
          linear([x], 2, False)

        # New scope: variables are created once, then shared on reuse.
        with tf.variable_scope("l1") as new_scope:
          scoped_out = linear([x], 2, False)
        with tf.variable_scope(new_scope, reuse=True):
          linear([scoped_out], 2, False)
        trainable_count = len(tf.trainable_variables())
        self.assertEqual(trainable_count, 2)
Example #13
0
    def __call__(self, inputs, state, episodic_gate, scope=None):
        """Run a GRU step in which ``episodic_gate`` acts as the update gate.

        A reset gate is learned from ``inputs`` and ``state``; the candidate
        is computed from ``inputs`` and the reset-scaled state; the result is
        ``episodic_gate * candidate + (1 - episodic_gate) * state``, returned
        as both output and new state.
        """
        with vs.variable_scope("MGRUCell"):  # "GRUCell"
            with vs.variable_scope("Gates"):
                # Bias of 1.0 so the gate starts out not resetting.
                gate_logits = rnn_cell.linear(
                    [inputs, state], self._num_units, True, 1.0, scope=scope)
                reset_gate = sigmoid(gate_logits)
            with vs.variable_scope("Candidate"):
                gated_inputs = [inputs, reset_gate * state]
                candidate = tanh(
                    rnn_cell.linear(gated_inputs, self._num_units, True))

            from_candidate = tf.mul(episodic_gate, candidate)
            from_state = tf.mul((1 - episodic_gate), state)
            new_h = from_candidate + from_state
        return new_h, new_h
    def build_encoder(self):
        """Construct the encoder (inference network) q(h|X).

        Stacks two linear+ReLU layers, derives ``mu`` and ``log_sigma_sq``
        from the second one, and samples ``self.h = mu + sigma * eps`` where
        ``eps`` is standard-normal noise of shape ``(1, h_dim)``. Histogram
        summaries are added for ``mu``, ``sigma``, ``h`` and ``mu + sigma``.
        """
        with tf.variable_scope("encoder"):
            encoder_input = tf.expand_dims(self.x, 0)
            self.l1_lin = linear(
                encoder_input, self.embed_dim, bias=True, scope="l1")
            self.l1 = tf.nn.relu(self.l1_lin)

            self.l2_lin = linear(
                self.l1, self.embed_dim, bias=True, scope="l2")
            self.l2 = tf.nn.relu(self.l2_lin)

            self.mu = linear(self.l2, self.h_dim, bias=True, scope="mu")
            self.log_sigma_sq = linear(
                self.l2, self.h_dim, bias=True, scope="log_sigma_sq")

            self.eps = tf.random_normal(
                (1, self.h_dim), 0, 1, dtype=tf.float32)
            # Standard deviation recovered from the log-variance.
            self.sigma = tf.sqrt(tf.exp(self.log_sigma_sq))

            perturbation = tf.mul(self.sigma, self.eps)
            self.h = tf.add(self.mu, perturbation)

            tf.histogram_summary("mu", self.mu)
            tf.histogram_summary("sigma", self.sigma)
            tf.histogram_summary("h", self.h)
            mu_plus_sigma = self.mu + self.sigma
            tf.histogram_summary("mu + sigma", mu_plus_sigma)
 def attention(query):
   """Put attention masks on hidden using hidden_features and query.

   Relies on names bound in the enclosing scope: ``attention_states``,
   ``hidden``, ``attn_length``, ``attention_vec_size``, ``attn_size`` and
   ``env``.

   Args:
     query: decoder state. It is tiled ``attn_length`` times and reshaped to
       ``[-1, attn_length, attention_vec_size]`` so it can be multiplied
       element-wise with ``attention_states``.

   Returns:
     A pair ``(h_attn, attn_weight)``: the tanh-combined attended state and
     the softmax attention weights over the attention positions.
   """
   with vs.variable_scope("Attention"):
     # Attention mask is a softmax of h_in^T*decoder_hidden.
     # Replicate the query for element-wise multiplication.
     dec_hid = array_ops.tile(query, [1, attn_length])
     dec_hid = array_ops.reshape(dec_hid, [-1, attn_length, attention_vec_size])
     # Attention weights for every hidden state in the encoder.
     attn_weight = nn_ops.softmax(math_ops.reduce_sum(attention_states * dec_hid, [2]))
     # Now calculate the attention-weighted vector (context vector) cc.
     cc = math_ops.reduce_sum(array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) * hidden, [1, 2])
     # Attended hidden state.
     with vs.variable_scope("AttnW1"):
       term1 = rnn_cell.linear(query, attn_size, False)
     with vs.variable_scope("AttnW2"):
       term2 = rnn_cell.linear(cc, attn_size, False)
     # Environment representation: 2D Tensor of shape [batch_size, env_size].
     # Compare against None explicitly because TensorFlow does not allow a
     # Tensor to be used as a Python bool.
     if env is not None:
       with vs.variable_scope("Environment"):
         term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
       h_attn = math_ops.tanh(term1 + term2 + term3)
     else:
       h_attn = math_ops.tanh(term1 + term2)
   return h_attn, attn_weight
Example #16
0
 def __call__(self, inputs, state, scope=None):
     """Step the underlying GRU and mix in an attention context.

     The parent cell's output is projected and compared with ``self.phi_hs``
     to produce scores, which are exponentiated (after subtracting the
     maximum along axis 0) and normalised along axis 0. The resulting
     weights pool ``self.hs`` into a context vector that is combined with
     the GRU output by a ReLU layer. The parent's state is not used; the
     combined output is returned twice. ``self.attn_map`` is updated.
     """
     gru_out, _unused_state = super(GRUCellAttn, self).__call__(
         inputs, state, scope)
     with vs.variable_scope(scope or type(self).__name__):
         with vs.variable_scope("Attn2"):
             query_proj = rnn_cell.linear(
                 gru_out, self._num_units, True, 1.0)
             gamma_h = tanh(query_proj)

         logits = tf.reduce_sum(
             self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
         peak = tf.reduce_max(logits, reduction_indices=0, keep_dims=True)
         unnormalised = tf.exp(logits - peak)
         # 1e-6 guards against a zero denominator.
         denominator = 1e-6 + tf.reduce_sum(
             unnormalised, reduction_indices=0, keep_dims=True)
         weights = unnormalised / denominator

         context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
         with vs.variable_scope("AttnConcat"):
             out = tf.nn.relu(
                 rnn_cell.linear(
                     [context, gru_out], self._num_units, True, 1.0))

         self.attn_map = tf.squeeze(
             tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
     return (out, out)
Example #17
0
    def setup_label_loss(self):
        """Project the last decoder output to label space and build the loss.

        Sets ``self.losses`` to the softmax cross-entropy between the
        projected logits and ``self.label_placeholder``, and
        ``self.predictions`` to the raw (unnormalised) logits.
        """
        with vs.variable_scope("LabelLogistic"):
            # Select index -1 along axis 1 of the decoder output.
            # NOTE(review): presumably decoder_output is
            # [batch_size, time_step, cell.state_size] - confirm with caller.
            last_state = self.decoder_output[:, -1, :]

            # Project to label space; [batch_size, label_size] under the
            # layout assumed above.
            logits = rnn_cell.linear(last_state, self.label_size, True, 1.0)
            self.losses = tf.nn.softmax_cross_entropy_with_logits(logits, self.label_placeholder)
            self.predictions = logits
Example #18
0
 def attention(query):
     """Read from the attention states once per head.

     Uses ``num_heads``, ``v``, ``hidden_features``, ``hidden``,
     ``attn_length``, ``attn_size`` and ``attention_vec_size`` from the
     enclosing scope. For each head the query is projected, scored with
     ``v^T * tanh(...)``, softmax-normalised, and used to weight ``hidden``.

     Returns:
       A list with one tensor per head, each reshaped to ``[-1, attn_size]``.
     """
     reads = []  # One attention read per head.
     for head in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % head):
             projected = rnn_cell.linear(query, attention_vec_size, True)
             projected = array_ops.reshape(
                 projected, [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             activated = math_ops.tanh(hidden_features[head] + projected)
             scores = math_ops.reduce_sum(v[head] * activated, [2, 3])
             mask = nn_ops.softmax(scores)
             # Attention-weighted sum of the hidden states.
             weighted = array_ops.reshape(
                 mask, [-1, attn_length, 1, 1]) * hidden
             read = math_ops.reduce_sum(weighted, [1, 2])
             reads.append(array_ops.reshape(read, [-1, attn_size]))
     return reads
Example #19
0
    def setup_label_loss(self):
        """Build the label-classification loss from the final decoder output.

        Sets ``self.losses`` to the softmax cross-entropy between the
        projected logits and ``self.label_placeholder``, and
        ``self.predictions`` to the raw (unnormalised) logits.
        """
        with vs.variable_scope("LabelLogistic"):
            # Select index -1 along axis 1 of the decoder output.
            # NOTE(review): presumably decoder_output is
            # [batch_size, time_step, cell.state_size] - confirm with caller.
            last_state = self.decoder_output[:, -1, :]

            # Project to label space; [batch_size, label_size] under the
            # layout assumed above.
            logits = rnn_cell.linear(last_state, self.label_size, True, 1.0)
            self.losses = tf.nn.softmax_cross_entropy_with_logits(
                logits, self.label_placeholder)
            self.predictions = logits
Example #20
0
  def setup_loss(self):
    """Build the vocabulary softmax and the masked sequence loss.

    ``self.outputs`` receives the softmax probabilities reshaped to
    ``[-1, self.batch_size, self.vocab_size]``. ``self.losses`` is the sum
    of the masked per-token cross-entropies divided by ``self.batch_size``.
    """
    with vs.variable_scope("Logistic"):
      flat_decoder = tf.reshape(self.decoder_output, [-1, self.size])
      flat_logits = rnn_cell.linear(flat_decoder, self.vocab_size, True, 1.0)
      flat_probs = tf.nn.softmax(flat_logits)
      self.outputs = tf.reshape(
          flat_probs, [-1, self.batch_size, self.vocab_size])

      # Drop the first row (the GO step) of targets and mask ...
      shifted_targets = tf.slice(self.target_tokens, [1, 0], [-1, -1])
      shifted_mask = tf.slice(self.target_mask, [1, 0], [-1, -1])
      # ... and pad one row at the end so the lengths line up again; this
      # is easier than splitting the decoder input, since tensorflow does
      # not support negative indexing.
      tail_pad = [[0, 1], [0, 0]]
      flat_labels = tf.reshape(tf.pad(shifted_targets, tail_pad), [-1])
      flat_mask = tf.reshape(tf.pad(shifted_mask, tail_pad), [-1])

      token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
          flat_logits, flat_labels)
      masked_losses = token_losses * tf.to_float(flat_mask)
      step_losses = tf.reshape(masked_losses, [-1, self.batch_size])
      self.losses = tf.reduce_sum(step_losses) / self.batch_size
Example #21
0
  def downscale(self, inp, mask):
    """Project paired rows down to ``self.size`` and merge the mask pairwise.

    The input has its first two axes swapped, is flattened to rows of width
    ``2 * self.size``, linearly projected to ``self.size``, reshaped with
    ``self.batch_size`` leading, swapped back and passed through tanh.
    The mask is transposed, grouped into pairs, and each pair is reduced
    with a logical OR before being reshaped and transposed back.

    Returns:
      ``(out, mask)``: the downscaled tensor and the merged int32 mask.
    """
    with vs.variable_scope("Downscale"):
      swapped = tf.transpose(inp, perm=[1, 0, 2])
      paired = tf.reshape(swapped, [-1, 2 * self.size])
      projected = rnn_cell.linear(paired, self.size, True, 1.0)
      unflattened = tf.reshape(projected, [self.batch_size, -1, self.size])
      restored = tf.transpose(unflattened, perm=[1, 0, 2])
      out = tanh(restored)

      mask_pairs = tf.reshape(tf.transpose(mask), [-1, 2])
      # A merged entry is set when either member of the pair is set.
      any_set = tf.reduce_any(
          tf.cast(mask_pairs, tf.bool), reduction_indices=1)
      merged = tf.reshape(tf.to_int32(any_set), [self.batch_size, -1])
      mask = tf.transpose(merged)
    return out, mask
Example #22
0
 def attention(query):
   """Compute one attention read per head for ``query``.

   Uses ``num_heads``, ``v``, ``hidden_features``, ``hidden``,
   ``attn_length``, ``attn_size`` and ``attention_vec_size`` from the
   enclosing scope.

   Returns:
     A list of ``num_heads`` tensors, each reshaped to ``[-1, attn_size]``.
   """
   head_reads = []
   for head_idx in xrange(num_heads):
     with vs.variable_scope("Attention_%d" % head_idx):
       query_proj = array_ops.reshape(
           rnn_cell.linear(query, attention_vec_size, True),
           [-1, 1, 1, attention_vec_size])
       # Scores are v^T * tanh(...); the softmax turns them into a mask.
       head_scores = math_ops.reduce_sum(
           v[head_idx] * math_ops.tanh(hidden_features[head_idx] + query_proj),
           [2, 3])
       head_mask = nn_ops.softmax(head_scores)
       # Attention-weighted sum over the hidden states.
       context = math_ops.reduce_sum(
           array_ops.reshape(head_mask, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       head_reads.append(array_ops.reshape(context, [-1, attn_size]))
   return head_reads
def basic_rnn_cell(inputs, state, num_units, scope=None):
    """Apply a tanh RNN step, or build zero tensors when there is no state.

    With a ``state``, returns ``tanh(linear([inputs, state]))`` twice (as
    output and as new state). With ``state=None``, returns two zero tensors
    of shape ``[batch_size, num_units]``; batch size and dtype come from
    ``inputs`` when given, otherwise they default to 0 and ``tf.float32``.
    """
    if state is not None:
        with tf.variable_op_scope([inputs, state], scope, "BasicRNNCell"):
            output = tf.tanh(linear([inputs, state], num_units, True))
        return output, output

    if inputs is None:
        batch_size, dtype = 0, tf.float32
    else:
        batch_size, dtype = inputs.get_shape()[0], inputs.dtype

    init_output = tf.zeros(tf.pack([batch_size, num_units]), dtype=dtype)
    init_state = tf.zeros(tf.pack([batch_size, num_units]), dtype=dtype)
    # Re-attach the static shape after building from a packed shape tensor.
    for zero_tensor in (init_output, init_state):
        zero_tensor.set_shape([batch_size, num_units])
    return init_output, init_state
Example #24
0
def basic_rnn_cell(inputs, state, num_units, scope=None):
    """Run one basic (tanh) RNN step, or create the initial zero tensors.

    When ``state`` is None the function returns ``(init_output, init_state)``,
    both zeros of shape ``[batch_size, num_units]``. ``batch_size`` and the
    dtype are read from ``inputs`` if it is provided, else 0 and
    ``tf.float32``. Otherwise it returns ``tanh(linear([inputs, state]))``
    as both output and new state.
    """
    has_state = state is not None
    if has_state:
        with tf.variable_op_scope([inputs, state], scope, "BasicRNNCell"):
            pre_activation = linear([inputs, state], num_units, True)
            output = tf.tanh(pre_activation)
        return output, output

    batch_size = 0
    dtype = tf.float32
    if inputs is not None:
        batch_size = inputs.get_shape()[0]
        dtype = inputs.dtype

    zero_shape = [batch_size, num_units]
    init_output = tf.zeros(tf.pack(zero_shape), dtype=dtype)
    init_state = tf.zeros(tf.pack(zero_shape), dtype=dtype)
    init_output.set_shape(zero_shape)
    init_state.set_shape(zero_shape)
    return init_output, init_state
Example #25
0
def batch_linear(args, output_size, bias):
    '''
    Apply a linear map to a batch of matrices.

    args: a 3D Tensor or a list of 3D, batch x n x m, Tensors.
    output_size: width of the projected last axis.
    bias: forwarded to `linear`.

    Each argument is flattened to rows of width m, the rows are projected
    with `linear`, and the result is reshaped to
    [batch_size, -1, output_size]. Raises ValueError when the static size
    of axis 2 of an argument is unknown (or zero).
    '''
    tensors = args if nest.is_sequence(args) else [args]

    # Prefer the static batch size; fall back to the dynamic one.
    leading = tensors[0]
    batch_size = leading.get_shape().as_list()[0] or tf.shape(leading)[0]

    flattened = []
    for tensor in tensors:
        width = tensor.get_shape().as_list()[2]
        if not width:
            raise ValueError('batch_linear expects shape[2] of arguments: %s' %
                             str(width))
        flattened.append(tf.reshape(tensor, [-1, width]))

    projected = linear(flattened, output_size, bias)
    return tf.reshape(projected, [batch_size, -1, output_size])
Example #26
0
    def setup_generation_loss(self):
        """Build log-softmax outputs and the summed generation loss.

        ``self.outputs`` holds the log-probabilities reshaped to
        ``[T, batch_size, vocab_size]``, where ``T`` and ``batch_size`` are
        the first two dynamic dimensions of ``self.decoder_output``.
        ``self.losses`` is the total cross-entropy divided by ``batch_size``.
        """
        with vs.variable_scope("Logistic"):
            decoder_shape = tf.shape(self.decoder_output)
            T, batch_size = decoder_shape[0], decoder_shape[1]

            flat_decoder = tf.reshape(self.decoder_output, [-1, self.size])
            flat_logits = rnn_cell.linear(
                flat_decoder, self.vocab_size, True, 1.0)
            flat_log_probs = tf.nn.log_softmax(flat_logits)
            output_shape = tf.pack([T, batch_size, self.vocab_size])
            self.outputs = tf.reshape(flat_log_probs, output_shape)

            # Drop the first (GO) row of the targets and pad one row at the
            # end; easier than splitting the decoder input, since tensorflow
            # does not support negative indexing.
            shifted_targets = tf.slice(self.target_tokens, [1, 0], [-1, -1])
            padded_targets = tf.pad(shifted_targets, [[0, 1], [0, 0]])
            flat_labels = tf.reshape(padded_targets, [-1])

            token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                flat_logits, flat_labels)
            step_losses = tf.reshape(token_losses, tf.pack([T, batch_size]))
            self.losses = tf.reduce_sum(step_losses) / tf.to_float(batch_size)
Example #27
0
  def downscale(self, inp, mask):
    """Halve the leading axis of ``inp`` and merge ``mask`` pairwise.

    Shapes are read dynamically from ``inp``. Rows of width
    ``2 * self.size`` are linearly projected to ``self.size``, reshaped to
    ``(batch_size, T/2, dim)``, swapped back and passed through tanh. The
    mask is grouped into pairs that are combined with a logical OR.

    Returns:
      ``(out, mask)``: the downscaled tensor and the merged int32 mask.
    """
    with vs.variable_scope("Downscale"):
      in_shape = tf.shape(inp)
      T, batch_size, dim = in_shape[0], in_shape[1], in_shape[2]

      swapped = tf.transpose(inp, perm=[1, 0, 2])
      paired = tf.reshape(swapped, [-1, 2 * self.size])
      projected = rnn_cell.linear(paired, self.size, True, 1.0)

      half_steps = tf.to_int32(T/2)
      target_shape = tf.pack((batch_size, half_steps, dim))
      restored = tf.transpose(
          tf.reshape(projected, target_shape), perm=[1, 0, 2])
      # Only the last dimension is known statically.
      restored.set_shape([None, None, self.size])
      out = tanh(restored)

      mask_pairs = tf.reshape(tf.transpose(mask), [-1, 2])
      # A merged entry is set when either member of the pair is set.
      any_set = tf.reduce_any(
          tf.cast(mask_pairs, tf.bool), reduction_indices=1)
      merged = tf.reshape(tf.to_int32(any_set), tf.pack([batch_size, -1]))
      mask = tf.transpose(merged)
    return out, mask
Example #28
0
def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
  """Builds a stack of fully connected layers.

  Each layer lives in its own `layer<i>` variable scope under `dnn` and
  applies a linear map with bias, then the activation (if any), then
  dropout (if requested).

  Args:
    tensor_in: tensor or placeholder for input features.
    hidden_units: list with the number of hidden units of each layer.
    activation: activation function applied after each layer. Can be None.
    dropout: if not None, a dropout layer is added after each layer and
      `1.0 - dropout` is passed to it as `prob`.

  Returns:
    The output tensor of the last layer.
  """
  with vs.variable_scope('dnn'):
    layer_out = tensor_in
    for layer_idx, width in enumerate(hidden_units):
      with vs.variable_scope('layer%d' % layer_idx):
        layer_out = rnn_cell.linear(layer_out, width, True)
        if activation is not None:
          layer_out = activation(layer_out)
        if dropout is not None:
          keep = 1.0 - dropout
          layer_out = dropout_ops.dropout(layer_out, prob=keep)
    return layer_out
    def __init__(self,
                 is_training,
                 vocab_size,
                 batch_size,
                 num_steps,
                 config,
                 reuse_conv_variables=None):
        """Build the LSTM language-model graph, optionally gated by a topic model.

        Args:
          is_training: when true, dropout (if configured), sampled softmax
            (if ``config.num_samples > 0``) and the training op are added;
            when false, ``self.probs`` is computed and no training op is built.
          vocab_size: number of rows of the embedding and softmax size.
          batch_size: used for the zero initial state and to scale the loss.
          num_steps: number of unrolled time steps.
          config: object providing the hyper-parameters read below
            (``topic_number``, ``word_embedding_size``, ``rnn_hidden_size``,
            ``rnn_layer_size``, ``lm_keep_prob``, ``num_samples``, ``seed``,
            ``max_grad_norm``, ``learning_rate``, ...).
          reuse_conv_variables: forwarded to ``TopicModel.__init__`` when
            ``config.topic_number > 0``; unused otherwise.
        """
        # With topics enabled, the topic model sets up its own part of the
        # graph first. NOTE(review): presumably TopicModel.__init__ defines
        # self.y, self.config and self.conv_hidden, which are read below -
        # confirm against TopicModel.
        if config.topic_number > 0:
            TopicModel.__init__(self, is_training, vocab_size, batch_size,
                                num_steps, 0, config, reuse_conv_variables)
        else:
            self.y = tf.placeholder(tf.int32, [None, num_steps])
            self.config = config

        #placeholders
        self.x = tf.placeholder(tf.int32, [None, num_steps])
        self.lm_mask = tf.placeholder(tf.float32, [None, num_steps])

        #variables
        self.lstm_word_embedding = tf.get_variable("lstm_embedding", [vocab_size, config.word_embedding_size], \
            trainable=config.word_embedding_update, \
            initializer=tf.random_uniform_initializer(-0.5/config.word_embedding_size, 0.5/config.word_embedding_size))
        self.lm_softmax_w = tf.get_variable(
            "lm_softmax_w", [config.rnn_hidden_size, vocab_size])
        # The transposed weights are only needed by the sampled softmax loss.
        if is_training and config.num_samples > 0:
            self.lm_softmax_w_t = tf.transpose(self.lm_softmax_w)
        self.lm_softmax_b = tf.get_variable(
            "lm_softmax_b", [vocab_size],
            initializer=tf.constant_initializer())
        # Parameters of the gating unit that mixes topic and LSTM hidden states.
        if config.topic_number > 0:
            self.gate_w = tf.get_variable(
                "gate_w",
                [config.topic_embedding_size, config.rnn_hidden_size])
            self.gate_u = tf.get_variable(
                "gate_u", [config.rnn_hidden_size, config.rnn_hidden_size])
            self.gate_b = tf.get_variable(
                "gate_b", [config.rnn_hidden_size],
                initializer=tf.constant_initializer())

        #define lstm cells
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(config.rnn_hidden_size,
                                                 forget_bias=1.0)
        # Output dropout is applied only while training.
        if is_training and config.lm_keep_prob < 1.0:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell,
                output_keep_prob=config.lm_keep_prob,
                seed=config.seed)
        # The same cell object is repeated for every layer.
        self.cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] *
                                                config.rnn_layer_size)

        #set initial state to all zeros
        self.initial_state = self.cell.zero_state(batch_size, tf.float32)

        #embedding lookup
        inputs = tf.nn.embedding_lookup(self.lstm_word_embedding, self.x)
        if is_training and config.lm_keep_prob < 1.0:
            inputs = tf.nn.dropout(inputs,
                                   config.lm_keep_prob,
                                   seed=config.seed)

        #transform input from [batch_size,sent_len,emb_size] to [sent_len,batch_size,emb_size ]
        # (a Python list with one [batch_size, emb_size] tensor per time step)
        inputs = [
            tf.squeeze(input_, [1])
            for input_ in tf.split(1, num_steps, inputs)
        ]

        #run rnn and get outputs (hidden layer)
        outputs, self.state = tf.nn.rnn(self.cell,
                                        inputs,
                                        initial_state=self.initial_state)

        #concatenate the per-step outputs along axis 1, then flatten into rows of width rnn_hidden_size
        lstm_hidden = tf.reshape(tf.concat(1, outputs),
                                 [-1, config.rnn_hidden_size])

        if config.topic_number > 0:
            #combine topic and language model hidden with a gating unit
            z, r = array_ops.split(1, 2, linear([self.conv_hidden, lstm_hidden], \
                2 * config.rnn_hidden_size, True, 1.0))
            z, r = tf.sigmoid(z), tf.sigmoid(r)
            c = tf.tanh(tf.matmul(self.conv_hidden, self.gate_w) + tf.matmul((r * lstm_hidden), self.gate_u) + \
                self.gate_b)
            # z interpolates between the LSTM hidden state and the candidate c.
            hidden = (1 - z) * lstm_hidden + z * c

            #save z
            self.tm_weights = tf.reshape(tf.reduce_mean(z, 1), [-1, num_steps])
        else:
            hidden = lstm_hidden

        #compute masked/weighted crossent and mean language model loss
        if is_training and config.num_samples > 0:
            lm_crossent = tf.nn.sampled_softmax_loss(self.lm_softmax_w_t, self.lm_softmax_b, hidden, \
                tf.reshape(self.y, [-1,1]), config.num_samples, vocab_size)
        else:
            lm_logits = tf.matmul(hidden,
                                  self.lm_softmax_w) + self.lm_softmax_b
            lm_crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                lm_logits, tf.reshape(self.y, [-1]))
        lm_crossent_m = lm_crossent * tf.reshape(self.lm_mask, [-1])
        self.lm_cost = tf.reduce_sum(lm_crossent_m) / batch_size

        #compute probs if in testing mode
        # lm_logits is always defined here: when is_training is false the
        # full-softmax branch above is the one that ran.
        if not is_training:
            self.probs = tf.nn.softmax(lm_logits)
            return

        #run optimiser and backpropagate (clipped) gradients for lm loss
        lm_tvars = tf.trainable_variables()
        lm_grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.lm_cost, lm_tvars), config.max_grad_norm)
        self.lm_train_op = tf.train.AdamOptimizer(
            config.learning_rate).apply_gradients(zip(lm_grads, lm_tvars))
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      batch_size,
                      state_size,
                      decoder_inputs_positions=None,
                      decoder_inputs_maps=None,
                      output_size=None,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None):
    """RNN decoder with dot-product attention and an optional map environment.

    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        Embedded inputs.
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      batch_size: Python int. Needed explicitly because the environment state
        is looked up one sample at a time.
      state_size: size of the environment state vector read from the map.
      decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3]
        giving the position of each example in its map. Default None.
      decoder_inputs_maps: a 1D Tensor of length batch_size selecting the map
        of each example. Default None.
      output_size: size of the output vectors; if None, cell.output_size.
      loop_function: if not None, this function will be applied to i-th output
        in order to generate i+1-th input, and decoder_inputs will be ignored,
        except for the first element ("GO" symbol). This can be used for
        decoding, but also for training to emulate
        http://arxiv.org/pdf/1506.03099v2.pdf.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x cell.output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x cell.input_size].
      dtype: not read by this function; kept for interface compatibility.
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".

    Returns:
      A tuple (outputs, states, attentions, environments, positions):
        outputs: one 2D Tensor [batch_size x output_size] per decoder input.
          For every step:
            cur_output, new_state = cell(input, prev_state)
            attn_weight = softmax(sum(attention_states * cur_output, axis=2))
            cont_vec = sum over attn_length of attn_weight * attention_states
            h_attn = tanh(W1*cur_output + W2*cont_vec [+ W3*env_state])
            output = W * h_attn
          No softmax is applied to `output` here; it is a linear projection.
        states: list of cell states, starting with `initial_state` followed
          by the state after every step (len(decoder_inputs) + 1 items).
        attentions: the attention weights of every step.
        environments: the environment tensor recorded after every step
          (None entries when no positions/maps were supplied).
        positions: the position tensor recorded after every step (None
          entries when no positions/maps were supplied).

    Raises:
      ValueError: when there are no inputs, or when dimension 1 or 2 of
        attention_states is not statically known.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Both attn_length (dim 1) and attn_size (dim 2) are read statically
    # below, so both must be defined -- hence the slice [1:3], not [1:2].
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "attention_decoder"):
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        # Stack the three maps so that one map can be selected by index; the
        # result is sliced below as [map, pos[0], pos[1], pos[2], feature].
        mapIdx = array_ops.pack(
            [map3.map_grid, map3.map_jelly, map3.map_one])

        attention_vec_size = attn_size  # size of query
        states = [initial_state]
        # Current position and environment; they stay None when no
        # positions/maps are supplied.
        position, env = None, None

        # Reshaped copy of the encoder states used for the weighted sum.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        def attention(query):
            """Return (attended hidden state, attention weights) for query.

            Reads the enclosing `env`: when it is set, a projection of the
            environment is added inside the tanh.
            """
            with vs.variable_scope("Attention"):
                # Replicate the query along attn_length so it can be
                # multiplied element-wise with every encoder state.
                dec_hid = array_ops.tile(query, [1, attn_length])
                dec_hid = array_ops.reshape(
                    dec_hid, [-1, attn_length, attention_vec_size])
                # Dot-product score per encoder state, normalised by softmax.
                attn_weight = nn_ops.softmax(
                    math_ops.reduce_sum(attention_states * dec_hid, [2]))
                # Attention-weighted context vector.
                cc = math_ops.reduce_sum(
                    array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) *
                    hidden, [1, 2])
                with vs.variable_scope("AttnW1"):
                    term1 = rnn_cell.linear(query, attn_size, False)
                with vs.variable_scope("AttnW2"):
                    term2 = rnn_cell.linear(cc, attn_size, False)
                # A Tensor must not be used as a Python bool, so compare
                # against None explicitly.
                if env is not None:
                    with vs.variable_scope("Environment"):
                        term3 = rnn_cell.linear(math_ops.to_float(env),
                                                attn_size, False)
                    h_attn = math_ops.tanh(term1 + term2 + term3)
                else:
                    h_attn = math_ops.tanh(term1 + term2)
            return h_attn, attn_weight

        def updateEnv(_position, _step, _mapNo):
            """Apply one action per example and look up the environment.

            Args:
              _position: a 2D Tensor of shape [batch_size, 3]; the third
                component is the heading changed by the turn actions.
              _step: per-example action ids (compared against the
                data_utils.*_ID constants), or None to take no action.
              _mapNo: a 1D Tensor of length batch_size selecting the map.

            Returns:
              (new_pos, new_env): the positions after the action, and the
              environment vectors read from the map at those positions,
              of shape [batch_size, state_size].

            Raises:
              ValueError: if _mapNo or _position is None.
            """
            if _mapNo is None:
                raise ValueError(" Invalid argument mapNo in updateEnv! ")
            if _position is None:
                raise ValueError(" Invalid argument position in updateEnv! ")

            def env_at(map_no, pos):
                # Read the state_size features stored for `pos` in map
                # `map_no`.
                begin = array_ops.pack([map_no, pos[0], pos[1], pos[2], 0])
                return array_ops.reshape(
                    array_ops.slice(mapIdx, begin,
                                    [1, 1, 1, 1, state_size]), [state_size])

            # No action: positions are unchanged, only the environment is
            # looked up.
            if _step is None:
                new_env = [
                    env_at(_mapNo[j], _position[j, :])
                    for j in xrange(batch_size)
                ]
                new_env = array_ops.reshape(array_ops.pack(new_env),
                                            [batch_size, state_size])
                return _position, new_env

            def by_heading(ppos, fn0, fn1, fn2, fn_other):
                # Dispatch on the heading ppos[2]: 0, 1, 2, anything else.
                return control_flow_ops.cond(
                    math_ops.equal(ppos[2], 0), fn0,
                    lambda: control_flow_ops.cond(
                        math_ops.equal(ppos[2], 1), fn1,
                        lambda: control_flow_ops.cond(
                            math_ops.equal(ppos[2], 2), fn2, fn_other)))

            def facing(ppos, heading):
                # Branch function: same cell, new heading.
                return lambda: array_ops.pack([ppos[0], ppos[1], heading])

            def f_move(ppos):  # move forward 1 step
                return by_heading(
                    ppos,
                    lambda: array_ops.pack([ppos[0], ppos[1] - 1, ppos[2]]),
                    lambda: array_ops.pack([ppos[0] + 1, ppos[1], ppos[2]]),
                    lambda: array_ops.pack([ppos[0], ppos[1] + 1, ppos[2]]),
                    lambda: array_ops.pack([ppos[0] - 1, ppos[1], ppos[2]]))

            def f_right(ppos):  # turn right: 0->1, 1->2, 2->3, other->0
                return by_heading(ppos, facing(ppos, 1), facing(ppos, 2),
                                  facing(ppos, 3), facing(ppos, 0))

            def f_left(ppos):  # turn left: 0->3, 1->0, 2->1, other->2
                return by_heading(ppos, facing(ppos, 3), facing(ppos, 0),
                                  facing(ppos, 1), facing(ppos, 2))

            def f_back(ppos):  # turn back: 0->2, 1->3, 2->0, other->1
                return by_heading(ppos, facing(ppos, 2), facing(ppos, 3),
                                  facing(ppos, 0), facing(ppos, 1))

            # Tested in this order; an id matching none of them leaves the
            # position unchanged.
            actions = [
                (data_utils.noAct_ID, lambda ppos: ppos),
                (data_utils.moveAct_ID, f_move),
                (data_utils.turnRight_ID, f_right),
                (data_utils.turnLeft_ID, f_left),
                (data_utils.turnBack_ID, f_back),
            ]

            def apply_step(sstep, ppos, idx=0):
                # Build the nested cond chain over `actions`. The fallback
                # returns the ppos argument itself, so it does not depend on
                # any enclosing loop variable.
                if idx == len(actions):
                    return ppos
                act_id, act_fn = actions[idx]
                return control_flow_ops.cond(
                    math_ops.equal(sstep, act_id),
                    lambda: act_fn(ppos),
                    lambda: apply_step(sstep, ppos, idx + 1))

            max_coord = 24  # largest accepted value for pos[0] and pos[1]
            new_pos = []
            new_env = []
            for j in xrange(batch_size):
                cur_pos = _position[j, :]
                temp_pos = apply_step(_step[j], cur_pos)
                # Reject a move that leaves the [0, max_coord] range on
                # either coordinate and keep the previous position instead.
                out_of_map = math_ops.logical_or(
                    math_ops.greater(temp_pos[0], max_coord),
                    math_ops.logical_or(
                        math_ops.greater(temp_pos[1], max_coord),
                        math_ops.logical_or(
                            math_ops.less(temp_pos[0], 0),
                            math_ops.less(temp_pos[1], 0))))
                new_pos.append(
                    control_flow_ops.cond(out_of_map, lambda: cur_pos,
                                          lambda: temp_pos))
                new_env.append(env_at(_mapNo[j], new_pos[-1]))

            return array_ops.pack(new_pos), array_ops.pack(new_env)

        outputs = []
        attentions = []
        environments = []
        positions = []
        prev = None

        # decoder_inputs_positions is a list (len() is taken below), so plain
        # truthiness is fine for it; the maps argument is a Tensor and has to
        # be compared against None.
        if (decoder_inputs_positions and decoder_inputs_maps is not None
                and batch_size):
            position = decoder_inputs_positions[0]  # [batch_size, 3]
            _, env = updateEnv(position, None, decoder_inputs_maps)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with vs.variable_scope("loop_function", reuse=True):
                    inp = array_ops.stop_gradient(loop_function(prev, i))

            # Run the RNN.
            cur_output, new_state = cell(inp, states[-1])
            cur_output = array_ops.reshape(cur_output, [batch_size, attn_size])
            states.append(new_state)

            # Run the attention mechanism (uses env from the previous step).
            h_attn, attn_weight = attention(cur_output)
            attentions.append(attn_weight)

            with vs.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear(h_attn, output_size, False)

            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = array_ops.stop_gradient(output)

            # position is only ever set when positions and maps were given.
            if position is not None:
                if loop_function is not None:
                    # Follow the model's own prediction: take the most
                    # probable action of every example.
                    step = math_ops.argmax(nn_ops.softmax(prev), 1)
                    position, env = updateEnv(position, step,
                                              decoder_inputs_maps)
                else:
                    # Follow the supplied positions; the last one is reused
                    # once the list is exhausted.
                    if i < len(decoder_inputs_positions) - 1:
                        position = decoder_inputs_positions[i + 1]
                    _, env = updateEnv(position, None, decoder_inputs_maps)

            outputs.append(output)
            environments.append(env)
            positions.append(position)

    return outputs, states, attentions, environments, positions
Example #31
0
 def _output_project(self, output, attn, project_size):
     """Project `output` and `attn` jointly to `project_size` and activate.

     The two inputs are passed together to `linear` (no bias) inside the
     "AttnOutputProjection" variable scope, and `activation` is applied to
     the result.
     """
     with tf.variable_scope("AttnOutputProjection"):
         projected = linear([output, attn], project_size, False)
         return activation(projected)
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      feed_prev: if True, every step after the first replaces its input by
        the decoder inputs averaged with the softmax of the previous step's
        pointer scores (gradients are stopped through that average).
      dtype: dtype of the zero-filled attention vector that is concatenated
        to every cell input (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "point_decoder".
    Returns:
      A tuple (outputs, states, inps):
        outputs: A list of the same length as decoder_inputs. Item i holds
          the un-normalised pointer scores of step i over the attn_length
          encoder positions:
            x = linear([input, attns])
            cell_output, new_state = cell(x, prev_state)
            output = sum(V * tanh(W * attention_states + U * new_state))
        states: list of cell states, starting with `initial_state` followed
          by the state after every step (len(decoder_inputs) + 1 items).
        inps: the inputs substituted by feed_prev, one per step after the
          first; empty when feed_prev is False.
    Raises:
      ValueError: when there are no inputs, or when dimension 1 or 2 of
        attention_states is not statically known.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # attn_length (dim 1) and attn_size (dim 2) are both read statically
    # below, so both must be defined -- hence the slice [1:3], not [1:2].
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Return un-normalised pointer scores over the encoder states."""
            with vs.variable_scope("Attention"):
                y = rnn_cell.linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Scores are v^T * tanh(...); the softmax is left to the caller.
                return math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])

        outputs = []
        inps = []
        # This vector is never reassigned in this function, so every cell
        # input is concatenated with zeros.
        attns = array_ops.zeros(array_ops.pack([batch_size, attn_size]),
                                dtype=dtype)
        attns.set_shape([None, attn_size])

        # All decoder inputs arranged as [batch, attn_length, input_size].
        # Identical for every step, so it is built once on first use.
        stacked_inputs = None

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if feed_prev and i > 0:
                if stacked_inputs is None:
                    stacked_inputs = tf.pack(decoder_inputs)
                    stacked_inputs = tf.transpose(stacked_inputs,
                                                  perm=[1, 0, 2])
                    stacked_inputs = tf.reshape(
                        stacked_inputs, [-1, attn_length, input_size])
                # Soft selection of the next input: weight every candidate by
                # the softmax of the previous step's pointer scores.
                pointer = tf.reshape(tf.nn.softmax(output),
                                     [-1, attn_length, 1])
                inp = tf.reduce_sum(stacked_inputs * pointer, 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Merge input and attention vector into one vector of the right size.
            x = rnn_cell.linear([inp, attns], cell.input_size, True)
            # Run the RNN; only the new state is used, the cell output is not.
            _, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
Example #33
0
def attention_encoder(decoder_inputs, initial_state, attention_states,
                      cell, num_heads=1,
                      output_size=None, dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
    """
    Encoder that receives attention from another encoder

    Parameters
    ----------
    decoder_inputs:
        second encoder's input we call it a decoder's input
        it should be already wrapped by add_embedding()
        NOTE(review): the value is wrapped in a one-element list below, so
        the step loop runs exactly once with the whole argument as its
        input -- confirm whether a single tensor or a list is expected.
    initial_state:
        2D Tensor (batch_size x cell.state_size).
    attention_states:
        3D Tensor (batch_size x attn_length (seq_length) x attn_size)
    cell:
        callable as ``cell(x, state) -> (cell_output, new_state)``; its
        ``input_size`` (and ``output_size`` when `output_size` is None) is read.
    num_heads:
        number of attention heads; one "AttnW_%d"/"AttnV_%d" variable pair
        and one attention read is created per head.
    output_size:
        size of the projected outputs; if None, ``cell.output_size`` is used.
    dtype:
        dtype of the zero-filled initial attention vectors.
    scope:
        name for the variable scope; default "attention_encoder".
    initial_state_attention:
        if True, the initial attention vectors are computed from
        `initial_state` instead of being zeros.

    Returns
    -------
    A tuple of the form (outputs, state), where:
      outputs: A list with one 2D Tensor per processed input, with
        shape [batch_size x output_size] containing the generated outputs.
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape (batch_size x cell.state_size).

    """
    # The output projection needs a concrete size; passing None through to
    # linear() would fail, so fall back to the cell's own output size.
    if output_size is None:
        output_size = cell.output_size

    decoder_inputs = [decoder_inputs]  # in original model this is a bucket list of inputs

    # NOTE(review): only these three shape lookups are inside the scope, so
    # every variable below is created in the caller's variable scope and
    # `scope` does not affect variable names. Left as is because moving the
    # body under the scope would rename variables in existing checkpoints.
    with vs.variable_scope(scope or "attention_encoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    for head in xrange(num_heads):
        k = vs.get_variable("AttnW_%d" % head,
                            [1, 1, attn_size, attention_vec_size])
        # W * attention_states as a 1-by-1 convolution.
        hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(vs.get_variable("AttnV_%d" % head, [attention_vec_size]))

    def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        ds = []  # Results of attention reads will be stored here.
        for head in xrange(num_heads):
            with vs.variable_scope("Attention_%d" % head):
                y = rnn_cell.linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v[head] * math_ops.tanh(hidden_features[head] + y),
                    [2, 3])
                # Kept in its own name so the head index is not shadowed.
                weights = tf.nn.softmax(s)
                # Now calculate the attention-weighted vector d.
                d = math_ops.reduce_sum(
                    array_ops.reshape(weights, [-1, attn_length, 1, 1]) *
                    hidden, [1, 2])
                ds.append(array_ops.reshape(d, [-1, attn_size]))
        return ds

    outputs = []
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]

    for attn in attns:  # Ensure the second shape of attention vectors is set.
        attn.set_shape([None, attn_size])
    if initial_state_attention:
        attns = attention(initial_state)

    state = initial_state

    for i, inp in enumerate(decoder_inputs):
        if i > 0:
            vs.get_variable_scope().reuse_variables()
        # Merge input and previous attentions into one vector of the right size.
        x = rnn_cell.linear([inp] + attns, cell.input_size, True)
        # Run the RNN.
        cell_output, state = cell(x, state)
        # Run the attention mechanism.
        if i == 0 and initial_state_attention:
            # The attention variables already exist from the call on
            # initial_state above, so they have to be reused here.
            with vs.variable_scope(vs.get_variable_scope(), reuse=True):
                attns = attention(state)
        else:
            attns = attention(state)

        with vs.variable_scope("AttnOutputProjection"):
            output = rnn_cell.linear([cell_output] + attns, output_size, True)

        outputs.append(output)

    # we only want the last state
    return outputs, state
def attention_decoder(decoder_inputs, initial_state, attention_states, cell, batch_size, state_size,
                      decoder_inputs_positions=None, decoder_inputs_maps=None, output_size=None, loop_function=None,
                      dtype=dtypes.float32, scope=None):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. Embedded inputs.
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    batch_size: need to clarify batch size explicitly since env_state is updated one sample by one sample.
    state_size: size of environment state.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3],
       indicating intial positions of each example in a map. Default None.
    decoder_inputs_maps: a 1D Tensor of length batch_size indicating the map. Default None.
    output_size: size of the output vectors; if None, we use cell.output_size.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. 
      First, we run the cell on the current decoder input or feed from previous output:
        cur_output, new_state = cell(input, prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(h_t^T * attention_states).
      Thus, the context vector:
        cont_vec = weighted_sum_of(attention_states), weighted by (new_attn),
      and then we calculate the attended output:
        attn_output = tanh(W1*current_output + W2*cont_vec + W3*env_state).
      The finally output for prediction:
        output = softmax(W*attn_output).
        This "output" should be a 1D Tensor of shape [num_symbols].
        Every item of the output refers to the probability of predicting certain symbol for the next step.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "attention_decoder"):
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value
    mapIdx = array_ops.pack([map3.map_grid, map3.map_jelly, map3.map_one]) #map

    attention_vec_size = attn_size # size of query
    states = [initial_state]
    # current position and environment
    position, env = None, None

    hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) # reshape for later computation

    def attention(query): 
      """Put attention masks on hidden using hidden_features and query."""
      with vs.variable_scope("Attention"):
        # Attention mask is a softmax of h_in^T*decoder_hidden.
        dec_hid = array_ops.tile(query, [1, attn_length]) # replicate query for element-wise multiplication
        dec_hid = array_ops.reshape(dec_hid, [-1, attn_length, attention_vec_size])
        attn_weight = nn_ops.softmax(math_ops.reduce_sum(attention_states*dec_hid, [2])) # attn weights for every hidden states in encoder
        # Now calculate the attention-weighted vector (context vector) cc.
        cc = math_ops.reduce_sum(array_ops.reshape(attn_weight, [-1, attn_length, 1, 1])*hidden, [1,2])
        # attented hidden state
        with vs.variable_scope("AttnW1"):
          term1 = rnn_cell.linear(query, attn_size, False)
        with vs.variable_scope("AttnW2"):
          term2 = rnn_cell.linear(cc, attn_size, False)
        # environment representation
        if env: # 2D Tensor of shape [batch_size, env_size]
          with vs.variable_scope("Environment"):
            term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
          h_attn = math_ops.tanh(term1 + term2 + term3)
        else:
          h_attn = math_ops.tanh(term1 + term2)
      return h_attn, attn_weight


    def updateEnv(_position, _step, _mapNo):
      """Apply one action per batch element and look up the resulting map state.

      Args:
        _position: a 2D Tensor of shape [batch_size, 3]. Columns 0 and 1 are the
          coordinates changed by a forward move; column 2 is a heading index
          (0..3) that picks the move direction and is rewritten by the turns.
        _step: a Tensor holding one action id per batch element (read as
          _step[j] and compared with data_utils.noAct_ID, moveAct_ID,
          turnRight_ID, turnLeft_ID and turnBack_ID), or None to take no step
          and only look up the environment at _position.
        _mapNo: a 1D int32 Tensor of length batch_size; _mapNo[j] selects the
          map (first axis of mapIdx) for batch element j.

      Returns:
        A tuple (new_pos, new_env) -- position first:
        new_pos: the position after the step. When _step is None this is
          _position itself; otherwise a packed Tensor of per-element positions.
        new_env: a 2D Tensor of shape [batch_size, state_size] sliced out of
          mapIdx at (map, new_pos[0], new_pos[1], new_pos[2]).

      Raises:
        ValueError: if _mapNo or _position is None (or an empty list/tuple).
      """

      def _missing(arg):
        # Never truth-test a Tensor: depending on the TF version `not tensor`
        # is either always False or raises TypeError. Only None and empty
        # Python sequences count as "not provided".
        return arg is None or (isinstance(arg, (list, tuple)) and not arg)

      if _missing(_mapNo):
        raise ValueError(" Invalid argument mapNo in updateEnv! ")
      if _missing(_position):
        raise ValueError(" Invalid argument position in updateEnv! ")
      new_env = []
      new_pos = []

      # No step given: keep the positions and just read the environment there.
      if _missing(_step):
        new_pos = _position
        for j in xrange(batch_size):
          vec = array_ops.slice(
              mapIdx,
              array_ops.pack([_mapNo[j], _position[j,0], _position[j,1], _position[j,2], 0]),
              [1,1,1,1,state_size])
          new_env.append(array_ops.squeeze(vec))
        new_env = array_ops.reshape(array_ops.pack(new_env), [batch_size, state_size])
        return new_pos, new_env

      def f_move(ppos): # move forward 1 step along the current heading
        return control_flow_ops.cond(math_ops.equal(ppos[2],0),
          lambda:array_ops.pack([ppos[0], ppos[1]-1, ppos[2]]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],1),
            lambda:array_ops.pack([ppos[0]+1, ppos[1], ppos[2]]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],2),
              lambda:array_ops.pack([ppos[0], ppos[1]+1, ppos[2]]), lambda:array_ops.pack([ppos[0]-1, ppos[1], ppos[2]]))))

      def f_right(ppos): # turn right: heading 0->1->2->3->0
        return control_flow_ops.cond(math_ops.equal(ppos[2],0),
          lambda: array_ops.pack([ppos[0],ppos[1], 1]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],1),
            lambda: array_ops.pack([ppos[0], ppos[1], 2]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],2),
              lambda: array_ops.pack([ppos[0], ppos[1], 3]), lambda: array_ops.pack([ppos[0], ppos[1], 0]))))

      def f_left(ppos): # turn left: heading 0->3->2->1->0
        return control_flow_ops.cond(math_ops.equal(ppos[2], 0),
          lambda: array_ops.pack([ppos[0], ppos[1], 3]), lambda: control_flow_ops.cond(math_ops.equal(ppos[2],1),
            lambda: array_ops.pack([ppos[0], ppos[1], 0]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],2),
              lambda:array_ops.pack([ppos[0], ppos[1], 1]), lambda:array_ops.pack([ppos[0],ppos[1],2]))))

      def f_back(ppos): # turn back: heading 0<->2, 1<->3
        return control_flow_ops.cond(math_ops.equal(ppos[2],0),
          lambda:array_ops.pack([ppos[0], ppos[1], 2]), lambda:control_flow_ops.cond(math_ops.equal(ppos[2],1),
            lambda:array_ops.pack([ppos[0], ppos[1], 3]), lambda: control_flow_ops.cond(math_ops.equal(ppos[2],2),
              lambda:array_ops.pack([ppos[0], ppos[1], 0]), lambda:array_ops.pack([ppos[0], ppos[1], 1]))))

      # ffn1..ffn4 form an if/elif chain over the action id; an id matching
      # none of the known actions leaves the position unchanged.
      def ffn4(sstep, ppos):
        # Use the ppos argument rather than closing over the loop variable j.
        return control_flow_ops.cond(math_ops.equal(sstep, data_utils.turnBack_ID),
          lambda:f_back(ppos), lambda:ppos)

      def ffn3(sstep, ppos):
        return control_flow_ops.cond(math_ops.equal(sstep, data_utils.turnLeft_ID),
          lambda:f_left(ppos), lambda:ffn4(sstep, ppos))

      def ffn2(sstep, ppos):
        return control_flow_ops.cond(math_ops.equal(sstep, data_utils.turnRight_ID),
          lambda:f_right(ppos), lambda:ffn3(sstep, ppos))

      def ffn1(sstep, ppos):
        return control_flow_ops.cond(math_ops.equal(sstep, data_utils.moveAct_ID),
          lambda:f_move(ppos), lambda:ffn2(sstep, ppos))

      for j in xrange(batch_size):
        # update position
        temp_pos = control_flow_ops.cond(math_ops.equal(_step[j], data_utils.noAct_ID),
          lambda:_position[j,:], lambda:ffn1(_step[j], _position[j,:]))
        # Reject moves that leave the grid: coordinates must stay in [0, 24]
        # (presumably a 25x25 map -- TODO confirm against mapIdx's shape).
        new_pos.append(control_flow_ops.cond(math_ops.logical_or(math_ops.greater(temp_pos[0], 24),
          math_ops.logical_or(math_ops.greater(temp_pos[1], 24),
            math_ops.logical_or(math_ops.less(temp_pos[0], 0), math_ops.less(temp_pos[1],0)))),
          lambda:_position[j,:], lambda:temp_pos))

        # update env: read the state vector at the (possibly unchanged) position
        new_env.append(array_ops.reshape(
            array_ops.slice(mapIdx, array_ops.pack([_mapNo[j], new_pos[-1][0], new_pos[-1][1], new_pos[-1][2], 0]), [1,1,1,1,state_size]),
            [state_size]))

      new_pos = array_ops.pack(new_pos)
      new_env = array_ops.pack(new_env)
      return new_pos, new_env

    outputs = []
    attentions = []
    environments = []
    positions = []
    prev = None

    # print(" Action info: no act=%d, move=%d, turn left=%d, turn right=%d, turn back=%d" %
    #   (data_utils.noAct_ID, data_utils.moveAct_ID, data_utils.turnLeft_ID, data_utils.turnRight_ID, data_utils.turnBack_ID))
    
    if decoder_inputs_positions and decoder_inputs_maps and batch_size:
      position = decoder_inputs_positions[0] # 2d tensor of shape [batch_size, 3]
      _, env = updateEnv(position, None, decoder_inputs_maps)
    for i in xrange(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]

      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))

      # Run the RNN.
      cur_output, new_state = cell(inp, states[-1])
      cur_output = array_ops.reshape(cur_output, [batch_size, attn_size])
      states.append(new_state)

      # Run the attention mechanism.
      h_attn, attn_weight = attention(cur_output)
      attentions.append(attn_weight)
      
      with vs.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear(h_attn, output_size, False)
      
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      
      if decoder_inputs_positions and decoder_inputs_maps and position:
        
        # update pos and env
        if loop_function:
          step = math_ops.argmax(nn_ops.softmax(prev), 1) # step is a list (len=batch_size) of int32 number
          position, env = updateEnv(position, step, decoder_inputs_maps)
        else:
          if i < len(decoder_inputs_positions) - 1:
            position = decoder_inputs_positions[i+1]
          _, env = updateEnv(position, None, decoder_inputs_maps)

      outputs.append(output)
      environments.append(env)
      positions.append(position)

  return outputs, states, attentions, environments, positions
Example #35
0
def local_attention(decoder_hidden_state, hidden_attn, window_size=10,
                    content_function=vinyals_kaiser, dtype=tf.float32):
    """Put local attention on hidden using decoder hidden states and the hidden states of encoder (hidden_attn).

    A window centre ``pt`` is predicted from the decoder state as
    ``floor(attn_length * sigmoid(sum(vp * tanh(linear(decoder_hidden_state)))))``.
    Content scores outside ``[pt - window_size + 1, pt + window_size]`` are
    multiplied by zero, the result is passed through a softmax and then
    re-weighted by ``exp(-(idx - pt)^2 / sigma^2)`` with ``sigma = window_size / 2``.

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output of the recurrent layers).
        Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the recurrent layers). It has
        shape (?, timesteps, 1, decoder_size) so it is possible to apply a 1-D convolution to calculate
        the attention score more efficiently.
        NOTE(review): the body reads ``attention_vec_size`` from axis 2 and multiplies
        ``hidden_attn`` by a rank-3 weight tensor, which looks like it expects a rank-3
        (?, timesteps, size) input -- confirm against the caller.
    window_size : int
        Size of each side of the window to use when applying local attention. Not relevant to global
        attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder hidden states to extract their
        weights. Called as ``content_function(hidden_attn, decoder_hidden_state)``.
        Default to 'vinyals_kaiser'. Must not be None.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32

    Returns
    -------
    ds : Tensor
        Context vector: the attention-weighted sum of ``hidden_attn`` over axis 1
        (one context vector per batch sample).
    summaries : list
        The three histogram summaries created here, in order: window predictions,
        masked scores (alpha) and the context vector.

    """
    assert content_function is not None
    # NOTE(review): with an int window_size under Python 2 this is integer division.
    sigma = window_size / 2
    denominator = sigma ** 2

    print('decode_hidden_state', decoder_hidden_state.get_shape())

    attention_vec_size = hidden_attn.get_shape()[2].value
    attn_length = hidden_attn.get_shape()[1].value

    # Static batch size; None when the batch dimension is unknown at graph-build time.
    batch_size = hidden_attn.get_shape()[0].value

    with vs.variable_scope("AttentionLocal"):

        # apply content function to score the hidden states from the encoder
        s = content_function(hidden_attn, decoder_hidden_state)

        from tensorflow.python.ops.rnn_cell import _linear as linear
        with vs.variable_scope("WindowPrediction"):
            # Wp * ht (with bias): projection of the decoder state used to predict the window centre
            ht = linear([decoder_hidden_state], attention_vec_size, True)

        # get the parameters (vp)
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size])

        # tanh(Wp*ht)
        tanh = math_ops.tanh(ht)
        # S * sigmoid(vp * tanh(Wp*ht))  - this is going to return a number
        # for each sentence in the batch - i.e., a tensor of shape batch x 1
        S = attn_length
        print('tanh', tanh.get_shape())
        pt = math_ops.reduce_sum((vp * tanh), 1)
        pt = math_ops.sigmoid(pt) * S

        # now we get only the integer part of the values
        pt = tf.floor(pt)

        his1 = tf.histogram_summary('local_window_predictions', pt)

        # we now create a tensor containing the indices representing each position
        # of the sentence - i.e., if the sentence contain 5 tokens and batch_size is 3,
        # the resulting tensor will be:
        # [[0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]
        #  [0, 1, 2, 3, 4]]
        #
        indices = []
        for pos in xrange(attn_length):
            indices.append(pos)
        # NOTE(review): Python list repetition -- raises TypeError if batch_size is None
        # (unknown static batch dimension).
        indices = indices * batch_size
        idx = tf.convert_to_tensor(tf.to_float(indices), dtype=dtype)
        idx = tf.reshape(idx, [-1, attn_length])
        print('batch_size', batch_size)
        # print(idx.get_shape())

        # here we calculate the boundaries of the attention window based on the positions
        low = pt - window_size + 1  # we add one because the floor op already generates the first position
        high = pt + window_size

        # here we check our positions against the boundaries
        # (expand to [batch, 1] so the comparison broadcasts over the positions in idx)
        low = tf.expand_dims(low, -1)
        high = tf.expand_dims(high, -1)
        print(idx.get_shape())
        print(low.get_shape())
        print(attn_length)
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)

        # now we combine both into a pre-mask that has 0s and 1s switched
        # i.e, at this point, True == 0 and False == 1
        m = mlow + mhigh  # batch_size

        # here we switch the 0s to 1s and the 1s to 0s
        # we correct the values so True == 1 and False == 0
        mask = tf.to_float(tf.equal(m, 0.0))

        # here we switch off all the values that fall outside the window
        # first we switch off those in the truncated normal
        # NOTE(review): scores outside the window become 0 *before* the softmax, so
        # those positions still receive exp(0) mass rather than zero weight -- confirm
        # this is intended.
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        his2 = tf.histogram_summary('local_alpha_weights', alpha)

        # here we calculate the 'truncated normal distribution'
        # i.e. exp(-(idx - pt)^2 / sigma^2) for every position
        # print(pt.get_shape())
        pt = tf.expand_dims(pt, -1)
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)  # result of the truncated normal distribution

        # final attention weights: windowed softmax scaled by the Gaussian factor
        at = masked_soft * e

        # Now calculate the attention-weighted vector d.
        print('at shape', at.get_shape())
        ds = math_ops.reduce_sum( tf.expand_dims(at, -1) * hidden_attn, 1)
        print(ds.get_shape())
        # ds = array_ops.reshape(d, [-1, attention_vec_size])

    his3 = tf.histogram_summary('local_attention_context', ds)

    return ds, [ his1, his2, his3 ]
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False, output_projection=None, beam_size=10):
  """RNN decoder with attention and beam-search bookkeeping.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This implementation is based on
  http://arxiv.org/abs/1412.7449.

  After the first time-step the decoder state is tiled beam_size times along
  the batch axis, so every later step runs on beam_size copies.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function is applied to the previous output
      to generate the next input, and decoder_inputs is ignored except for the
      first element ("GO" symbol).
      Signature -- loop_function(prev, i, log_beam_probs, beam_path,
      beam_symbols) = next
        * prev is the previous 2D output Tensor,
        * i is an integer, the step number,
        * log_beam_probs, beam_path, beam_symbols are Python lists owned by
          this decoder; nothing in this function appends to them, so they are
          only populated if loop_function does so,
        * next is a 2D Tensor used as the next input.
    dtype: The dtype to use for the initial (zero) attention vectors.
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.
    output_projection: Pair (W, B) applied to every output with xw_plus_b
      before taking the argmax. Required.
    beam_size: Number of copies of the state kept after the first step, and
      the width beam_path / beam_symbols are reshaped to.

  Returns:
    A tuple (outputs, state, beam_path, beam_symbols), where:
      outputs: A list of the same length as decoder_inputs; element i is the
        argmax over dimension 1 of the projected output at step i.
      state: The decoder state after the final time-step.
      beam_path: The collected beam_path entries concatenated along axis 0 and
        reshaped to [-1, beam_size].
      beam_symbols: The collected beam_symbols entries concatenated along axis
        0 and reshaped to [-1, beam_size].
      NOTE(review): both lists stay empty when loop_function is None, in which
      case the final concat receives an empty list -- confirm callers always
      pass a loop_function.

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      [1] and [2] of attention_states are not set, or output_projection is
      missing.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  # Both attn_length (dim 1) and attn_size (dim 2) are used as Python ints
  # below, so the slice has to cover [1:3]; [1:2] only checked dim 1.
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_projection is None:
    # The projection is indexed unconditionally at every step; fail up front
    # with a clear message instead of a TypeError after building the graph.
    raise ValueError("output_projection (W, B) is required by "
                     "beam_attention_decoder.")
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    print("Initial_state")

    state_size = int(initial_state.get_shape().with_rank(2)[1])
    # Before the first step there is a single hypothesis per batch element.
    states = [initial_state]
    state = tf.reshape(tf.concat(0, states), [-1, state_size])

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for head in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % head):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[head] * math_ops.tanh(hidden_features[head] + y), [2, 3])
          weights = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    if initial_state_attention:
      attns = []
      attns.append(attention(initial_state))
      tmp = tf.reshape(tf.concat(0, attns), [-1, attn_size])
      attns = []
      attns.append(tmp)

    # Beam bookkeeping; handed to loop_function, which is expected to fill it.
    log_beam_probs, beam_path, beam_symbols = [], [], []
    for i, inp in enumerate(decoder_inputs):

      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          if prev is not None:
            inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)

      # Merge input and previous attentions into one vector of the right size.
      input_size = inp.get_shape().with_rank(2)[1]
      x = linear([inp] + attns, input_size, True)
      cell_output, state = cell(x, state)

      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
        # Fan out to beam_size hypotheses: replicate the state along the
        # batch axis and recompute the attention reads for the tiled state.
        states = [state] * beam_size
        state = tf.reshape(tf.concat(0, states), [-1, state_size])
        with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
          attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), dimension=1))

  return outputs, state, tf.reshape(tf.concat(0, beam_path),[-1,beam_size]), tf.reshape(tf.concat(0, beam_symbols),[-1,beam_size])
Example #37
0
def attention_encoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      num_heads=1,
                      output_size=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """
    Encoder that receives attention from another encoder

    Parameters
    ----------
    decoder_inputs:
        second encoder's input we call it a decoder's input
        it should be already wrapped by add_embedding()
        It is wrapped in a one-element list here, so the step loop below
        runs exactly once.
    initial_state:
        2D Tensor (batch_size x cell.state_size).
    attention_states:
        3D Tensor (batch_size x attn_length (seq_length) x attn_size)
    cell:
        RNN cell; called as cell(x, state), and its input_size / output_size
        attributes are read.
    num_heads:
        number of attention heads that read from attention_states.
    output_size:
        size of the output vectors; if None, cell.output_size is used.
    dtype:
        dtype of the initial (zero) attention vectors.
    scope:
        name of the variable scope opened for the shape lookups; default
        "attention_encoder". NOTE(review): only the three shape lookups sit
        inside this scope -- every variable below is created in the caller's
        current scope. Moving them would rename variables, so it is left as is.
    initial_state_attention:
        If False (default), initial attentions are zero. If True, they are
        computed from initial_state.

    Returns
    -------
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape (batch_size x cell.state_size).

    """
    decoder_inputs = [decoder_inputs
                      ]  # in original model this is a bucket list of inputs

    # Mirror attention_decoder: without this, the default output_size=None was
    # passed straight to the output projection.
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "attention_encoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(attention_states,
                               [-1, attn_length, 1, attn_size])
    hidden_features = []
    for a in xrange(num_heads):
        k = vs.get_variable("AttnW_%d" % a,
                            [1, 1, attn_size, attention_vec_size])
        hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

    def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        ds = []  # Results of attention reads will be stored here.
        for head in xrange(num_heads):
            with vs.variable_scope("Attention_%d" % head):
                y = rnn_cell.linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v[head] * math_ops.tanh(hidden_features[head] + y), [2, 3])
                weights = tf.nn.softmax(s)
                # Now calculate the attention-weighted vector d.
                d = math_ops.reduce_sum(
                    array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
                    [1, 2])
                ds.append(array_ops.reshape(d, [-1, attn_size]))
        return ds

    outputs = []
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [
        array_ops.zeros(batch_attn_size, dtype=dtype)
        for _ in xrange(num_heads)
    ]

    for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])
    if initial_state_attention:
        attns = attention(initial_state)

    state = initial_state

    # Iterates over the wrapped inputs (a single element, see above).
    for i, inp in enumerate(decoder_inputs):
        if i > 0:
            vs.get_variable_scope().reuse_variables()
        # Merge input and previous attentions into one vector of the right size.
        x = rnn_cell.linear([inp] + attns, cell.input_size, True)
        # Run the RNN.
        cell_output, state = cell(x, state)
        # Run the attention mechanism.
        if i == 0 and initial_state_attention:
            # The attention variables already exist from the initial read.
            with vs.variable_scope(vs.get_variable_scope(), reuse=True):
                attns = attention(state)
        else:
            attns = attention(state)

        with vs.variable_scope("AttnOutputProjection"):
            output = rnn_cell.linear([cell_output] + attns, output_size, True)

        outputs.append(output)

    # we only want the last state
    return outputs, state
Example #38
0
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      Both attn_length and attn_size must be statically known.
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the initial (zero) attention vectors
      (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    state: The state of each decoder cell the final time-step.
      It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      [1] and [2] of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  # attn_length (dim 1) and attn_size (dim 2) are both consumed as Python ints
  # below; the slice must be [1:3] to cover both ([1:2] only checked dim 1).
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = vs.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for head in xrange(num_heads):
        with vs.variable_scope("Attention_%d" % head):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[head] * math_ops.tanh(hidden_features[head] + y), [2, 3])
          weights = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)
    for i in xrange(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        # The attention variables were already created by the initial read
        # above, so this first call has to reuse them.
        with vs.variable_scope(vs.get_variable_scope(), reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with vs.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)

  return outputs, state
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None):
  """RNN decoder with attention that records the state of every time-step.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is the previous output, a 2D Tensor [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype of the zero tensors used as the attention reads fed to
      the first time-step (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: A list of length len(decoder_inputs) + 1. The first item is
      initial_state; item i + 1 is the state returned by the cell at
      time-step i.

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  # The [1:3] slice covers both attn_length (dim 1) and attn_size (dim 2);
  # both are read as static Python ints below.
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for head in xrange(num_heads):
      k = vs.get_variable("AttnW_%d" % head,
                          [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(vs.get_variable("AttnV_%d" % head, [attention_vec_size]))

    # Seeded with the initial state so that states[-1] is always the state
    # to feed into the next cell call.
    states = [initial_state]

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for head in xrange(num_heads):
        with vs.variable_scope("Attention_%d" % head):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[head] * math_ops.tanh(hidden_features[head] + y), [2, 3])
          weights = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for attn in attns:  # Ensure the second shape of attention vectors is set.
      attn.set_shape([None, attn_size])
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        # Every step after the first shares the variables created at step 0.
        vs.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN on the merged input and the most recent state.
      cell_output, new_state = cell(x, states[-1])
      states.append(new_state)
      # Run the attention mechanism.
      attns = attention(new_state)
      with vs.variable_scope("AttnOutputProjection"):
        # Project the cell output together with the attention reads down to
        # output_size.
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)

  return outputs, states
Example #40
0
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is the previous output, a 2D Tensor [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype of the zero tensors used as the initial attention reads
      (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of the decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  # The [1:3] slice covers both attn_length (dim 1) and attn_size (dim 2);
  # both are read as static Python ints below.
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for head in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % head,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % head,
                                           [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for head in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % head):
          y = rnn_cell.linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[head] * math_ops.tanh(hidden_features[head] + y), [2, 3])
          weights = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for attn in attns:  # Ensure the second shape of attention vectors is set.
      attn.set_shape([None, attn_size])
    if initial_state_attention:
      # Replace the zero attentions by a read computed from the initial state.
      attns = attention(initial_state)
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        # Every step after the first shares the variables created at step 0.
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = array_ops.stop_gradient(loop_function(prev, i))
      # Merge input and previous attentions into one vector of the right size.
      x = rnn_cell.linear([inp] + attns, cell.input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        # The attention variables were already created by the call on
        # initial_state above, so this first in-loop call must reuse them.
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = rnn_cell.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = array_ops.stop_gradient(output)
      outputs.append(output)

  return outputs, state
Example #41
0
def pointer_decoder(decoder_inputs,
                    initial_state,
                    attention_states,
                    cell,
                    feed_prev=True,
                    dtype=dtypes.float32,
                    scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.

    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      feed_prev: if True (default), the input of every step after the first is
        replaced by a sum over decoder_inputs weighted by the softmax of the
        previous step's output; no gradient flows through that generated input.
        NOTE(review): the reshape used for this assumes
        len(decoder_inputs) == attn_length -- confirm against callers.
      dtype: The dtype of the zero tensor merged with every step's input
        (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "point_decoder".

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x attn_length]. Output i holds the unnormalised
        pointer scores of step i, computed as follows. First, we run the cell
        on a combination of the input and an all-zero attention vector:
          cell_output, new_state = cell(linear(input, zeros), prev_state).
        Then, we score every attention position:
          output = V^T * tanh(W * attention_states + U * new_state).
        No softmax is applied to the returned scores.
      states: A list of length len(decoder_inputs) + 1. The first item is
        initial_state; item i + 1 is the state returned by the cell at
        time-step i.
      inps: The list of generated inputs used for steps 1..len - 1 when
        feed_prev is True; an empty list otherwise.

    Raises:
      ValueError: when there are no inputs, or shapes of attention_states are
        not set.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # The [1:3] slice covers both attn_length (dim 1) and attn_size (dim 2);
    # both are read as static Python ints below.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        # Seeded with the initial state so that states[-1] is always the state
        # to feed into the next cell call.
        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = rnn_cell.linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Pointer scores are v^T * tanh(...); the softmax is left to
                # the caller.
                return math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])

        outputs = []
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        # This zero vector is never updated; it is merged with every input.
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])
        inps = []

        # The stacked view of decoder_inputs does not depend on the step, so
        # build it once instead of once per time-step.
        candidates = None
        if feed_prev and len(decoder_inputs) > 1:
            candidates = tf.pack(decoder_inputs)
            candidates = tf.transpose(candidates, perm=[1, 0, 2])
            candidates = tf.reshape(candidates,
                                    [-1, attn_length, input_size])

        scores = None  # Output of the previous step, used when feed_prev.
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                # Every step after the first shares the step-0 variables.
                vs.get_variable_scope().reuse_variables()

            if feed_prev and i > 0:
                # Soft selection of the next input from all decoder_inputs.
                pointer = tf.reshape(tf.nn.softmax(scores),
                                     [-1, attn_length, 1])
                inp = tf.reduce_sum(candidates * pointer, 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Merge input and attention vector into one vector of the right
            # size.
            x = rnn_cell.linear([inp, attns], cell.input_size, True)
            # Run the RNN; only the new state is used below.
            _, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            scores = attention(new_state)

            outputs.append(scores)

    return outputs, states, inps