Beispiel #1
0
    def __call__(self, inputs, state, scope=None):
        """Run one step of F-LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
          scope: not used

        Returns:
          A tuple containing:

          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            F-LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of F-LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        (c_prev, m_prev) = state

        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")
        with vs.variable_scope(scope or "flstm_cell",
                               initializer=self._initializer):
            with vs.variable_scope("factor"):
                fact = linear([inputs, m_prev], self._factor_size, False)
            concat = linear(fact, 4 * self._num_units, True)
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(value=concat,
                                         num_or_size_splits=4,
                                         axis=1)

        c = math_ops.sigmoid(f +
                             self._forget_bias) * c_prev + math_ops.sigmoid(
                                 i) * math_ops.tanh(j)
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                m = linear(m, self._num_proj, bias=False, scope=scope)

        new_state = LSTMStateTuple(c, m)
        return m, new_state
Beispiel #2
0
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        from tensorflow.python.ops import array_ops
        from tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl import _linear as linear
        with tf.variable_scope(scope or "basic_lstm_cell"):
            # Parameters of gates are concatenated into one multiply for efficiency.
            if self._state_is_tuple:
                c, h = state
            else:
                c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
            concat = linear([inputs, h], 4 * self._num_units, True, scope=scope)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
            i = ln(i, scope='i/')
            j = ln(i, scope='j/')
            f = ln(i, scope='f/')
            o = ln(i, scope='o/')

            new_c = (c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) *
                     self._activation(j))
            new_h = self._activation(new_c) * tf.sigmoid(o)

            if self._state_is_tuple:
                new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
            else:
                new_state = array_ops.concat([new_c, new_h], 1)
            return new_h, new_state
Beispiel #3
0
                def attention(query, prev_alpha):
                    """Calculate attention weights."""
                    with variable_scope.variable_scope("Attention"):
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        if self.use_conv:
                            conv_features = nn_ops.conv2d(
                                prev_alpha, F, [1, 1, 1, 1], "SAME")
                            feat_reshape = nn_ops.conv2d(
                                conv_features, U, [1, 1, 1, 1], "SAME")
                            s = math_ops.reduce_sum(
                                v * math_ops.tanh(hidden_features + y +
                                                  feat_reshape), [2, 3])
                        else:
                            s = math_ops.reduce_sum(
                                v * math_ops.tanh(hidden_features + y), [2, 3])

                        alpha = nn_ops.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha,
                                                reduction_indices=[1],
                                                keep_dims=True) + 1e-12
                        norm_term = tf.tile(sum_vec,
                                            tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term
                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(alpha * hidden, [1, 2])
                        d = array_ops.reshape(d, [-1, attn_size])

                    return tuple([d, alpha])
  def testLinear(self):
    with self.test_session() as sess:
      with tf.variable_scope("root", initializer=tf.constant_initializer(1.0)):
        x = tf.zeros([1, 2])
        l = linear([x], 2, False)
        sess.run([tf.global_variables_initializer()])
        res = sess.run([l], {x.name: np.array([[1., 2.]])})
        self.assertAllClose(res[0], [[3.0, 3.0]])

        # Checks prevent you from accidentally creating a shared function.
        with self.assertRaises(ValueError):
          l1 = linear([x], 2, False)

        # But you can create a new one in a new scope and share the variables.
        with tf.variable_scope("l1") as new_scope:
          l1 = linear([x], 2, False)
        with tf.variable_scope(new_scope, reuse=True):
          linear([l1], 2, False)
        self.assertEqual(len(tf.trainable_variables()), 2)
    def testLinear(self):
        with self.test_session() as sess:
            with variable_scope.variable_scope(
                    "root", initializer=init_ops.constant_initializer(1.0)):
                x = array_ops.zeros([1, 2])
                l = linear([x], 2, False)
                sess.run([variables_lib.global_variables_initializer()])
                res = sess.run([l], {x.name: np.array([[1., 2.]])})
                self.assertAllClose(res[0], [[3.0, 3.0]])

                # Checks prevent you from accidentally creating a shared function.
                with self.assertRaises(ValueError):
                    l1 = linear([x], 2, False)

                # But you can create a new one in a new scope and share the variables.
                with variable_scope.variable_scope("l1") as new_scope:
                    l1 = linear([x], 2, False)
                with variable_scope.variable_scope(new_scope, reuse=True):
                    linear([l1], 2, False)
                self.assertEqual(len(variables_lib.trainable_variables()), 2)
Beispiel #6
0
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   ds = []  # Results of attention reads will be stored here.
   for a in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % a):
       y = linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
       a = nn_ops.softmax(s)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return ds
Beispiel #7
0
def basic_rnn_cell(inputs, state, num_units, scope=None):
    if state is None:
        if inputs is not None:
            batch_size = inputs.get_shape()[0]
            dtype = inputs.dtype
        else:
            batch_size = 0
            dtype = tf.float32
        init_output = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
        init_state = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
        init_output.set_shape([batch_size, num_units])
        init_state.set_shape([batch_size, num_units])
        return init_output, init_state
    else:
        with tf.variable_scope(scope, "basic_rnn_cell", [inputs, state]):
            output = tf.tanh(linear([inputs, state], num_units, True))
        return output, output
def basic_rnn_cell(inputs, state, num_units, scope=None):
  if state is None:
    if inputs is not None:
      batch_size = inputs.get_shape()[0]
      dtype = inputs.dtype
    else:
      batch_size = 0
      dtype = tf.float32
    init_output = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
    init_state = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
    init_output.set_shape([batch_size, num_units])
    init_state.set_shape([batch_size, num_units])
    return init_output, init_state
  else:
    with tf.variable_scope(scope, "basic_rnn_cell", [inputs, state]):
      output = tf.tanh(linear([inputs, state],
                              num_units, True))
    return output, output
Beispiel #9
0
            def attn_loop_function(time, cell_output, state, loop_state):
                def attention(query, prev_alpha):
                    """Calculate attention weights."""
                    with variable_scope.variable_scope("Attention"):
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        s = math_ops.reduce_sum(
                            v * math_ops.tanh(hidden_features + y), [2, 3])

                        alpha = nn_ops.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha,
                                                reduction_indices=[1],
                                                keep_dims=True) + 1e-12
                        norm_term = tf.tile(sum_vec,
                                            tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term
                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(alpha * hidden, [1, 2])
                        d = array_ops.reshape(d, [-1, attn_size])

                    return tuple([d, alpha])

                # If loop_function is set, we use it instead of decoder_inputs.
                elements_finished = (time >= seq_len)
                finished = tf.reduce_all(elements_finished)

                if cell_output is None:
                    next_state = final_state
                    output = None
                    loop_state = tuple([attn, alpha])
                    next_input = inputs_ta.read(time)
                else:
                    next_state = state
                    loop_state = attention(cell_output, loop_state[1])
                    with variable_scope.variable_scope("AttnOutputProjection"):
                        output = linear([cell_output, loop_state[0]],
                                        self.cell.output_size, True)

                    if loop_function is not None:
                        simple_input = loop_function(output)
                        # print ("Yolo")
                    else:
                        simple_input = tf.cond(
                            finished,
                            lambda: tf.zeros([batch_size, embedding_size],
                                             dtype=tf.float32),
                            lambda: inputs_ta.read(time))

                    # Merge input and previous attentions into one vector of
                    # the right size.
                    input_size = simple_input.get_shape().with_rank(2)[1]
                    if input_size.value is None:
                        raise ValueError("Could not infer input size")
                    with variable_scope.variable_scope("InputProjection"):
                        next_input = linear([simple_input, loop_state[0]],
                                            input_size, True)

                return (elements_finished, next_input, next_state, output,
                        loop_state)
Beispiel #10
0
def beam_attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False, output_projection=None, beam_size=10):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    print("Initial_state")

    state = initial_state
    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          # for c in range(ct):
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = tf.stack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    if initial_state_attention:
       attns = []
       attns.append(attention(initial_state))
       tmp = tf.reshape(tf.concat(axis=0, values=attns), [-1, attn_size])
       attns = []
       attns.append(tmp)

    log_beam_probs, beam_path, beam_symbols = [],[],[]
    for i, inp in enumerate(decoder_inputs):

      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None :
        with variable_scope.variable_scope("loop_function", reuse=True):
            if prev is not None:
                inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols)

      input_size = inp.get_shape().with_rank(2)[1]
      x = linear([inp] + attns, input_size, True)
      cell_output, state = cell(x, state)

      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
          attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if  i ==0:
          with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
                attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), axis=1))

  return outputs, state, tf.reshape(tf.concat(axis=0, values=beam_path),[-1,beam_size]), tf.reshape(tf.concat(axis=0, values=beam_symbols),[-1,beam_size])
Beispiel #11
0
    def __call__(self, inputs, state, scope=None):
        """Run one step of G-LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
          scope: not used

        Returns:
          A tuple containing:

          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            G-LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of G-LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        (c_prev, m_prev) = state

        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")
        dtype = inputs.dtype
        with vs.variable_scope(scope or "glstm_cell",
                               initializer=self._initializer):
            i_parts = []
            j_parts = []
            f_parts = []
            o_parts = []

            for group_id in xrange(self._number_of_groups):
                with vs.variable_scope("group%d" % group_id):
                    x_g_id = array_ops.concat([
                        self._get_input_for_group(inputs, group_id,
                                                  self._group_shape[0]),
                        self._get_input_for_group(m_prev, group_id,
                                                  self._group_shape[0])
                    ],
                                              axis=1)
                    R_k = linear(x_g_id,
                                 4 * self._group_shape[1],
                                 bias=False,
                                 scope=scope)  #will add per gate biases later
                    i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)

                i_parts.append(i_k)
                j_parts.append(j_k)
                f_parts.append(f_k)
                o_parts.append(o_k)

            #it is more efficient to have per gate biases then per gate, per group
            bi = vs.get_variable(name="biases_i",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bj = vs.get_variable(name="biases_j",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bf = vs.get_variable(name="biases_f",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bo = vs.get_variable(name="biases_o",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))

            i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
            j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
            f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
            o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)

        c = math_ops.sigmoid(f +
                             self._forget_bias) * c_prev + math_ops.sigmoid(
                                 i) * math_ops.tanh(j)
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                m = linear(m, self._num_proj, bias=False, scope=scope)

        new_state = LSTMStateTuple(c, m)
        return m, new_state