def body(i, prev_c, prev_h, actions, log_probs):
      # pylint: disable=g-long-lambda
      signal = control_flow_ops.cond(
          math_ops.equal(i, 0),
          lambda: array_ops.tile(device_go_embedding,
                                 [self.hparams.num_children, 1]),
          lambda: embedding_ops.embedding_lookup(device_embeddings,
                                                 actions.read(i - 1))
      )
      if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
      next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
      query = math_ops.matmul(next_h, attn_w_2)
      query = array_ops.reshape(
          query, [self.hparams.num_children, 1, self.hparams.hidden_size])
      query = math_ops.tanh(query + attn_mem)
      query = array_ops.reshape(query, [
          self.hparams.num_children * self.num_groups, self.hparams.hidden_size
      ])
      query = math_ops.matmul(query, attn_v)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups])
      query = nn_ops.softmax(query)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups, 1])
      query = math_ops.reduce_sum(attn_mem * query, axis=1)
      query = array_ops.concat([next_h, query], axis=1)
      logits = math_ops.matmul(query, device_softmax)
      logits /= self.hparams.temperature
      if self.hparams.tanh_constant > 0:
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
      if self.hparams.logits_std_noise > 0:
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step, lambda: logits,
            lambda: logits + logits_noise)

      if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
      elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
      elif mode == "target":
        next_y = array_ops.slice(y, [0, i], [-1, 1])
      else:
        raise NotImplementedError
      next_y = math_ops.to_int32(next_y)
      next_y = array_ops.reshape(next_y, [self.hparams.num_children])
      actions = actions.write(i, next_y)
      log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=next_y)
      return i + 1, next_c, next_h, actions, log_probs
 def LSTMCell(cls, x, mprev, cprev, weights):
   xm = array_ops.concat([x, mprev], 1)
   i_i, i_g, f_g, o_g = array_ops.split(
       value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
   new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
       i_g) * math_ops.tanh(i_i)
   new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
   new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
   return new_m, new_c
def _bahdanau_score(processed_query, keys, normalize):
  """Implements Bahdanau-style (additive) scoring function.

  This attention has two forms.  The first is Bahdanau attention,
  as described in:

  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
  "Neural Machine Translation by Jointly Learning to Align and Translate."
  ICLR 2015. https://arxiv.org/abs/1409.0473

  The second is the normalized form.  This form is inspired by the
  weight normalization article:

  Tim Salimans, Diederik P. Kingma.
  "Weight Normalization: A Simple Reparameterization to Accelerate
   Training of Deep Neural Networks."
  https://arxiv.org/abs/1602.07868

  To enable the second form, set `normalize=True`.

  Args:
    processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
    normalize: Whether to normalize the score function.

  Returns:
    A `[batch_size, max_time]` tensor of unnormalized score values.
  """
  dtype = processed_query.dtype
  # Get the number of hidden units from the trailing dimension of keys
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable(
      "attention_v", [num_units], dtype=dtype)
  if normalize:
    # Scalar used in weight normalization
    g = variable_scope.get_variable(
        "attention_g", dtype=dtype,
        initializer=math.sqrt((1. / num_units)))
    # Bias added prior to the nonlinearity
    b = variable_scope.get_variable(
        "attention_b", [num_units], dtype=dtype,
        initializer=init_ops.zeros_initializer())
    # normed_v = g * v / ||v||
    normed_v = g * v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(v)))
    return math_ops.reduce_sum(
        normed_v * math_ops.tanh(keys + processed_query + b), [2])
  else:
    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
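# A minimal NumPy sketch of the unnormalized additive (Bahdanau) score computed by
# _bahdanau_score above. All shapes and values here are illustrative assumptions;
# the array `v` stands in for the learned "attention_v" variable.
import numpy as np

batch_size, max_time, num_units = 2, 3, 4
processed_query = np.random.randn(batch_size, num_units)
keys = np.random.randn(batch_size, max_time, num_units)
v = np.random.randn(num_units)

# Broadcasting the query over max_time mirrors expand_dims(processed_query, 1).
score = np.sum(v * np.tanh(keys + processed_query[:, None, :]), axis=2)
print(score.shape)  # (2, 3) == [batch_size, max_time]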
    def attention(decoder_state, coverage=None):
      """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
      with variable_scope.variable_scope("Attention"):
        # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
        decoder_features = linear(decoder_state, attention_vec_size, True) # shape (batch_size, attention_vec_size)
        decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1) # reshape to (batch_size, 1, 1, attention_vec_size)

        def masked_attention(e):
          """Take softmax of e then apply enc_padding_mask and re-normalize"""
          attn_dist = nn_ops.softmax(e) # take softmax. shape (batch_size, attn_length)
          attn_dist *= enc_padding_mask # apply mask
          masked_sums = tf.reduce_sum(attn_dist, axis=1) # shape (batch_size)
          return attn_dist / tf.reshape(masked_sums, [-1, 1]) # re-normalize

        if use_coverage and coverage is not None: # non-first step of coverage
          # Multiply coverage vector by w_c to get coverage_features.
          coverage_features = nn_ops.conv2d(coverage, w_c, [1, 1, 1, 1], "SAME") # c has shape (batch_size, attn_length, 1, attention_vec_size)

          # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
          e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3])  # shape (batch_size,attn_length)

          # Calculate attention distribution
          attn_dist = masked_attention(e)

          # Update coverage vector
          coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        else:
          # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
          e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # calculate e

          # Calculate attention distribution
          attn_dist = masked_attention(e)

          if use_coverage: # first step of training
            coverage = tf.expand_dims(tf.expand_dims(attn_dist,2),2) # initialize coverage

        # Calculate the context vector from attn_dist and encoder_states
        context_vector = math_ops.reduce_sum(array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states, [1, 2]) # shape (batch_size, attn_size).
        context_vector = array_ops.reshape(context_vector, [-1, attn_size])

      return context_vector, attn_dist, coverage
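# The masked_attention helper above softmaxes the scores, zeroes out padded encoder
# positions, and re-normalizes so every row still sums to one. A small NumPy sketch
# of that step; enc_padding_mask values and shapes are illustrative assumptions.
import numpy as np

def masked_attention_np(e, enc_padding_mask):
  attn_dist = np.exp(e - e.max(axis=1, keepdims=True))
  attn_dist /= attn_dist.sum(axis=1, keepdims=True)   # softmax over attn_length
  attn_dist *= enc_padding_mask                        # zero out padded positions
  masked_sums = attn_dist.sum(axis=1, keepdims=True)   # shape (batch_size, 1)
  return attn_dist / masked_sums                       # rows sum to 1 again

e = np.array([[0.5, 1.0, -2.0], [2.0, 0.1, 0.3]])
mask = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])    # last token of row 0 is padding
print(masked_attention_np(e, mask).sum(axis=1))        # [1. 1.]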
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      c, h = array_ops.split(1, 2, state)
      concat = linear([inputs, h], 4 * self._num_units, True)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i, j, f, o = array_ops.split(1, 4, concat)

      new_c = c * sigmoid(f + self._forget_bias) + sigmoid(i) * tanh(j)
      new_h = tanh(new_c) * sigmoid(o)

      return new_h, array_ops.concat(1, [new_c, new_h])
Example #6
  def embed(self, func, embedding_classes, embedding_size, inputs, dtype=None, scope=None,
            keep_prob=1.0, initializer=None):
    embedder_cell = func(self._cell, embedding_classes, embedding_size, initializer=initializer)

    # Like rnn(..) in rnn.py, but we call only the Embedder, not the RNN cell
    outputs = []
    with vs.variable_scope(scope or "Embedder") as varscope:
      if varscope.caching_device is None:
        varscope.set_caching_device(lambda op: op.device)

      for time, input_ in enumerate(inputs):
        if time > 0: vs.get_variable_scope().reuse_variables()
        embedding = embedder_cell.__call__(input_, scope)
        if keep_prob < 1:
          embedding = tf.nn.dropout(embedding, keep_prob)

        # annotation = C~_t = tanh ( E(x_t) + b_c)
        b_c = tf.get_variable("annotation_b", [embedding_size])
        annotation = tanh(tf.nn.bias_add(embedding, b_c))

        # weighted annotation = i_t * C~_t
        # i = sigmoid ( E(x_t) + b_i)
        b_i = tf.get_variable("input_b", [embedding_size])
        i = sigmoid(tf.nn.bias_add(embedding, b_i))
        w_annotation = i * annotation
        outputs.append(w_annotation)

      # return empty state, will be initialized by decoder
      batch_size = array_ops.shape(inputs[0])[0]
      state = self._cell.zero_state(batch_size, dtype)
      return (outputs, state)
  def testOptimizerInit(self):
    with ops.Graph().as_default():
      layer_collection = lc.LayerCollection()

      inputs = array_ops.ones((2, 1)) * 2
      weights_val = np.ones((1, 1), dtype=np.float32) * 3.
      weights = variable_scope.get_variable(
          'w', initializer=array_ops.constant(weights_val))
      bias = variable_scope.get_variable(
          'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
      output = math_ops.matmul(inputs, weights) + bias

      layer_collection.register_fully_connected((weights, bias), inputs, output)

      logits = math_ops.tanh(output)
      targets = array_ops.constant([[0.], [1.]])
      output = math_ops.reduce_mean(
          nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))

      layer_collection.register_categorical_predictive_distribution(logits)

      optimizer.KfacOptimizer(
          0.1,
          0.2,
          0.3,
          layer_collection,
          momentum=0.5,
          momentum_type='regular')
Example #8
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   ds = []  # Results of attention reads will be stored here.
   if nest.is_sequence(query):  # If the query is a tuple, flatten it.
     query_list = nest.flatten(query)
     for q in query_list:  # Check that ndims == 2 if specified.
       ndims = q.get_shape().ndims
       if ndims:
         assert ndims == 2
     query = array_ops.concat(1, query_list)
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):                  
       y = linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       # multiply with source mask, then do softmax
       if src_mask is not None:
         s = s * src_mask
       a = nn_ops.softmax(s)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])                  
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return ds            
  def _logits_cumulative(self, inputs, stop_gradient):
    """Evaluate logits of the cumulative densities.

    Args:
      inputs: The values at which to evaluate the cumulative densities, expected
        to be a `Tensor` of shape `(channels, 1, batch)`.
      stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so
        that the gradient of the output with respect to the density model
        parameters is disconnected (the gradient with respect to `inputs` is
        left untouched).

    Returns:
      A `Tensor` of the same shape as `inputs`, containing the logits of the
      cumulative densities evaluated at the given inputs.
    """
    logits = inputs

    for i in range(len(self.filters) + 1):
      matrix = self._matrices[i]
      if stop_gradient:
        matrix = array_ops.stop_gradient(matrix)
      logits = math_ops.matmul(matrix, logits)

      bias = self._biases[i]
      if stop_gradient:
        bias = array_ops.stop_gradient(bias)
      logits += bias

      if i < len(self._factors):
        factor = self._factors[i]
        if stop_gradient:
          factor = array_ops.stop_gradient(factor)
        logits += factor * math_ops.tanh(logits)

    return logits
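# Each layer of _logits_cumulative applies an affine map followed by the gated
# correction logits += factor * tanh(logits). A toy NumPy version of one such layer
# with made-up parameter values, kept in the (channels, 1, batch) layout from the
# docstring; in the class these parameters come from self._matrices, self._biases
# and self._factors.
import numpy as np

channels, batch = 1, 5
inputs = np.linspace(-2.0, 2.0, batch).reshape(channels, 1, batch)

matrix = np.array([[[1.5]]])   # hypothetical (channels, 1, 1) weight
bias = np.array([[[0.1]]])
factor = np.array([[[0.3]]])

logits = np.matmul(matrix, inputs) + bias
logits += factor * np.tanh(logits)
print(logits.shape)  # (1, 1, 5), same layout as the inputs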
Example #10
  def call(self, inputs, state):
    """
    """
    (c_prev, m_prev) = state
    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope, initializer=self._initializer):
      x = array_ops.concat([inputs, m_prev], axis=1)
      with vs.variable_scope("first_gemm"):
        if self._linear1 is None:
          # no bias for bottleneck
          self._linear1 = _Linear(x, self._fact_size, False)
        R_fact = self._linear1(x)
      with vs.variable_scope("second_gemm"):
        if self._linear2 is None:
          self._linear2 = _Linear(R_fact, 4*self._num_units, True)
        R = self._linear2(R_fact)
      i, j, f, o = array_ops.split(R, 4, 1)

      c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
           math_ops.sigmoid(i) * math_ops.tanh(j))
      m = math_ops.sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
      with vs.variable_scope("projection"):
        if self._linear3 is None:
          self._linear3 = _Linear(m, self._num_proj, False)
        m = self._linear3(m)

    new_state = rnn_cell_impl.LSTMStateTuple(c, m)
    return m, new_state
  def __call__(self, inputs, state, scope=None):
    with _checked_scope(self, scope or "rwa_cell", reuse=self._reuse):
      h, n, d, a_max = state

      with vs.variable_scope("u"):
        u = _linear(inputs, self._num_units, True)

      with vs.variable_scope("g"):
        g = _linear([inputs, h], self._num_units, True)

      with vs.variable_scope("a"):
        a = _linear([inputs, h], self._num_units, False) # The bias term when factored out of the numerator and denominator cancels and is unnecessary

      z = tf.multiply(u, tanh(g))

      a_newmax = tf.maximum(a_max, a)
      exp_diff = tf.exp(a_max - a_newmax)
      exp_scaled = tf.exp(a - a_newmax)

      n = tf.multiply(n, exp_diff) + tf.multiply(z, exp_scaled)  # Numerically stable update of numerator
      d = tf.multiply(d, exp_diff) + exp_scaled  # Numerically stable update of denominator
      h_new = self._activation(tf.div(n, d))

      new_state = RWACellTuple(h_new, n, d, a_newmax)

    return h_new, new_state
 def attention(query, use_attention=False):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       if use_attention is False: # apply mean pooling
           weights = tf.tile(sequence_length, tf.stack([attn_length]))
           weights = array_ops.reshape(weights, tf.shape(s))
           a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
           # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
       else:
         a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
Example #13
 def downscale(self, inp):
   with vs.variable_scope("Downscale"):
     inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
     out2d = rnn_cell.linear(inp2d, self.size, True, 1.0)
     out3d = tf.reshape(out2d, [self.batch_size, -1, self.size])
     out3d = tf.transpose(out3d, perm=[1, 0, 2])
     out = tanh(out3d)
   return out
Example #14
 def __init__(self, num_units, encoder_output, scope=None):
   self.hs = encoder_output
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn1"):
       hs2d = tf.reshape(self.hs, [-1, num_units])
       phi_hs2d = tanh(rnn_cell.linear(hs2d, num_units, True, 1.0))
       self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
   super(GRUCellAttn, self).__init__(num_units)
 def attention(query):
     """Point on hidden using hidden_features and query."""
     with vs.variable_scope("Attention"):
         y = rnn_cell.linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v^T * tanh(...).
         s = math_ops.reduce_sum(
             v * math_ops.tanh(hidden_features + y), [2, 3])
         return s
 def _GenerateOrderedInputs(self, size, n):
   inputs = self._GenerateUnorderedInputs(size, 1)
   queue = data_flow_ops.FIFOQueue(
       capacity=1, dtypes=[inputs[0].dtype], shapes=[inputs[0].get_shape()])
   for _ in xrange(n - 1):
     op = queue.enqueue(inputs[-1])
     with ops.control_dependencies([op]):
       inputs.append(math_ops.tanh(1.0 + queue.dequeue()))
   return inputs
  def __call__(self, query, previous_alignments):
    """Score the query based on the keys and values.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.
      previous_alignments: Tensor of dtype matching `self.values` and shape
        `[batch_size, alignments_size]`
        (`alignments_size` is memory's `max_time`).

    Returns:
      alignments: Tensor of dtype matching `self.values` and shape
        `[batch_size, alignments_size]` (`alignments_size` is memory's
        `max_time`).
    """
    with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
      processed_query = self.query_layer(query) if self.query_layer else query
      dtype = processed_query.dtype
      # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
      processed_query = array_ops.expand_dims(processed_query, 1)
      keys = self._keys
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)
      if self._normalize:
        # Scalar used in weight normalization
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=math.sqrt((1. / self._num_units)))
        # Bias added prior to the nonlinearity
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # normed_v = g * v / ||v||
        normed_v = g * v * math_ops.rsqrt(
            math_ops.reduce_sum(math_ops.square(v)))
        score = math_ops.reduce_sum(
            normed_v * math_ops.tanh(keys + processed_query + b), [2])
      else:
        score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query),
                                    [2])

    alignments = self._probability_fn(score, previous_alignments)
    return alignments
  def testGradientThroughNewStep(self):
    with imperative_mode.ImperativeMode(self._target) as mode:
      x = constant_op.constant(np.random.rand(3))
      y = math_ops.tanh(x)

      with mode.new_step():
        z = constant_op.constant(np.random.rand(3))
        w = math_ops.multiply(y, z)
        dx = gradients_impl.gradients(w, x)
        self.assertAllClose(dx[0].value, z.value * (1.0 - y.value ** 2))
Example #19
 def testIsSequence(self):
   self.assertFalse(nest.is_sequence("1234"))
   self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
   self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
   self.assertTrue(nest.is_sequence([]))
   self.assertFalse(nest.is_sequence(set([1, 2])))
   ones = array_ops.ones([2, 3])
   self.assertFalse(nest.is_sequence(ones))
   self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
   self.assertFalse(nest.is_sequence(np.ones((4, 5))))
def _lstm_cell(prev_c, prev_h, x):
  """Create an LSTM cell."""
  # i: input gate
  # f: forget gate
  # o: output gate
  # c: cell state
  # x: input
  # h: embedding
  bias = _bias([4])
  w = _weight([8, 16])
  ifoc = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w)
  i, f, o, c = array_ops.split(ifoc, 4, axis=1)
  i = math_ops.sigmoid(nn.bias_add(i, bias))
  f = math_ops.sigmoid(nn.bias_add(f, bias))
  o = math_ops.sigmoid(nn.bias_add(o, bias))
  c = math_ops.tanh(nn.bias_add(c, bias))
  next_c = f * prev_c + i * c
  next_h = o * math_ops.tanh(next_c)
  return next_c, next_h
Example #21
 def __call__(self, inputs, state, scope=None):
     """Gated recurrent unit (GRU) with nunits cells."""
     with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
         with vs.variable_scope("Gates"):  # Reset gate and update gate.
             # We start with bias of 1.0 to not reset and not update.
             r, u = array_ops.split(1, 2, linear([inputs, state], 2 * self._num_units, True, 1.0))
             r, u = sigmoid(r), sigmoid(u)
         with vs.variable_scope("Candidate"):
             c = tanh(linear([inputs, r * state], self._num_units, True))
         new_h = u * state + (1 - u) * c
     return new_h, new_h
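# The same GRU gate arithmetic in plain NumPy, as a shape check. The two weight
# matrices are stand-ins for the "Gates" and "Candidate" linear layers, with the
# 1.0 gate bias folded in; all sizes are illustrative assumptions.
import numpy as np

def np_sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

batch, input_size, num_units = 2, 5, 4
inputs = np.random.randn(batch, input_size)
state = np.zeros((batch, num_units))
w_gates = np.random.randn(input_size + num_units, 2 * num_units) * 0.1
w_cand = np.random.randn(input_size + num_units, num_units) * 0.1

r, u = np.split(np_sigmoid(np.concatenate([inputs, state], 1) @ w_gates + 1.0), 2, axis=1)
c = np.tanh(np.concatenate([inputs, r * state], 1) @ w_cand)
new_h = u * state + (1 - u) * c
print(new_h.shape)  # (2, 4)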
Example #22
def decoder_type_1(decoder_hidden, attn_size, initializer=None):

    with vs.variable_scope("decoder_type_1", initializer=initializer):

        k = vs.get_variable("AttnDecW_%d" % 0, [1, 1, attn_size, 1], initializer=initializer)
        hidden_features = nn_ops.conv2d(decoder_hidden, k, [1, 1, 1, 1], "SAME")

        # s will be (?, timesteps)
        s = math_ops.reduce_sum(math_ops.tanh(hidden_features), [2, 3])

    return s
  def __call__(self, query):
    """Score the query based on the keys and values.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.

    Returns:
      score: Tensor of dtype matching `self.values` and shape
        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
    """
    with ops.name_scope(None, "BahndahauAttentionCall", [query]):
      processed_query = self.query_layer(query) if self.query_layer else query
      dtype = processed_query.dtype
      # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
      processed_query = array_ops.expand_dims(processed_query, 1)
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)
      if self._normalize:
        # Scalar used in weight normalization
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=math.sqrt((1. / self._num_units)))
        # Bias added prior to the nonlinearity
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # Scalar bias added to attention scores
        r = variable_scope.get_variable(
            "attention_r", dtype=dtype,
            initializer=self._attention_r_initializer)
        # normed_v = g * v / ||v||
        normed_v = g * v * math_ops.rsqrt(
            math_ops.reduce_sum(math_ops.square(v)))
        score = math_ops.reduce_sum(
            normed_v * math_ops.tanh(self.keys + processed_query + b), [2]) + r
      else:
        score = math_ops.reduce_sum(
            v * math_ops.tanh(self.keys + processed_query), [2])

    return score
Example #24
  def __call__(self, query, tiling_factor=1):
    """Score the query based on the keys and values.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.
      tiling_factor: An integer factor for which to tile the batch dimension.
        Used with BeamSearchDecoder.

    Returns:
      score: Tensor of dtype matching `self.values` and shape
        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
    """
    with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
      processed_query = self.query_layer(query) if self.query_layer else query
      dtype = processed_query.dtype
      # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
      processed_query = array_ops.expand_dims(processed_query, 1)
      keys = _maybe_tile_batch(self.keys, tiling_factor)
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)
      if self._normalize:
        # Scalar used in weight normalization
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=math.sqrt((1. / self._num_units)))
        # Bias added prior to the nonlinearity
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # normed_v = g * v / ||v||
        normed_v = g * v * math_ops.rsqrt(
            math_ops.reduce_sum(math_ops.square(v)))
        score = math_ops.reduce_sum(
            normed_v * math_ops.tanh(keys + processed_query + b), [2])
      else:
        score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query),
                                    [2])

    return score
Example #25
	def __call__(self, inputs, state, episodic_gate, scope=None):
		"""Gated recurrent unit (GRU) with nunits cells."""
		
		with vs.variable_scope("MGRUCell"):  # "GRUCell"
			with vs.variable_scope("Gates"):	# Reset gate and update gate.
				# We start with bias of 1.0 to not reset and not update.
				r = rnn_cell.linear([inputs, state], self._num_units, True, 1.0, scope=scope)
				r = sigmoid(r)
			with vs.variable_scope("Candidate"):
				c = tanh(rnn_cell.linear([inputs, r * state], self._num_units, True))
			
			new_h = tf.mul(episodic_gate, c) + tf.mul((1 - episodic_gate), state)
		return new_h, new_h
Example #26
 def testIsSequence(self):
   self.assertFalse(nest.is_sequence("1234"))
   self.assertFalse(nest.is_sequence([1, 3, [4, 5]]))
   self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
   self.assertFalse(nest.is_sequence([]))
   self.assertFalse(nest.is_sequence(set([1, 2])))
   ones = array_ops.ones([2, 3])
   self.assertFalse(nest.is_sequence(ones))
   self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
   self.assertFalse(nest.is_sequence(np.ones((4, 5))))
   self.assertTrue(nest.is_sequence({"foo": 1, "bar": 2}))
   self.assertFalse(
       nest.is_sequence(sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 def attention(query): 
   """Put attention masks on hidden using hidden_features and query."""
   with vs.variable_scope("Attention"):
     # Attention mask is a softmax of h_in^T*decoder_hidden.
     dec_hid = array_ops.tile(query, [1, attn_length]) # replicate query for element-wise multiplication
     dec_hid = array_ops.reshape(dec_hid, [-1, attn_length, attention_vec_size])
     attn_weight = nn_ops.softmax(math_ops.reduce_sum(attention_states*dec_hid, [2])) # attn weights for every hidden states in encoder
     # Now calculate the attention-weighted vector (context vector) cc.
     cc = math_ops.reduce_sum(array_ops.reshape(attn_weight, [-1, attn_length, 1, 1])*hidden, [1,2])
     # attented hidden state
     with vs.variable_scope("AttnW1"):
       term1 = rnn_cell.linear(query, attn_size, False)
     with vs.variable_scope("AttnW2"):
       term2 = rnn_cell.linear(cc, attn_size, False)
     # environment representation
     if env: # 2D Tensor of shape [batch_size, env_size]
       with vs.variable_scope("Environment"):
         term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
       h_attn = math_ops.tanh(term1 + term2 + term3)
     else:
       h_attn = math_ops.tanh(term1 + term2)
   return h_attn, attn_weight
Example #28
 def __call__(self, inputs, state, scope=None):
   gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn2"):
       gamma_h = tanh(rnn_cell.linear(gru_out, self._num_units, True, 1.0))
     weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
     weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
     weights = weights / (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True))
     context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
     with vs.variable_scope("AttnConcat"):
       out = tf.nn.relu(rnn_cell.linear([context, gru_out], self._num_units, True, 1.0))
     self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
     return (out, out) 
def lstm(x, prev_c, prev_h, w_lstm, forget_bias):
  """LSTM cell.

  Args:
    x: tensor of size [num_children, hidden_size].
    prev_c: tensor of size [num_children, hidden_size].
    prev_h: same shape as prev_c.
    w_lstm: weight matrix of size [2 * hidden_size, 4 * hidden_size] mapping the
      concatenation of x and prev_h to the i, f, o, g gates.
    forget_bias: scalar bias added to the forget gate before the sigmoid.

  Returns:
    next_c: next cell state, of size [num_children, hidden_size].
    next_h: next hidden state, of size [num_children, hidden_size].
  """
  ifog = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w_lstm)
  i, f, o, g = array_ops.split(ifog, 4, axis=1)
  i = math_ops.sigmoid(i)
  f = math_ops.sigmoid(f + forget_bias)
  o = math_ops.sigmoid(o)
  g = math_ops.tanh(g)
  next_c = i * g + f * prev_c
  next_h = o * math_ops.tanh(next_c)
  return next_c, next_h
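# A NumPy sketch of the same LSTM gate arithmetic, handy for checking shapes:
# w_lstm is [2 * hidden_size, 4 * hidden_size] as described in the docstring.
# num_children, hidden_size and forget_bias are illustrative assumptions.
import numpy as np

def np_sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

num_children, hidden_size, forget_bias = 3, 8, 1.0
x = np.random.randn(num_children, hidden_size)
prev_c = np.zeros((num_children, hidden_size))
prev_h = np.zeros((num_children, hidden_size))
w_lstm = np.random.randn(2 * hidden_size, 4 * hidden_size) * 0.1

ifog = np.concatenate([x, prev_h], axis=1) @ w_lstm
i, f, o, g = np.split(ifog, 4, axis=1)
next_c = np_sigmoid(i) * np.tanh(g) + np_sigmoid(f + forget_bias) * prev_c
next_h = np_sigmoid(o) * np.tanh(next_c)
print(next_c.shape, next_h.shape)  # (3, 8) (3, 8)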
Example #30
 def attention(query):
     """Put attention masks on hidden using hidden_features and query."""
     ds = []  # Results of attention reads will be stored here.
     for a in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % a):
             y = linear(query, attention_vec_size, True)
             y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
             a = nn_ops.softmax(s)
             # Now calculate the attention-weighted vector d.
             d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
             ds.append(array_ops.reshape(d, [-1, attn_size]))
     return ds
Example #31
    def call(self, inputs, mask=None, training=None, initial_state=None):
        if len(inputs) != 2:
            raise ValueError(
                'The dimension of the inputs vector should be 2: `(input_shape, reward)`'
            )
        object_input = inputs[0]  # (batch_dim, timesteps n_digits)
        reward_input = inputs[1]  # (1,)

        n_digits = tensor_shape.dimension_value(object_input[-1])
        batch_dim = tensor_shape.dimension_value(object_input[0])
        self.units = tensor_shape.dimension_value(object_input[1])

        # Unpacking state matrices
        object_queries = tf.tile(
            tf.reshape(self.O_state,
                       (1, ) + self.O_state.shape), (batch_dim, ) +
            self.O_state.shape)  # (batch_dim, timesteps n_digits)
        object_keys = tf.tile(
            tf.reshape(self.object_keys,
                       (1, ) + self.object_keys.shape), (batch_dim, ) +
            self.object_keys.shape)  # (batch_dim, Tk, n_digits)

        # (self.units, n_actions)
        action_queries = tf.tile(
            tf.reshape(self.A_state, (1, ) + self.A_state.shape),
            (batch_dim, ) + self.A_state.shape)

        action_keys = tf.tile(
            tf.reshape(self.action_keys, (1, ) + self.action_keys.shape),
            (batch_dim, ) + self.action_keys.shape)  # (Tk, n_actions)
        # action_values = self.O_state[:, 2*int(
        #     self.A_state.shape[1] / 3):3*int(self.A_state.shape[1] / 3), :]

        # Context generator
        p_object = self.p_gate(
            [object_queries, object_keys]
        )  # (batch_dim, timesteps n_digits), (batch_dim, Tk, n_digits) -> (batch_dim, timesteps n_digits)

        shifted_object_sequence = self._transformer_shift_objects(
            object_input, object_queries)  # (batch_dim, timesteps n_digits)

        # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_digits) -> (batch_dim, timesteps n_digits)
        object_query_corrected = math_ops.multiply(p_object,
                                                   shifted_object_sequence)

        # (batch_dim, timesteps n_digits), (batch_dim, Tk, n_digits), (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
        action_by_object = self.an_gate(
            [object_query_corrected, object_keys, action_keys])

        # Sympathetic circuit
        # (batch_dim, timesteps)
        steps = tf.tile(tf.constant(list(range(self.units)), dtype=float),
                        tf.constant([1, batch_dim]))

        # (batch_dim, timesteps), (batch_dim, timesteps) -> (batch_dim, timesteps)
        old_reward = self.internal_reward
        self.internal_reward.assign(self.internal_reward + self.w_boost * reward_input * math_ops.exp(steps) - \
            self.w_step * math_ops.exp(steps) - \
            self.w_amount * math_ops.exp(reward_input))

        # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_actions) -> (batch_dim, timesteps, n_digits, n_actions)
        corrected_strategy = self.ao_gate(action_queries, action_keys)
        reward_matrix = K.softmax(
            tf.einsum('ijk,ijn->ijkn', object_queries, corrected_strategy) /
            math_ops.sqrt(0.5 * self.n_actions * n_digits))

        # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_actions) -> (batch_dim, timesteps n_digits, n_actions)
        potential_reward = K.softmax(
            math_ops.tanh(
                tf.einsum('ijk,ijn->ijkn', object_query_corrected,
                          corrected_strategy)))

        # (batch_dim, timesteps n_digits, n_actions) * (batch_dim, timesteps) -> (batch_dim, timesteps n_digits, n_actions)
        delta_stimuli = potential_reward * self.internal_reward
        # tf.einsum('ijkn,ij->jkn', potential_reward, self.internal_reward)

        # ws(n_digits, n_actions) * (batch_dim, timesteps n_digits, n_actions) -> (batch_dim, timesteps n_digits, n_actions)
        new_state = self.w_stimuli * delta_stimuli

        # (batch_dim, timesteps n_digits, n_actions), (timesteps, n_digits, n_actions) -> (batch_dim, self.units)
        reward_intersection = tf.einsum('ijkn,ijkn->ij', reward_matrix,
                                        new_state)
        # w(1,) * (batch_dim, timesteps) + (batch_dim, timesteps) -> (batch_dim, timesteps)
        reward_forecast = self.w_rs * reward_intersection + self.internal_reward

        # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_digits), (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
        rewarded_actions = self.SR_gate(
            [action_by_object, new_state, reward_matrix])

        # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_actions), (batch_dim, Tk, n_digits) -> (batch_dim, timesteps n_digits)
        object_forecast = self.f_gate(
            [rewarded_actions, action_keys, object_keys])
        # (batch_dim, timesteps n_digits) -> (batch_dim, timesteps n_digits)
        object_forecast_seq = self._transformer_shift_objects(
            object_forecast, shifted_object_sequence)
        # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_digits), (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
        simulated_action = self.d_gate(
            [object_forecast_seq, object_keys, action_keys])

        # Repeater
        # (batch_dim, timesteps n_actions), ((batch_dim, self.units) -  (batch_dim, self.units)) ->
        # w(1,), (batch_dim, self.units) -> (batch_dim, timesteps n_actions)
        reward_ratio_action = self.W_R * tf.einsum(
            'ijk,ij->ijk', action_by_object,
            K.softmax(K.abs(self.internal_reward - self.expected_reward)))

        #  (batch_dim, timesteps n_actions), (batch_dim, timesteps n_actions) -> (batch_dim, timesteps n_actions)
        selected_action = reward_ratio_action + \
            K.softmax(K.abs(self.internal_reward - self.expected_reward)) * \
                K.softmax(K.dot(self.W_S, simulated_action)+self.b_S)

        # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)

        new_strategy = self._transformer_shift_actions(
            selected_action,
            corrected_strategy)  # (batch_dim, timesteps n_actions)

        # Packing and updating

        new_obj = object_query_corrected[:, -1, ...]
        new_act = new_strategy[:, -1, ...]
        O_c1 = K.dot(object_queries[self.conv_units:], tf.transpose(new_obj))
        O_c2 = K.dot(object_queries[:self.units], tf.transpose(new_obj))
        E_c1 = (1 / self.units) * tf.einsum(
            'ik->',
            (object_queries[self.conv_units:] - O_c1)**2)  # (timesteps,)
        E_c2 = (1 / self.units) * tf.einsum(
            'ik->',
            (object_queries[self.conv_units:] - O_c2)**2)  # (timesteps,)
        P_short = tf.math.softmax(K.dot(E_c1, self.W_Pshort) + self.b_Pshort)
        P_long = tf.math.softmax(K.dot(E_c2, self.W_Plong) + self.b_Plong)

        if (P_short < 0.51) & (P_long < 0.51):
            object_keys, action_keys = self._min_ABdict_replace_op(
                object_keys[-1, ...], action_keys[-1, ...], new_obj, new_act,
                reward_forecast[-1, ...] - self.internal_reward)
        else:
            object_keys, action_keys = self._mean_ABdict_mix_op(
                object_keys[-1, ...], action_keys[-1, ...], new_obj, new_act,
                reward_forecast[-1, ...] - self.internal_reward)

        object_keys, action_keys = self._mean_ABdict_mix_op(
            object_keys[-2, ...], action_keys[-2, ...], new_obj, new_act,
            old_reward - self.internal_reward)

        self.expected_reward.assign(reward_forecast[-1, ...])
        self.S_state.assign(
            tf.cumsum(self.S_state, axis=0) + tf.reduce_sum(new_state, axis=0))
        self.O_state.assign(object_query_corrected)
        self.A_state.assign(corrected_strategy)
        self.object_keys.assign(object_keys)
        self.action_keys.assign(action_keys)

        # self._update_relevance_matrix(
        #     self.internal_reward, object_query_corrected[:, -2, ...], new_strategy[:, -2, ...])  # t-1 case
        # self._update_relevance_matrix(
        #     self.expected_reward, new_obj, new_act)  # t case

        return new_strategy
Example #32
        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)

                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                def masked_attention(e, padding_mask):
                    "Take e softmax of e then apply enc_padding_mask and re-normalize" ""
                    e = e * padding_mask + (
                        (1.0 - padding_mask) * tf.float32.min)
                    attn_dist = nn_ops.softmax(
                        e
                    )  # take softmax. shape (batch_size, attn_length). Better way of computing attention.
                    return attn_dist
                    #attn_dist *= padding_mask # apply mask
                    #masked_sums = tf.reduce_sum(attn_dist, axis=1) # shape (batch_size)
                    #return attn_dist / tf.reshape(masked_sums, [-1, 1]) # re-normalize

                if use_query:
                    with variable_scope.variable_scope("query"):
                        decoder_q_features = linear(
                            decoder_state, query_attn_size, True,
                            name='query')  # W_s_q s_t +b
                        decoder_q_features = tf.expand_dims(
                            tf.expand_dims(decoder_q_features, 1), 1
                        )  # reshape to (batch_size, 1, 1, q_attention_vec_size)
                        q = math_ops.reduce_sum(
                            v_q *
                            math_ops.tanh(query_features + decoder_q_features),
                            [
                                2, 3
                            ])  # calculate q v^t tanh(W_q q_i + W_s_q s_t + b)
                        q_dist = masked_attention(q, query_padding_mask)
                        query_vector = math_ops.reduce_sum(
                            array_ops.reshape(q_dist, [batch_size, -1, 1, 1]) *
                            query_states,
                            [1, 2])  # shape (batch_size, q_attn_size). q*
                        query_vector = array_ops.reshape(
                            query_vector, [-1, query_attn_size])  #This is q*
                    with variable_scope.variable_scope("query_z"):
                        query_z = linear(query_vector,
                                         attention_vec_size,
                                         False,
                                         name='query_z')  #This is qz
                        query_z = tf.expand_dims(tf.expand_dims(query_z, 1), 1)

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)
                    if use_query:
                        e = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features +
                                          query_z + coverage_features),
                            [2, 3])  # shape (batch_size,attn_length)

                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                        e = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                            [2, 3])  # shape (batch_size,attn_length)

                    # Calculate attention distribution
                    attn_dist = masked_attention(e, enc_padding_mask)
                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    if use_query:
                        e = math_ops.reduce_sum(
                            v * math_ops.tanh(encoder_features +
                                              decoder_features + query_z),
                            [2, 3])  # calculate e
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e

                    # Calculate attention distribution
                    attn_dist = masked_attention(e, enc_padding_mask)

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage
Example #33
 def loop_fn(i):
     return math_ops.tanh(a * array_ops.gather(x, i) +
                          array_ops.gather(y, i))
def _attn_add_fun(v, keys, query):
    return math_ops.reduce_sum(v * math_ops.tanh(keys + query), [2])
Example #35
#arrays to save v and h context
v_con_array = []
h_con_array = []
whole_con_array = []
cosine_penalty_array = []

#array to save y
y_array = []

for i in range(side_len, whole_len - side_len):
    current_output = outputs_bidirection[i]

    # multiply output by w_a and add b_a, get inner_sum of size [batch_size, hidden_size]
    inner_sum = tf.add(tf.matmul(current_output, w_a_1), b_a_1)
    con_i = tanh(inner_sum)

    # calculate the vertical (feature) and horizontal (distal) context vectors
    # shape [batch_size, seq_len]
    con_v = tf.nn.softmax(tf.add(tf.matmul(con_i, w_a_v), b_a_v))
    # shape [batch_size, num_feat]
    con_h = tf.add(tf.matmul(con_i, w_a_h), b_a_h)

    v_con_array.append(tf.expand_dims(con_v, 1))
    h_con_array.append(tf.expand_dims(con_h, 1))

    # tensor product each batch to generate the whole context con_vh
    tiled_con_v = tf.tile(tf.expand_dims(con_v, 2), tf.stack([1, 1, num_feat]))
    tiled_con_h = tf.tile(tf.expand_dims(con_h, 1), tf.stack([1, seq_len, 1]))
    # shape [batch_size, seq_len, num_feat]
    con_vh = tf.multiply(tiled_con_v, tiled_con_h)
Example #36
  def call(self, inputs, state, scope=None):
    """Run one step of Associative LSTM.

    Args:
      inputs: input Tensor, 2D, batch x cell_size.
      state: a tuple of state Tensors, both `2-D`, with column sizes `c_state`
          and `m_state`.
      scope: VariableScope for the created subgraph; defaults to
          "AssociativeLSTMCell".

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           cell_size otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._cell_size if self._num_proj is None else self._num_proj

    (c_prev, m_prev) = state

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]

    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    # bs x (input_size + num_proj)
    cell_inputs = tf.concat([inputs, m_prev], 1, name = 'concat2')

    # bs x ((2.5 + _input_keys + _output_keys) * cell_size)
    lstm_matrix = tf.matmul(cell_inputs, self._kernel)
    lstm_matrix = tf.nn.bias_add(lstm_matrix, self._bias)

    # i = input_gate, f = forget_gate, o = output_gate
    # bs x (cell_size // 2)
    i, f, o = tf.split(value = lstm_matrix[:, :int(1.5 * self._cell_size)],
                       axis = 1, num_or_size_splits = 3)

    # u
    # bs x cell_size
    u = tf.split(lstm_matrix[:, int(1.5 * self._cell_size):int(2.5 * self._cell_size)],
                 axis = 1, num_or_size_splits = 1)[0]

    # ri
    # _input_keys x bs x cell_size
    input_keys = tf.split(lstm_matrix[:,
                          int(2.5 * self._cell_size):
                          int((2.5 + self._input_keys) * self._cell_size)],
                          axis = 1, num_or_size_splits = 1)[0]
    input_keys = tf.reshape(input_keys,
        [self._input_keys, -1, self._cell_size])

    # ro
    # _output_keys x bs x cell_size
    output_keys = tf.split(lstm_matrix[:,
                              int((2.5 + self._input_keys) * self._cell_size):],
                              axis = 1, num_or_size_splits = 1)[0]
    output_keys = tf.reshape(output_keys,
        [self._output_keys, -1, self._cell_size])

    # applying the sigmoid activation function
    # bs x (cell_size // 2)
    i = sigmoid(i)
    f = sigmoid(f)
    o = sigmoid(o)

    # appending gates
    # bs x cell_size
    i = tf.concat([i, i], 1, name = 'concat3')
    f = tf.concat([f, f], 1, name = 'concat4')
    o = tf.concat([o, o], 1, name = 'concat5')

    # applying tanh activation function
    # bs x cell_size
    u = tanh(u)
    # _input_keys x bs x cell_size
    input_keys = tanh(input_keys)
    # _output_keys x bs x cell_size
    output_keys = tanh(output_keys)

    # applying permutations
    #_input_keys x num_copies x batch_size x cell_size
    input_keys = self._permute(input_keys, scope = 'input_keys')
    #_output_keys x num_copies x batch_size x cell_size
    output_keys = self._permute(output_keys, scope = 'output_keys')

    # memory copies update
    # num_copies x bs x cell_size
    memory_update = self._complex_multiplication(
        input_keys, tf.expand_dims(tf.expand_dims(u * i, 0), 0))
    memory_update = tf.reduce_mean(memory_update, 0)

    # memory copies forget
    # num_copies x bs x cell_size
    memory_forget = tf.expand_dims(f, 0) * c_prev

    # updating memory
    # num_copies x bs x cell_size
    c = memory_forget + memory_update

    # reading refers to the reading gate
    # _output_keys x bs x cell_size
    reading_gate = tanh(tf.reduce_mean(
        self._complex_multiplication(output_keys, tf.expand_dims(c, 0)), 1))

    # bs x num_proj
    m = tf.expand_dims(o, 0) * reading_gate
    m = tf.transpose(m, [1,0,2])
    m = tf.reshape(m, [-1, self._num_proj])

    new_state = rnn_cell.LSTMStateTuple(c, m)

    return m, new_state
Example #37
        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_dist = nn_ops.softmax(
                        e)  # take softmax. shape (batch_size, attn_length)
                    attn_dist *= enc_padding_mask  # apply mask
                    masked_sums = tf.reduce_sum(attn_dist,
                                                axis=1)  # shape (batch_size)
                    return attn_dist / tf.reshape(masked_sums,
                                                  [-1, 1])  # re-normalize

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)

                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,attn_length)

                    # Calculate attention distribution
                    attn_dist = masked_attention(e)

                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e

                    # Calculate attention distribution
                    attn_dist = masked_attention(e)

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(
                            tf.expand_dims(attn_dist, 2), 2
                        )  # initialize coverage => becomes shape (batch_size, attn_length, 1, 1)

                # Calculate the context vector from attn_dist and encoder_states

                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage
Example #38
    def attention(self, decoder_state, encoder_states, attention_vec_size,
                  enc_padding_mask, hps):
        """Calculate the context vector and attention distribution from the decoder state.

        Args:
          decoder_state: state of the decoder

        Returns:
          context_vector: weighted sum of encoder_states
          attn_dist: attention distribution
        """
        with tf.variable_scope('attention'):
            w_dec = tf.get_variable('w_dec',
                                    [attention_vec_size, hps.hidden_dim],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
            v_dec = tf.get_variable('v_dec', [attention_vec_size],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
            # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
            decoder_features = tf.nn.xw_plus_b(
                decoder_state, w_dec,
                v_dec)  # shape (batch_size, attention_vec_size)
            decoder_features = tf.expand_dims(
                tf.expand_dims(decoder_features, 1),
                1)  # reshape to (batch_size, 1, 1, attention_vec_size)

            def masked_attention(e):
                """Take softmax of e then apply enc_padding_mask and re-normalize"""
                attn_dist = nn_ops.softmax(
                    e)  # take softmax. shape (batch_size, attn_length)
                attn_dist *= enc_padding_mask  # apply mask
                masked_sums = tf.reduce_sum(attn_dist,
                                            axis=1)  # shape (batch_size)
                return attn_dist / tf.reshape(masked_sums,
                                              [-1, 1])  # re-normalize

            encoder_states = tf.expand_dims(
                encoder_states,
                axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)
            W_h = tf.get_variable("W_h",
                                  [1, 1, hps.hidden_dim, attention_vec_size])
            encoder_features = nn_ops.conv2d(
                encoder_states, W_h, [1, 1, 1, 1],
                "SAME")  # shape (batch_size,attn_length,1,attention_vec_size)

            # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
            v = tf.get_variable("v_h", [attention_vec_size])
            e = math_ops.reduce_sum(
                v * math_ops.tanh(encoder_features + decoder_features),
                [2, 3])  # calculate e

            # Calculate attention distribution
            attn_dist = masked_attention(e)

            # Calculate the context vector from attn_dist and encoder_states
            context_vector = math_ops.reduce_sum(
                array_ops.reshape(attn_dist, [hps.batch_size, -1, 1, 1]) *
                encoder_states, [1, 2])  # shape (batch_size, attn_size).
            context_vector = array_ops.reshape(context_vector,
                                               [-1, hps.hidden_dim])

        return context_vector, attn_dist
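# An illustrative NumPy check (shapes assumed) that the 1x1 convolution used
# above to apply W_h to every encoder state is the same as a per-position
# matrix multiplication.
import numpy as np

batch, attn_len, hidden, vec = 2, 4, 6, 3
states = np.random.randn(batch, attn_len, 1, hidden)
W_h = np.random.randn(1, 1, hidden, vec)

conv_like = np.einsum('blch,xyhv->blcv', states, W_h)        # NumPy stand-in for conv2d with a 1x1 kernel
matmul_like = states.reshape(-1, hidden) @ W_h.reshape(hidden, vec)
print(np.allclose(conv_like.reshape(-1, vec), matmul_like))  # True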
Example #39
def dynamic_distraction_m2_decoder(decoder_inputs,
                      initial_state,
                      distract_initial_state,
                      attention_states,
                      attention_states_query,
                      cell1,cell2,
                      distraction_cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    attention_states_query: 3D Tensor [batch_size x attn_length_query x attn_size_query]
      holding the query-side attention states.
    cell1, cell2: rnn_cell.RNNCell instances defining the cell functions and sizes.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred from
      the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if attention_states.get_shape()[2].value is None:
    raise ValueError("Shape[2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell1.output_size

  with variable_scope.variable_scope(
      scope or "dynamic_distraction_m2_decoder", dtype=dtype) as scope:
    dtype = scope.dtype

    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length_state = attention_states.get_shape()[1].value
    attn_length_query = attention_states_query.get_shape()[1].value

    dim_1 = initial_state.get_shape()[1].value
    dim_2 = cell1.output_size
    project_initial_state_W = variable_scope.get_variable("Initial_State_W", [dim_1, dim_2])
    project_initial_state_B = variable_scope.get_variable("Initial_State_Bias", [dim_2])

    print ("Preksha " + scope.name)
    if attn_length_state is None:
      attn_length_state = array_ops.shape(attention_states)[1]

    if attn_length_query is None:
      attn_length_query = array_ops.shape(attention_states_query)[1]

    attn_size_state = attention_states.get_shape()[2].value
    attn_size_query = attention_states_query.get_shape()[2].value
    b_a = variable_scope.get_variable("b_a", [1, attn_size_state])

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden_states = array_ops.reshape(
        attention_states, [-1, attn_length_state, 1, attn_size_state])

    hidden_states_query = array_ops.reshape(
        attention_states_query, [-1, attn_length_query, 1, attn_size_query])

    hidden_features_states = []
    hidden_features_query  = []

    v_state = []
    attention_vec_size_state  = attn_size_state  # Size of query vectors for attention.
    
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_State_%d" % a,
                                      [1, 1, attn_size_state, attention_vec_size_state])

      hidden_features_states.append(nn_ops.conv2d(hidden_states, k, [1, 1, 1, 1], "SAME"))
      
      v_state.append(
          variable_scope.get_variable("AttnV_State_%d" % a, [attention_vec_size_state]))


    v_query = []
    attention_vec_size_query  = attn_size_query  # Size of query vectors for attention.

    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_Query_%d" %a, 
                                      [1, 1, attn_size_query, attention_vec_size_query])

      hidden_features_query.append(nn_ops.conv2d(hidden_states_query, k, [1, 1, 1, 1], "SAME"))
      
      v_query.append(
          variable_scope.get_variable("AttnV_Query_%d" % a, [attention_vec_size_query]))


    state_1 = math_ops.matmul(initial_state, project_initial_state_W) + project_initial_state_B
    state_2 = state_1


    prev_states = []

    for i in range(attn_length_state):
      prev_states.append(array_ops.zeros([batch_size]))

    def attention(query, prev_states, b_a):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size_state, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_state])
          # Attention mask is a softmax of v^T * tanh(...).


          temp = hidden_features_states[a] + y
          new_states = array_ops.squeeze(temp, [2])

          new_states_list = array_ops.unpack(new_states, axis=1)
          #print(temp.get_shape(), new_states.get_shape(), len(new_states_list), new_states_list[0].get_shape())
          distract_states_list = []
          for i, _ in enumerate(new_states_list):
              temp = array_ops.reshape(prev_states[i], [-1, 1])
              t1 = math_ops.matmul(temp, b_a)
              print ("b_a size and prev_states size", temp.get_shape(), prev_states[i].get_shape(), b_a.get_shape(), t1.get_shape())
              distract_states_list.append(new_states_list[i] - t1)

          distract_states = array_ops.pack(distract_states_list, axis=1)

          print (len(distract_states_list), distract_states.get_shape())
          s = math_ops.reduce_sum(
              v_state[a] * math_ops.tanh(distract_states), [2])

          print(s.get_shape())
          a = nn_ops.softmax(s)
          prev_states = array_ops.pack(prev_states,  axis=1)
          prev_states = prev_states + a
          
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length_state, 1, 1]) * hidden_states,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size_state]))
      return ds, array_ops.unpack(prev_states, axis=1)

    def attention_query(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_Query_%d" % a):
          y = linear(query, attention_vec_size_query, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_query])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v_query[a] * math_ops.tanh(hidden_features_query[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length_query, 1, 1]) * hidden_states_query,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size_query]))


      return ds[0]


    outputs = []
    ctx_vec = []
    prev = None

    batch_attn_size_state = array_ops.pack([batch_size, attn_size_state])
    batch_attn_size_query = array_ops.pack([batch_size, attn_size_query])


    attns_state = [array_ops.zeros(batch_attn_size_state, dtype=dtype)
             for _ in xrange(num_heads)]

    attns_query = [array_ops.zeros(batch_attn_size_query, dtype=dtype)
             for _ in xrange(num_heads)]

    for a in attns_state:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size_state])


    for a in attns_query:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size_query])


    acc_ctx = array_ops.zeros([batch_size, attn_size_state])

    if initial_state_attention:
      attns_query = attention_query(initial_state)
      list_of_queries = [initial_state, attns_query]
      attns_state, prev_states = attention(list_of_queries, prev_states, b_a)

    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i)
      # Merge input and previous attentions into one vector of the right size.
      input_size = inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)
      

      with variable_scope.variable_scope("Cell2"):
        input_2 = linear([state_1] + [inp], input_size, True)
        output_2, state_2 = cell2(input_2, state_2)
      

      # Run the RNN.
      #print (x.get_shape())
      
      # Run the attention mechanism.

      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns_query = attention_query(output_2)
          list_of_queries = [output_2, attns_query]
          attns_state, prev_states = attention(list_of_queries, prev_states, b_a)
      else:
        attns_query = attention_query(output_2)
        list_of_queries = [output_2, attns_query]
        attns_state, prev_states = attention(list_of_queries, prev_states, b_a)


      with variable_scope.variable_scope("AttnOutputProjection"):

        W = variable_scope.get_variable("W", [1,attn_size_state])
        U = variable_scope.get_variable("U", [1,attn_size_state])

        new_ctx = math_ops.mul(W, attns_state[0]) - math_ops.mul(U, acc_ctx)
        new_ctx = math_ops.tanh(new_ctx)

        acc_ctx = acc_ctx + new_ctx

        with variable_scope.variable_scope("Cell1"):
          input_1 = linear([output_2] + [new_ctx], input_size, True)
          output_1, state_1 = cell1(input_1, state_1)

        output = math_ops.tanh(linear([inp] + [output_1] + [new_ctx], output_size, True))
        #x_shape = variable_scope.get_variable(name = 'x_shape',shape=cell_output.get_shape())
        if loop_function is not None:
          prev = output
        outputs.append(output)
        ctx_vec.append(new_ctx)
  return outputs, state_1, ctx_vec
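# A small NumPy sketch of the "distraction" idea in the attention() above:
# new scores are penalised by how much attention each position has already
# received, and the running totals are then updated. Here b_a is a scalar
# stand-in for the learned [1, attn_size] parameter, and all shapes are
# illustrative assumptions.
import numpy as np

def softmax(x):
    x = x - x.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)

batch, attn_len = 2, 4
prev_attention = np.zeros((batch, attn_len))   # accumulated past attention
b_a = 0.5
for step in range(3):
    scores = np.random.randn(batch, attn_len)
    distracted = scores - b_a * prev_attention  # down-weight already-attended positions
    attn = softmax(distracted)
    prev_attention += attn
print(prev_attention.round(3))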
Example #40
def RNN(x, weights, biases):
    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, n_input])
    x = tf.split(0, n_steps, x)              # n_steps(list) * batch * 200
    gru_fw_cell = rnn_cell.GRUCell(n_hidden)
    gru_fw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_fw_cell, output_keep_prob=0.7)
    gru_bw_cell = rnn_cell.GRUCell(n_hidden)
    gru_bw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_bw_cell, output_keep_prob=0.7)

    outputs, _, _ = rnn.bidirectional_rnn(gru_fw_cell, gru_bw_cell, x,dtype=tf.float32)
    batch_s = 100

    outputs_all = tf.concat(0,outputs)    # (N*batch) * 2*n_hidden

    # dropout
    outputs_all = tf.nn.dropout(outputs_all, keep_prob=0.5)

    input_all = tf.concat(0,x)    # (n_steps*batch) * n_input

    # dropout
    input_all = tf.nn.dropout(input_all, keep_prob=0.5)


    #**********************************************************************************************
    M = tanh(tf.matmul(outputs_all,W_h))    # (N*batch) * 2*hidden

    # dropout
    M = tf.nn.dropout(M, keep_prob=0.5)

    a = tf.matmul(M,w)
    a = tf.reshape(a, [n_steps,-1])    # N*batch
    a = tf.transpose(a, [1,0])    # batch*N
    a = tf.nn.softmax(a)
    a = tf.reshape(a, [batch_s,1,n_steps])    # batch*1*N

    outputs_all = tf.reshape(outputs_all, [n_steps,-1, 2*n_hidden])    # N*batch*d
    outputs_all = tf.transpose(outputs_all, [1,0,2])    # batch*N*d

    a = tf.split(0, batch_s, a)
    outputs_all = tf.split(0, batch_s, outputs_all)

    r = []
    for i in range(batch_s):
        a_temp = a[i][0:1,:,:]
        o_temp = outputs_all[i][0:1,:,:]
        att = tf.reshape(a_temp,[1, n_steps])
        out = tf.reshape(o_temp,[n_steps,2*n_hidden])

        # dropout
        att = tf.nn.dropout(att, keep_prob=0.5)
        out = tf.nn.dropout(out, keep_prob=0.5)

        r.append(tf.matmul(att,out))
    r = tf.concat(0,r)    # batch*d
    #**********************************************************************************************
    M_input = tanh(tf.matmul(input_all,W_h_input))    # (N*batch) * 2*hidden

    # dropout
    M_input = tf.nn.dropout(M_input, keep_prob=0.5)

    a_input = tf.matmul(M_input,w_input)
    #a_input = tf.matmul(input_all,w_input)

    a_input = tf.reshape(a_input, [n_steps,-1])    # N*batch
    a_input = tf.transpose(a_input, [1,0])    # batch*N
    a_input = tf.nn.softmax(a_input)
    a_input = tf.reshape(a_input, [batch_s,1,n_steps])    # batch*1*N

    '''
    a_input = tf.nn.softmax(tf.matmul(M_input,w_input))    # (N*batch) * 1
    a_input = tf.reshape(a_input, [n_steps,-1, 1])    # N*batch*1
    a_input = tf.transpose(a_input, [1,2,0])    # batch*1*N
    '''

    input_all = tf.reshape(input_all, [n_steps,-1, n_input])    # N*batch*n_input
    input_all = tf.transpose(input_all, [1,0,2])    # batch*N*n_input

    a_input = tf.split(0, batch_s, a_input)
    input_all = tf.split(0, batch_s, input_all)

    r_input = []
    for i in range(batch_s):
        a_input_temp = a_input[i][0:1,:,:]
        o_input_temp = input_all[i][0:1,:,:]
        att_input = tf.reshape(a_input_temp,[1, n_steps])
        input_input = tf.reshape(o_input_temp,[n_steps,n_input])

        # dropout
        att_input = tf.nn.dropout(att_input, keep_prob=0.5)
        input_input = tf.nn.dropout(input_input, keep_prob=0.5)

        r_input.append(tf.matmul(att_input,input_input))
    r_input = tf.concat(0,r_input)    # batch*n_input

    '''
    r_input_hidden = tanh(tf.matmul(r_input,W_x_input))
    #r_input_hidden = tf.matmul(r_input,W_x_input)
    _h = tanh(W_p*r + W_p_input*r_input_hidden + W_x*outputs[-1])
    predict = tf.matmul(_h, weights['out']) + biases['out']
    '''

    _h_temp_1 = tanh(W_p*r + W_x*outputs[-1])
    _h_temp_2 = tanh(W_p_input*r_input)
    _h_concat = tf.concat(1,[_h_temp_1,_h_temp_2])

    # dropout
    _h_concat = tf.nn.dropout(_h_concat, keep_prob=0.25)

    predict = tf.matmul(_h_concat, weights_concat['out_concat']) + biases['out']
    return predict,outputs
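# A NumPy sketch of what the per-example loop above computes: multiplying the
# (1 x N) attention row by the (N x d) outputs of each batch element is just a
# weighted sum r[b] = sum_t a[b, t] * h[b, t]; sizes here are assumptions.
import numpy as np

batch, n_steps_, d = 3, 4, 5
a = np.random.rand(batch, n_steps_)
a = a / a.sum(axis=1, keepdims=True)           # rows behave like a softmax
h = np.random.randn(batch, n_steps_, d)

loop_r = np.stack([a[b:b + 1] @ h[b] for b in range(batch)]).squeeze(1)
einsum_r = np.einsum('bt,btd->bd', a, h)
print(np.allclose(loop_r, einsum_r))           # True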
Example #41
    def __write_memory(self, his_mem, enc_states, global_trace, step):
        with variable_scope.variable_scope("write_memory"):
            mem_slots = his_mem.get_shape()[1].value
            mem_size = his_mem.get_shape()[2].value

            for i, state in enumerate(enc_states):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()

                # Concatenate history memory with the null slot
                tmp_mem = array_ops.concat([his_mem, tf.identity(self.null_mem)], axis=1) #[batch_size,his_mem_slots+1,his_mem_size]

                hidden = array_ops.reshape(tmp_mem, [-1, mem_slots+1, 1, mem_size]) 
                k = variable_scope.get_variable("AttnW", [1, 1, mem_size, mem_size])
                mem_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
                v = variable_scope.get_variable("AttnV", [mem_size])

                mstate = state
                y = linear([flatten_query(mstate), global_trace], mem_size, True, scope = "query_trans")
                y = array_ops.reshape(y, [-1, 1, 1, mem_size])
                s = math_ops.reduce_sum(v * math_ops.tanh(mem_features + y), [2, 3])  #[batch_size,mem_slots+1]
                random_mask = 1.0 - tf.sign(math_ops.reduce_sum(tf.abs(tmp_mem), axis=2)) #[batch_size,his_mem_slots+1]
                    # tf.sign(x) returns 0 if x == 0, -1 if x < 0, and 1 if x > 0
                # The random_mask shows if a slot is empty, 1 empty, 0 not empty.
                # The null mask is 1 if there is at least 1 empty slot.
                null_mask = random_mask[:, 0:self.hps.his_mem_slots]
                null_mask = math_ops.reduce_sum(null_mask, axis=1) #[batch_size]
                null_mask = tf.sign(null_mask)

                bias = self.random_bias * random_mask #random_bias tensor [batch_size,his_mem_slots+1]
                max_bias = tf.reduce_max(bias, axis=1) #[batch_size]
                max_bias = tf.expand_dims(max_bias, axis=1) #[batch_size,1]
                bias = tf.divide(bias, max_bias + 1e-12)
                
                max_s = tf.expand_dims(math_ops.reduce_max(s, axis=1), axis=1) #[batch_size,1]

                thred1 = tf.ones([self.b_size, self.hps.his_mem_slots+1], dtype=tf.float32)
                thred2 = tf.zeros([self.b_size, self.hps.his_mem_slots+1], dtype=tf.float32)
                thred = tf.where(tf.equal(null_mask, 1), thred1, thred2)

                bias1 = bias * tf.abs(max_s) * thred
                s1 = s + bias1  # the bias steers the write toward an empty slot when one exists
                a = nn_ops.softmax(s1) #[batch_size,his_mem_slots+1]

                max_val = tf.reduce_max(a, axis=1) #[batch_size]
                max_val = tf.expand_dims(max_val, axis=1) #[batch_size,1]

                if self.mode == 'train':
                    float_mask0 = tf.tanh(self.gama * (a - max_val)) + 1.0
                elif self.mode == 'decode':
                    float_mask0 = tf.sign(a - max_val) + 1.0

                mask = self.write_masks[step][i]
                float_mask = tf.multiply(mask, float_mask0)
                float_mask = tf.expand_dims(float_mask, axis=2) #[batch_size,his_mem_slots+1,1]
                #print (np.shape(float_mask))

                w_states = tf.tile(mstate, [1, mem_slots])  # [batch_size, 2*hidden_size] becomes [batch_size, mem_slots*2*hidden_size]
                w_states = array_ops.reshape(w_states, [-1, mem_slots, mem_size])
                
                final_mask = float_mask[:, 0:self.hps.his_mem_slots, :] #[batch_size,his_mem_slots,1]
                #print (final_mask.get_shape())
                his_mem = (1.0 - final_mask) * his_mem + final_mask * w_states

            return his_mem
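# A compact NumPy sketch (single example, hard selection, values assumed) of
# the slot-selection trick above: empty slots receive a score bonus scaled by
# |max score|, so the write prefers an empty slot when one exists, and the
# chosen slot is overwritten with the new state.
import numpy as np

def write(memory, state, scores, rng):
    empty = (np.abs(memory).sum(axis=1) == 0).astype(float)  # 1 for empty slots
    bias = rng.random(memory.shape[0]) * empty
    bias = bias / (bias.max() + 1e-12)
    scores = scores + bias * np.abs(scores.max()) * float(empty.sum() > 0)
    chosen = np.zeros(memory.shape[0])
    chosen[np.argmax(scores)] = 1.0                          # hard version of the tanh/sign mask
    return (1.0 - chosen[:, None]) * memory + chosen[:, None] * state

rng = np.random.default_rng(0)
memory = np.zeros((4, 3)); memory[0] = 1.0                   # slot 0 is already occupied
memory = write(memory, np.full(3, 2.0), rng.standard_normal(4), rng)
print(memory)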
Example #42
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1409.0473 (see below for
    details).

    Args:
      encoder_mask: the mask of encoder inputs [batch_size x attn_length].
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      beam_size: the beam size of beam search
      output_size: Size of the output vectors; if None, we use cell.output_size.
      loop_function: When decoding, this function will be applied to i-th output
        in order to generate i+1-th input. The generation is by beam search.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x output_size]. These represent the generated outputs.
          Output i is computed from input i (which is either the i-th element
          of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        state: The state of each decoder cell the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.

    Raises:
      ValueError: when shapes of attention_states are not set,
        or input size cannot be inferred from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # compute the initial hidden state of decoder
        initial_state = math_ops.tanh(
            linear(initial_state,
                   state_size,
                   False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(0,
                                                               0.001,
                                                               seed=SEED))
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(
                        query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query,
                        attention_vec_size,
                        False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # the additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    # softmax with mask
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    a = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        output = None
        state = initial_state
        out_state = array_ops.split(1, num_layers, state)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    out_state = array_ops.gather(out_state,
                                                 index)  # update prev state
                    state = array_ops.gather(state, index)  # update prev state
                    attns = [array_ops.gather(attn, index)
                             for attn in attns]  # update prev attens
                    for j, output in enumerate(outputs):
                        outputs[j] = array_ops.gather(
                            output, index)  # update prev outputs
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(
                            symbol, index)  # update prev symbols
                    symbols.append(prev_symbol)

            # Run the attention mechanism.
            if i > 0 or (i == 0 and initial_state_attention):
                attns = attention(out_state, scope="attention")

            # Run the RNN.
            cinp = array_ops.concat(
                1, [inp, attns[0]
                    ])  # concatenate next input and the context vector
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # handle the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output,
                                              index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol,
                                              index)  # update prev symbols
            symbols.append(prev_symbol)

            # output the best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0),
                                              0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(array_ops.gather(output, 0),
                                                   0)  # update prev outputs
    return outputs, state, symbols
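# A tiny NumPy check of the maxout output layer used above: reshaping the
# linear output to (batch, output_size // 2, 2) and taking the max over the
# last axis keeps the larger of each consecutive pair of units.
import numpy as np

output = np.array([[1.0, 5.0, -2.0, 0.5, 3.0, 3.0]])  # batch of 1, output_size = 6
maxout = output.reshape(-1, 3, 2).max(axis=2)
print(maxout)                                          # [[5.  0.5 3. ]]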
Example #43
    def call(self, inputs, state):
        """
        Run one time step of the cell. That is, given the current inputs and the state from the last time step,
        calculate the current state and cell output.

        You will notice that TensorFlow LSTMCell has a lot of other features. But we will not try them. Focus on the
        very basic LSTM functionality.

        Hint 1: If you try to figure out the tensor shapes, use print(a.get_shape()) to see the shape.

        Hint 2: In LSTM there exist both matrix multiplication and element-wise multiplication. Try not to mix them.

        :param inputs: The input at the current time step. The last dimension of it should be 1.
        :param state:  The state value of the cell from the last time step. The state size can be found from function
                       state_size(self).
        :return: A tuple containing (output, new_state). For details check TensorFlow LSTMCell class.
        """
        #############################################
        #           TODO: YOUR CODE HERE            #
        #############################################
        params = self.params

        c_prev = array_ops.slice(state, [0, 0], [-1, params[0]])
        h_prev = array_ops.slice(state, [0, params[0]], [-1, params[1]])

        W = self.W
        b = self.b

        W_fh = W['W_fh']
        W_ih = W['W_ih']
        W_ch = W['W_ch']
        W_oh = W['W_oh']
        W_fi = W['W_fi']
        W_ii = W['W_ii']
        W_ci = W['W_ci']
        W_oi = W['W_oi']
        W_h = W['W_h']
        W_fc = W['W_fc']
        W_ic = W['W_ic']
        W_oc = W['W_oc']

        b_f = b['b_f']
        b_i = b['b_i']
        b_c = b['b_c']
        b_o = b['b_o']

        f = math_ops.sigmoid(
            tf.matmul(h_prev, W_fh) + tf.multiply(inputs, W_fi) + b_f +
            tf.matmul(c_prev, W_fc))
        i = math_ops.sigmoid(
            tf.matmul(h_prev, W_ih) + tf.multiply(inputs, W_ii) + b_i +
            tf.matmul(c_prev, W_ic))
        _c = math_ops.tanh(
            tf.matmul(h_prev, W_ch) + tf.multiply(inputs, W_ci) + b_c)
        c = f * c_prev + i * _c
        o = math_ops.sigmoid(
            tf.matmul(h_prev, W_oh) + tf.multiply(inputs, W_oi) + b_o +
            tf.matmul(c, W_oc))

        h = o * math_ops.tanh(c)
        h = tf.matmul(h, W_h)

        new_state = (array_ops.concat([c, h], 1))
        output = h

        return output, new_state
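# For comparison, a minimal NumPy sketch of one textbook LSTM step; unlike the
# cell above it has no c_prev terms inside the gates, and the weight shapes
# used here are illustrative assumptions.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, W, b):
    # W: (input_dim + hidden_dim, 4 * hidden_dim), b: (4 * hidden_dim,)
    z = np.concatenate([x, h_prev], axis=1) @ W + b
    i, j, f, o = np.split(z, 4, axis=1)
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(j)
    h = sigmoid(o) * np.tanh(c)
    return h, c

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3)); h0 = np.zeros((2, 5)); c0 = np.zeros((2, 5))
W = rng.standard_normal((8, 20)) * 0.1; b = np.zeros(20)
h1, c1 = lstm_step(x, h0, c0, W, b)
print(h1.shape, c1.shape)   # (2, 5) (2, 5)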
# In[6]:


# Global variables
batches = 1
stime = 500
num_units = 20
num_inputs = 1
rnn_init_state = np.zeros([1, num_units], dtype="float32")
rnn_inputs = np.zeros((batches, stime, num_inputs), dtype="float32")
rnn_inputs[0, :, 0] = (np.sin(np.linspace(0, 18 * np.pi, stime)) +
                       np.sin(np.linspace(0, 5.3 * np.pi, stime)) +
                       np.sin(np.linspace(0, 2.1 * np.pi, stime)))
plt.plot(rnn_inputs[0,:,:])
plt.show()

activation = lambda x: math_ops.tanh(x)


# Implementing a static graph without the TensorFlow RNN API:

# In[7]:


tf.reset_default_graph()
static_graph = tf.Graph()
with static_graph.as_default() as g:
        
    rng = np.random.RandomState(random_seed)

    # Init the ESN cell
    cell = EchoStateRNNCell(num_units=num_units, 
Example #45
    def call(self, inputs, state):
        """Run one step of G-LSTM.
    Args:
      inputs: input Tensor, 2D, [batch x num_units].
      state: this must be a tuple of state Tensors, both `2-D`,
      with column sizes `c_state` and `m_state`.
    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        G-LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - LSTMStateTuple representing the new state of G-LSTM cell
        after reading `inputs` when the previous state was `state`.
    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
        (c_prev, m_prev) = state

        self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
        input_size = inputs.shape[-1].value or array_ops.shape(inputs)[-1]
        dtype = inputs.dtype
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope, initializer=self._initializer):
            i_parts = []
            j_parts = []
            f_parts = []
            o_parts = []

            for group_id in range(self._number_of_groups):
                with vs.variable_scope("group%d" % group_id):
                    x_g_id = array_ops.concat(
                        [
                            self._get_input_for_group(
                                inputs, group_id,
                                int(input_size / self._number_of_groups)),
                            #self._group_shape[0]), # this is only correct if inputs dim = num_units!!!
                            self._get_input_for_group(
                                m_prev, group_id,
                                int(self._output_size /
                                    self._number_of_groups))
                        ],
                        axis=1)
                    #self._group_shape[0])], axis=1)
                    if self._linear1[group_id] is None:
                        self._linear1[group_id] = _Linear(
                            x_g_id, 4 * self._group_shape[1], False)
                    R_k = self._linear1[group_id](x_g_id)  # pylint: disable=invalid-name
                    i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)

                i_parts.append(i_k)
                j_parts.append(j_k)
                f_parts.append(f_k)
                o_parts.append(o_k)

            bi = vs.get_variable(name="bias_i",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bj = vs.get_variable(name="bias_j",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bf = vs.get_variable(name="bias_f",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bo = vs.get_variable(name="bias_o",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))

            i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
            j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
            f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
            o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)

        c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
             math_ops.sigmoid(i) * math_ops.tanh(j))
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                if self._linear2 is None:
                    self._linear2 = _Linear(m, self._num_proj, False)
                m = self._linear2(m)

        new_state = rnn_cell_impl.LSTMStateTuple(c, m)
        return m, new_state
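# A NumPy sketch (sizes assumed) of the grouping idea in the G-LSTM cell
# above: each group sees only its slice of the concatenated [x, m_prev] and
# has its own small weight matrix, which is equivalent to one large
# block-diagonal weight matrix.
import numpy as np

groups, per_group_in, per_group_out = 2, 3, 4
x = np.random.randn(1, groups * per_group_in)
Ws = [np.random.randn(per_group_in, per_group_out) for _ in range(groups)]

group_outs = [x[:, g * per_group_in:(g + 1) * per_group_in] @ Ws[g] for g in range(groups)]
grouped = np.concatenate(group_outs, axis=1)

block_diag = np.zeros((groups * per_group_in, groups * per_group_out))
for g, W in enumerate(Ws):
    block_diag[g * per_group_in:(g + 1) * per_group_in,
               g * per_group_out:(g + 1) * per_group_out] = W
print(np.allclose(grouped, x @ block_diag))   # True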
 def Cell(v):
     # If v is a vector [n, 1], x is a big square matrix.
     x = math_ops.tanh(v + array_ops.transpose(v, [1, 0]))
     return math_ops.reduce_sum(x, 1, keep_dims=True)
        def attention(query):
            """
      Put attention masks on hidden using hidden_features and query.
      :param query: Vector to compute attention with
      """
            # Results of attention reads will be stored here.
            ds = []
            # Will store masks over encoder context
            attn_masks = []
            # Store attention logits
            attn_logits = []
            # If the query is a tuple, flatten it.
            if nest.is_sequence(query):
                query_list = nest.flatten(query)
                # Check that ndims == 2 if specified.
                for q in query_list:
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    if attn_type == "linear":
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[a] * math_ops.tanh(hidden_features[a] + y),
                            [2, 3])
                    elif attn_type == "bilinear":
                        query = tf.tile(tf.expand_dims(query, 1),
                                        [1, attn_length, 1])
                        query = batch_linear(query, attn_size, bias=True)
                        hid = tf.squeeze(hidden, [2])
                        s = tf.reduce_sum(tf.mul(query, hid), [2])
                    else:
                        # Two layer MLP
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        layer1 = math_ops.tanh(hidden_features[a] + y)
                        k2 = variable_scope.get_variable(
                            "AttnW_%d" % a,
                            [1, 1, attn_size, attention_vec_size])
                        layer2 = nn_ops.conv2d(layer1, k2, [1, 1, 1, 1],
                                               "SAME")
                        s = math_ops.reduce_sum(v[a] * math_ops.tanh(layer2),
                                                [2, 3])

                    a = nn_ops.softmax(s)
                    attn_masks.append(a)
                    attn_logits.append(s)
                    # Now calculate the attention-weighted vector d. Hidden is encoder
                    # hidden states
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, attn_masks, attn_logits
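# Minimal NumPy versions (shapes assumed) of the two main scoring styles
# dispatched on attn_type above: additive ("linear") scoring
# v^T tanh(W_h h + W_q q) and bilinear scoring q^T W h, each yielding one
# score per encoder position before the softmax.
import numpy as np

attn_len, hid, qdim, vec = 5, 6, 4, 3
rng = np.random.default_rng(0)
H = rng.standard_normal((attn_len, hid))      # encoder hidden states
q = rng.standard_normal(qdim)                 # decoder query

W_h, W_q, v_a = (rng.standard_normal((hid, vec)),
                 rng.standard_normal((qdim, vec)),
                 rng.standard_normal(vec))
additive_scores = np.tanh(H @ W_h + q @ W_q) @ v_a    # (attn_len,)

W_b = rng.standard_normal((qdim, hid))
bilinear_scores = H @ (W_b.T @ q)                      # (attn_len,)
print(additive_scores.shape, bilinear_scores.shape)    # (5,) (5,)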
Example #48
    def __call__(self, inputs, state, scope=None):
        """Run one step of G-LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
          scope: not used

        Returns:
          A tuple containing:

          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            G-LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of G-LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        (c_prev, m_prev) = state

        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")
        dtype = inputs.dtype
        with vs.variable_scope(scope or "glstm_cell",
                               initializer=self._initializer):
            i_parts = []
            j_parts = []
            f_parts = []
            o_parts = []

            for group_id in xrange(self._number_of_groups):
                with vs.variable_scope("group%d" % group_id):
                    x_g_id = array_ops.concat([
                        self._get_input_for_group(inputs, group_id,
                                                  self._group_shape[0]),
                        self._get_input_for_group(m_prev, group_id,
                                                  self._group_shape[0])
                    ],
                                              axis=1)
                    R_k = linear(x_g_id,
                                 4 * self._group_shape[1],
                                 bias=False,
                                 scope=scope)  #will add per gate biases later
                    i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)

                i_parts.append(i_k)
                j_parts.append(j_k)
                f_parts.append(f_k)
                o_parts.append(o_k)

            # it is more efficient to have per-gate biases than per-gate, per-group biases
            bi = vs.get_variable(name="biases_i",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bj = vs.get_variable(name="biases_j",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bf = vs.get_variable(name="biases_f",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bo = vs.get_variable(name="biases_o",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))

            i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
            j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
            f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
            o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)

        c = math_ops.sigmoid(f +
                             self._forget_bias) * c_prev + math_ops.sigmoid(
                                 i) * math_ops.tanh(j)
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                m = linear(m, self._num_proj, bias=False, scope=scope)

        new_state = LSTMStateTuple(c, m)
        return m, new_state
def RNN(x, weights, biases):
    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, n_input])
    x = tf.split(0, n_steps, x)
    gru_fw_cell = rnn_cell.GRUCell(n_hidden)
    gru_fw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_fw_cell,
                                                output_keep_prob=0.7)
    gru_bw_cell = rnn_cell.GRUCell(n_hidden)
    gru_bw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_bw_cell,
                                                output_keep_prob=0.7)
    outputs, _, _ = rnn.bidirectional_rnn(gru_fw_cell,
                                          gru_bw_cell,
                                          x,
                                          dtype=tf.float32)

    batch_s = 100

    outputs_all = tf.concat(0, outputs)  # (N*batch) * 2*n_hidden

    # dropout
    outputs_all = tf.nn.dropout(outputs_all, keep_prob=0.5)

    M = tanh(tf.matmul(outputs_all, W_h))  # (N*batch) * 2*hidden
    M_2 = tanh(tf.matmul(outputs_all, W_h_2))  # (N*batch) * 2*hidden

    # dropout
    M = tf.nn.dropout(M, keep_prob=0.5)
    M_2 = tf.nn.dropout(M_2, keep_prob=0.5)

    #a = tf.matmul(M,w)
    a = tanh(tf.matmul(outputs_all, w))
    a = tf.reshape(a, [n_steps, -1])  # N*batch
    a = tf.transpose(a, [1, 0])  # batch*N
    a = tf.nn.softmax(a)
    a = tf.reshape(a, [batch_s, 1, n_steps])  # batch*1*N

    a_2 = tanh(tf.matmul(outputs_all, w))
    a_2 = tf.reshape(a_2, [n_steps, -1])  # N*batch
    a_2 = tf.transpose(a_2, [1, 0])  # batch*N
    a_2 = tf.nn.softmax(a_2)
    a_2 = tf.reshape(a_2, [batch_s, 1, n_steps])  # batch*1*N

    outputs_all = tf.reshape(outputs_all,
                             [n_steps, -1, 2 * n_hidden])  # N*batch*d
    outputs_all = tf.transpose(outputs_all, [1, 0, 2])  # batch*N*d

    a = tf.split(0, batch_s, a)
    a_2 = tf.split(0, batch_s, a_2)
    outputs_all = tf.split(0, batch_s, outputs_all)

    r = []
    r_2 = []
    for i in range(batch_s):
        a_temp = a[i][0:1, :, :]
        o_temp = outputs_all[i][0:1, :, :]
        att = tf.reshape(a_temp, [1, n_steps])  # 1*N
        out = tf.reshape(o_temp, [n_steps, 2 * n_hidden])  # N*2*n_hidden

        a_2_temp = a_2[i][0:1, :, :]
        o_2_temp = outputs_all[i][0:1, :, :]
        att_2 = tf.reshape(a_2_temp, [1, n_steps])  # 1*N
        out_2 = tf.reshape(o_2_temp, [n_steps, 2 * n_hidden])  # N*2*n_hidden

        # dropout
        att = tf.nn.dropout(att, keep_prob=0.5)
        out = tf.nn.dropout(out, keep_prob=0.5)
        att_2 = tf.nn.dropout(att_2, keep_prob=0.5)
        out_2 = tf.nn.dropout(out_2, keep_prob=0.5)

        r.append(tf.matmul(att, out))
        r_2.append(tf.matmul(att_2, out_2))
    r = tf.concat(0, r)  # batch*d
    r_2 = tf.concat(0, r_2)  # batch*d
    _h = tanh(W_p * r + W_x * outputs[-1] + W_p_2 * r_2)

    # dropout
    _h = tf.nn.dropout(_h, keep_prob=0.25)

    predict = tf.matmul(_h, weights['out']) + biases['out']
    return predict, outputs
 def Foo(x, y, z):
     return math_ops.tanh(math_ops.matmul(x, y) + z)
 def MLP(i, a, ws, bs):
     a = math_ops.tanh(math_ops.matmul(a, ws[i, :]) + bs[i, :])
     return a, ws, bs
 def Forward(x):
     return math_ops.reduce_sum(math_ops.tanh(x))
Example #53
  def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: state Tensor, 2D, batch x state_size.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".

    Returns:
      A tuple containing:
      - A 2D, batch x output_dim, Tensor representing the output of the LSTM
        after reading "inputs" when previous state was "state".
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - A 2D, batch x state_size, Tensor representing the new state of LSTM
        after reading "inputs" when previous state was "state".
    Raises:
      ValueError: if an input_size was specified and the provided inputs have
        a different dimension.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    actual_input_size = inputs.get_shape().as_list()[1]
    if self._input_size and self._input_size != actual_input_size:
      raise ValueError("Actual input size not same as specified: %d vs %d." %
                       actual_input_size, self._input_size)
    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
      concat_w = _get_concat_variable(
          "W", [actual_input_size + num_proj, 4 * self._num_units],
          dtype, self._num_unit_shards)

      b = vs.get_variable(
          "B", shape=[4 * self._num_units],
          initializer=array_ops.zeros_initializer, dtype=dtype)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      cell_inputs = array_ops.concat(1, [inputs, m_prev])
      lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
      i, j, f, o = array_ops.split(1, 4, lstm_matrix)

      # Diagonal connections
      if self._use_peepholes:
        w_f_diag = vs.get_variable(
            "W_F_diag", shape=[self._num_units], dtype=dtype)
        w_i_diag = vs.get_variable(
            "W_I_diag", shape=[self._num_units], dtype=dtype)
        w_o_diag = vs.get_variable(
            "W_O_diag", shape=[self._num_units], dtype=dtype)

      if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
             sigmoid(i + w_i_diag * c_prev) * tanh(j))
      else:
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))

      if self._cell_clip is not None:
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

      if self._use_peepholes:
        m = sigmoid(o + w_o_diag * c) * tanh(c)
      else:
        m = sigmoid(o) * tanh(c)

      if self._num_proj is not None:
        concat_w_proj = _get_concat_variable(
            "W_P", [self._num_units, self._num_proj],
            dtype, self._num_proj_shards)

        m = math_ops.matmul(m, concat_w_proj)

    return m, array_ops.concat(1, [c, m])
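# A small NumPy check of the "diagonal" peephole connections used above: the
# term w_f_diag * c_prev is an elementwise product, i.e. it multiplies c_prev
# by the diagonal matrix diag(w_f_diag) rather than by a full weight matrix.
# Shapes are illustrative assumptions.
import numpy as np

batch, num_units = 2, 4
c_prev = np.random.randn(batch, num_units)
w_f_diag = np.random.randn(num_units)
print(np.allclose(w_f_diag * c_prev, c_prev @ np.diag(w_f_diag)))   # True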
def attention_decoder(encoder_mask, decoder_inputs, encoder_embeds, encoder_probs,
                      encoder_hs, mem_mask, initial_state, attention_states, cell, beam_size,
                      output_size=None, num_heads=1, num_layers=1, loop_function=None,
                      dtype=dtypes.float32, scope=None, initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks.

    Args:
        encoder_mask: A 2D Tensor [batch_size x input_size]
        decoder_inputs: A list of 3D Tensors [batch_size x input_size x hidden_emb].
        encoder_embeds: A 3D Tensor [batch_size x 2*input_size x hidden_emb]
        encoder_probs: A 3D Tensor [batch_size x 2*input_size x target_vocab_size]
        encoder_hs: A 3D Tensor [batch_size x 2*input_size x input_size]
        mem_mask:  A 2D Tensor [batch_size x 2*input_size]
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        output_size: Size of the output vectors; if None, we use cell.output_size.
        num_heads: Number of attention heads that read from attention_states.
        loop_function: If not None, this function will be applied to i-th output
            in order to generate i+1-th input, and decoder_inputs will be ignored,
            except for the first element ("GO" symbol).
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "attention_decoder".
        initial_state_attention: If False (default), initial attentions are zero.
            If True, initialize the attentions from the initial state and attention
            states -- useful when we wish to resume decoding from a previously
            stored decoder state and attention states.

    Returns:
         A tuple of the form (outputs, state, symbols, logits_mem, aligns_mem), where:
            outputs: A list of the same length as decoder_inputs of 2D Tensors of
                  shape [batch_size x output_size].
            state: The state of each decoder cell at the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].
            symbols: A list of target word ids, the best results returned by beam search.
            logits_mem: A list of 2D Tensors of shape [batch_size x target_vocab_size].
            aligns_mem: A list of memory attention weights.

    Raises:
      ValueError: when num_heads is not positive, there are no inputs, shapes
        of attention_states are not set, or input size cannot be inferred
        from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        embed_size = encoder_embeds.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value

        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])

        # memory hidden states: attention_states weighted by the probabilities in encoder_hs
        encoder_hs = math_ops.reduce_sum(
                array_ops.tile(array_ops.reshape(attention_states, [batch_size, 1, attn_length, attn_size]),
                               [1, 2 * attn_length, 1, 1]) * array_ops.expand_dims(encoder_hs, 3), [2])

        # merged hidden states are concatenated with the target word embeddings
        mems = array_ops.concat(2, [encoder_hs, encoder_embeds])
        mems = array_ops.transpose(array_ops.expand_dims(mems, 3), [0, 1, 3, 2])

        hidden_features = []
        v = []
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        initial_state = math_ops.tanh(
                linear(initial_state, state_size, False,
                       weight_initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED)))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                for a in xrange(num_heads):
                    k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size],
                                                    initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                    hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                    v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size],
                                                         initializer=init_ops.constant_initializer(0.0)))
                ds = []  # Results of attention reads will be stored here.
                aa = []
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y = linear(query, attention_vec_size, False,
                                   weight_initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        s = array_ops.transpose(array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                        # softmax with mask: zero out padded positions and renormalize
                        s = math_ops.exp(s)
                        s = math_ops.to_float(encoder_mask) * s
                        a = array_ops.transpose(array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                        aa.append(a)
                        d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, aa

        # memory attention
        def attention_mem(query, scope=None):
            with variable_scope.variable_scope(scope or "attention"):
                vt = []
                hidden_targets = []
                for a in xrange(num_heads):
                    vt.append(variable_scope.get_variable("AttnVt_%d" % a, [attention_vec_size],
                                                          initializer=init_ops.constant_initializer(0.0)))
                    kt = variable_scope.get_variable("AttnWt_%d" % a,
                                                     [1, 1, embed_size + attn_size, attention_vec_size],
                                                     initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                    hidden_targets.append(nn_ops.conv2d(mems, kt, [1, 1, 1, 1], "SAME"))

                ds_mem = []
                as_mem = []
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y_mem = linear(query, attention_vec_size, False,
                                       weight_initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED),
                                       scope="Linear_mem")
                        y_mem = array_ops.reshape(y_mem, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s_mem = math_ops.reduce_sum(vt[a] * math_ops.tanh(hidden_targets[a] + y_mem), [2, 3])
                        s_mem = array_ops.transpose(array_ops.transpose(s_mem) - math_ops.reduce_max(s_mem, [1]))
                        s_mem = math_ops.exp(s_mem)
                        s_mem = mem_mask * s_mem
                        a_mem = array_ops.transpose(array_ops.transpose(s_mem) / math_ops.reduce_sum(s_mem, [1]))
                        as_mem.append(a_mem)
                        # Now calculate the attention-weighted vector d.
                        d_mem = math_ops.reduce_sum(array_ops.expand_dims(a_mem, 2) * encoder_probs, [1])
                        ds_mem.append(d_mem)
            return ds_mem, as_mem

        outputs = []
        logits_mem = []
        aligns_mem = []
        output = None
        state = initial_state
        out_state = array_ops.split(1, num_layers, state)[-1]
        prev = None
        prev_d_mem = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(prev, prev_probs, beam_size, prev_d_mem, i)
                    out_state = array_ops.gather(out_state, index)  # update prev state
                    state = array_ops.gather(state, index)  # update prev state
                    attns = [array_ops.gather(attn, index) for attn in attns]  # update prev attentions
                    for j, output in enumerate(outputs):
                        outputs[j] = array_ops.gather(output, index)  # update prev outputs
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
                    for j, logit_mem in enumerate(logits_mem):
                        logits_mem[j] = array_ops.gather(logit_mem, index)  # update prev memory logits
                    for j, align_mem in enumerate(aligns_mem):
                        aligns_mem[j] = array_ops.gather(align_mem, index)  # update prev memory alignments
                    symbols.append(prev_symbol)

            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)

            # Run the attention mechanism.
            if i > 0 or (i == 0 and initial_state_attention):
                attns, aa = attention(out_state, scope="attention")
                query = array_ops.concat(1, [out_state, inp])
                logit_mem, align_mem = attention_mem(query, scope="attention")
                logits_mem.append(logit_mem[0])
                aligns_mem.append(align_mem[0])

            # Run the RNN.
            cinp = array_ops.concat(1, [inp, attns[0]])
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
                prev_d_mem = logits_mem[-1]
            outputs.append(output)

        if loop_function is not None:
            # process the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(prev, prev_probs, beam_size, prev_d_mem, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output, index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
            for j, logit_mem in enumerate(logits_mem):
                logits_mem[j] = array_ops.gather(logit_mem, index)  # update prev memory logits
            for j, align_mem in enumerate(aligns_mem):
                aligns_mem[j] = array_ops.gather(align_mem, index)  # update prev memory alignments
            symbols.append(prev_symbol)

            # output the final best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0), 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(array_ops.gather(output, 0), 0)  # keep the best beam's outputs
            for k, logit_mem in enumerate(logits_mem):
                logits_mem[k] = array_ops.expand_dims(array_ops.gather(logit_mem, 0), 0)
            for k, align_mem in enumerate(aligns_mem):
                aligns_mem[k] = array_ops.expand_dims(array_ops.gather(align_mem, 0), 0)
    return outputs, state, symbols, logits_mem, aligns_mem
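A note on the two helpers above: attention() and attention_mem() share the same masked-softmax pattern, which is to shift each row of scores by its maximum for numerical stability, exponentiate, zero out padded positions with the mask, and renormalize so every row sums to one. A minimal NumPy sketch of that step; the function name masked_softmax and the toy inputs are illustrative, not part of the original code.

import numpy as np

def masked_softmax(scores, mask):
    # Subtract the per-row max for stability, exponentiate,
    # drop padded entries, and renormalize over the valid ones.
    s = np.exp(scores - scores.max(axis=1, keepdims=True)) * mask
    return s / s.sum(axis=1, keepdims=True)

scores = np.array([[2.0, 1.0, 0.5], [0.3, 0.7, 1.2]])
mask = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])  # 0 marks padding
print(masked_softmax(scores, mask))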
Example #55
    def __call__(self, input_, state, scope=None):
        """Run one step of LSTM.

    Args:
      input_: input Tensor, 2D, batch x num_units.
      state: state Tensor, 2D, batch x state_size.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".

    Returns:
      A tuple containing:
      - A 2D, batch x output_dim, Tensor representing the output of the LSTM
        after reading "input_" when previous state was "state".
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - A 2D, batch x state_size, Tensor representing the new state of LSTM
        after reading "input_" when previous state was "state".
    """
        num_proj = self._num_units if self._num_proj is None else self._num_proj

        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

        dtype = input_.dtype

        with vs.variable_scope(scope or type(self).__name__):  # "LSTMCell"
            sharded_w = _get_sharded_variable(
                "W", [self.input_size + num_proj, 4 * self._num_units],
                self._initializer, dtype, self._num_unit_shards)

            b = vs.get_variable("B",
                                shape=[4 * self._num_units],
                                initializer=array_ops.zeros_initializer,
                                dtype=dtype)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = array_ops.concat(1, [input_, m_prev])
            lstm_matrix = nn_ops.bias_add(
                _matmul_with_sharded_variable(cell_inputs, sharded_w), b)
            i, j, f, o = array_ops.split(1, 4, lstm_matrix)

            # Diagonal connections
            if self._use_peepholes:
                w_f_diag = vs.get_variable("W_F_diag",
                                           shape=[self._num_units],
                                           initializer=self._initializer,
                                           dtype=dtype)
                w_i_diag = vs.get_variable("W_I_diag",
                                           shape=[self._num_units],
                                           initializer=self._initializer,
                                           dtype=dtype)
                w_o_diag = vs.get_variable("W_O_diag",
                                           shape=[self._num_units],
                                           initializer=self._initializer,
                                           dtype=dtype)

            if self._use_peepholes:
                c = (sigmoid(f + 1 + w_f_diag * c_prev) * c_prev +
                     sigmoid(i + w_i_diag * c_prev) * tanh(j))
            else:
                c = (sigmoid(f + 1) * c_prev + sigmoid(i) * tanh(j))

            if self._cell_clip is not None:
                c = clip_ops.clip_by_value(c, -self._cell_clip,
                                           self._cell_clip)

            if self._use_peepholes:
                m = sigmoid(o + w_o_diag * c) * tanh(c)
            else:
                m = sigmoid(o) * tanh(c)

            if self._num_proj is not None:
                sharded_w_proj = _get_sharded_variable(
                    "W_P", [self._num_units, self._num_proj],
                    self._initializer, dtype, self._num_proj_shards)

                m = _matmul_with_sharded_variable(m, sharded_w_proj)

        return m, array_ops.concat(1, [c, m])
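For reference, the peephole branch of the cell above (with num_proj unset, no cell clipping, and the fixed forget bias of 1 used in the code) reduces to a few dense operations. A NumPy sketch under those assumptions; the weight names are illustrative.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def peephole_lstm_step(x, m_prev, c_prev, W, b, w_i, w_f, w_o):
    # W has shape [input_size + num_units, 4 * num_units]; the diagonal
    # peephole weights w_i, w_f, w_o have shape [num_units] and let the
    # gates look at the cell state, mirroring W_I_diag / W_F_diag / W_O_diag.
    i, j, f, o = np.split(np.concatenate([x, m_prev], axis=1) @ W + b, 4, axis=1)
    c = sigmoid(f + 1.0 + w_f * c_prev) * c_prev + sigmoid(i + w_i * c_prev) * np.tanh(j)
    m = sigmoid(o + w_o * c) * np.tanh(c)
    return m, c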
Example #56
 def __call__(self, inputs, state, scope=None):
   """Most basic RNN: output = new_state = tanh(W * input + U * state + B)."""
   with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"
     output = tanh(linear([inputs, state], self._num_units, True))
   return output, output
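The single line in BasicRNNCell.__call__ corresponds exactly to the update in its docstring. A self-contained NumPy sketch, with illustrative weight names:

import numpy as np

def basic_rnn_step(x, h_prev, W, U, b):
    # output = new_state = tanh(x @ W + h_prev @ U + b)
    new_h = np.tanh(x @ W + h_prev @ U + b)
    return new_h, new_h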
Example #57
        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_dist = nn_ops.softmax(
                        e)  # take softmax. shape (batch_size, attn_length)

                    # If end2end, multiply the selector's sentence probability with the attention probability
                    if selector_probs is not None:
                        attn_dist_norescale = attn_dist * enc_padding_mask  # apply mask, attention probabilities of pad tokens will be 0
                        masked_sums = tf.reduce_sum(
                            attn_dist_norescale, axis=1,
                            keep_dims=True)  # shape (batch_size, 1)
                        attn_dist_norescale = attn_dist_norescale / masked_sums

                        batch_nums = tf.expand_dims(
                            tf.range(0, limit=batch_size),
                            1)  # shape (batch_size, 1)
                        batch_nums_tile = tf.tile(
                            batch_nums,
                            [1, attn_len])  # shape (batch_size, attn_len)
                        indices = tf.stack(
                            (batch_nums_tile, enc_sent_id_mask),
                            axis=2)  # shape (batch_size, attn_len, 2)
                        # All pad tokens will get probability of 0.0 since the sentence id is -1 (gather_nd will produce 0.0 for invalid indices)
                        selector_probs_projected = tf.gather_nd(
                            selector_probs,
                            indices)  # shape (batch_size, attn_len)
                        attn_dist *= selector_probs_projected  # shape (batch_size, attn_len)
                        attn_dist *= enc_padding_mask
                        masked_sums = tf.reduce_sum(
                            attn_dist, axis=1,
                            keep_dims=True)  # shape (batch_size, 1)
                        attn_dist = attn_dist / masked_sums  # re-normalize
                        return attn_dist_norescale, attn_dist
                    else:
                        attn_dist *= enc_padding_mask  # apply mask, attention probabilities of pad tokens will be 0
                        masked_sums = tf.reduce_sum(
                            attn_dist, axis=1,
                            keep_dims=True)  # shape (batch_size, 1)
                        attn_dist = attn_dist / masked_sums  # re-normalize
                        return None, attn_dist

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,attn_length)
                    # Calculate attention distribution
                    attn_dist_norescale, attn_dist = masked_attention(e)
                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e
                    # Calculate attention distribution
                    attn_dist_norescale, attn_dist = masked_attention(e)
                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist_norescale, attn_dist, coverage
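The end2end branch of masked_attention() rescales each token's attention weight by the probability of the sentence it belongs to (looked up with gather_nd), masks out padding, and renormalizes. A toy NumPy sketch of just that rescaling, with illustrative shapes and values; padded tokens carry sentence id -1 and, as the code's comment notes, gather_nd gives them probability 0.0, which np.where reproduces here.

import numpy as np

attn_dist = np.array([[0.2, 0.3, 0.4, 0.1]])           # (batch, attn_len)
enc_padding_mask = np.array([[1.0, 1.0, 1.0, 0.0]])    # 0 marks padding
enc_sent_id_mask = np.array([[0, 0, 1, -1]])           # sentence id per token
selector_probs = np.array([[0.9, 0.2]])                # (batch, num_sentences)

# Per-token sentence probability; invalid (-1) ids map to 0.0.
projected = np.where(
    enc_sent_id_mask >= 0,
    np.take_along_axis(selector_probs, np.maximum(enc_sent_id_mask, 0), axis=1),
    0.0)
rescaled = attn_dist * projected * enc_padding_mask
rescaled /= rescaled.sum(axis=1, keepdims=True)        # re-normalize
print(rescaled)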
Example #58
        def intra_decoder_attention(decoder_state, decoder_history_c,
                                    decoder_history_h):
            """Calculate the context vector and attention distribution from the decoder state and the previous decode states


            Args:
              decoder_state: state of the decoder
              decoder_history: tensor array [ (batch_size, state_size)]
              decoder_coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

            Returns:
              context_vector: weighted sum of encoder_states
              attn_dist: attention distribution
              coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
            """
            with variable_scope.variable_scope("Intra_Decoder_Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, decoder_cell_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, state size)
                # Getting the history to this point and stack the item to produce a single tensor
                decoder_history_states_c = tf.TensorArray(tf.float32,
                                                          size=0,
                                                          dynamic_size=True)
                decoder_history_states_h = tf.TensorArray(tf.float32,
                                                          size=0,
                                                          dynamic_size=True)
                # TensorArray.write returns a new TensorArray; keep the result.
                for i in range(len(decoder_history_c)):
                    decoder_history_states_c = decoder_history_states_c.write(i, decoder_history_c[i])
                    decoder_history_states_h = decoder_history_states_h.write(i, decoder_history_h[i])

                decoder_history_states_c = decoder_history_states_c.stack()
                decoder_history_states_c = tf.transpose(
                    decoder_history_states_c, [1, 0, 2])
                decoder_history_states_c = tf.expand_dims(
                    decoder_history_states_c, axis=1)

                decoder_history_states_h = decoder_history_states_h.stack()
                decoder_history_states_h = tf.transpose(
                    decoder_history_states_h, [1, 0, 2])
                decoder_history_states_h = tf.expand_dims(
                    decoder_history_states_h, axis=1)

                W_d_h = variable_scope.get_variable(
                    "W_d_h", [1, 1, decoder_cell_size, decoder_cell_size])
                decoder_history_features_h = nn_ops.conv2d(
                    decoder_history_states_h, W_d_h, [1, 1, 1, 1],
                    "SAME")  # shape (batch_size,t,1,state size)

                W_d_c = variable_scope.get_variable(
                    "W_d_c", [1, 1, decoder_cell_size, decoder_cell_size])
                decoder_history_features_c = nn_ops.conv2d(
                    decoder_history_states_c, W_d_c, [1, 1, 1, 1],
                    "SAME")  # shape (batch_size,t,1,state size)

                def masked_d_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_d_dist = nn_ops.softmax(
                        e)  # take softmax. shape (batch_size, attn_length)
                    # attn_d_dist *= dec_padding_mask  # apply mask
                    masked_d_sums = tf.reduce_sum(attn_d_dist,
                                                  axis=1)  # shape (batch_size)
                    return attn_d_dist / tf.reshape(masked_d_sums,
                                                    [-1, 1])  # re-normalize

                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                e = math_ops.reduce_sum(
                    v_d * math_ops.tanh(decoder_history_features_c +
                                        decoder_history_features_h +
                                        decoder_features),
                    [2, 3])  # calculate e
                # print("e shape", e.get_shape())
                # print("decoder_history_states_c shape", decoder_history_states_c.get_shape())

                # Calculate attention distribution
                attn_d_dist = masked_d_attention(e)
                # print("attention dis shape", attn_d_dist.get_shape())

                # Calculate the context vector from attn_dist and encoder_states
                context_d_vector_c = math_ops.reduce_sum(
                    array_ops.reshape(attn_d_dist, [batch_size, -1, 1, 1]) *
                    decoder_history_states_c,
                    [1, 2])  # shape (batch_size, state size).
                context_d_vector_c = array_ops.reshape(
                    context_d_vector_c, [-1, state.c.get_shape()[1].value])

                context_d_vector_h = math_ops.reduce_sum(
                    array_ops.reshape(attn_d_dist, [batch_size, -1, 1, 1]) *
                    decoder_history_states_h,
                    [1, 2])  # shape (batch_size, state size).
                context_d_vector_h = array_ops.reshape(
                    context_d_vector_h, [-1, state.c.get_shape()[1].value])

            return context_d_vector_c, context_d_vector_h
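intra_decoder_attention() scores the current decoder state against every previous decoder state and returns attention-weighted sums of the cell and hidden histories. A toy single-example NumPy sketch of that idea; for brevity the additive scoring (v_d^T tanh(...)) is replaced here by a plain dot product, so this illustrates the mechanism rather than the exact scorer used above.

import numpy as np

def intra_decoder_context(decoder_state, history):
    # history: (t, state_size) stack of previous decoder states,
    # decoder_state: (state_size,) current state.
    scores = history @ decoder_state
    a = np.exp(scores - scores.max())
    a /= a.sum()                      # attention over past decoder steps
    return a @ history                # weighted sum of the history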
 def fun(x):
     return math_ops.reduce_prod(math_ops.tanh(x)**2)
Example #60
    def __call__(self, inputs, state):
        embs = inputs[0]
        if len(inputs) == 2:
            mask_slice = inputs[1]
        else:
            mask_slice = None

        context = self.context
        context_mask = self.context_mask
        pctx_ = self.pctx_
        """Gated recurrent unit (GRU) with nunits cells."""
        tf.get_variable_scope().reuse_variables()
        W = tf.get_variable('W', dtype=self._precision)
        b = tf.get_variable('b', dtype=self._precision)
        U = tf.get_variable('U', dtype=self._precision)
        Wx = tf.get_variable('Wx', dtype=self._precision)
        Ux = tf.get_variable('Ux', dtype=self._precision)
        bx = tf.get_variable('bx', dtype=self._precision)
        U_nl = tf.get_variable('U_nl', dtype=self._precision)
        b_nl = tf.get_variable('b_nl', dtype=self._precision)
        Ux_nl = tf.get_variable('Ux_nl', dtype=self._precision)
        bx_nl = tf.get_variable('bx_nl', dtype=self._precision)
        Wc = tf.get_variable('Wc', dtype=self._precision)
        Wcx = tf.get_variable('Wcx', dtype=self._precision)
        W_comb_att = tf.get_variable('W_comb_att', dtype=self._precision)
        Wc_att = tf.get_variable('Wc_att', dtype=self._precision)
        b_att = tf.get_variable('b_att', dtype=self._precision)
        U_att = tf.get_variable('U_att', dtype=self._precision)
        c_tt = tf.get_variable('c_tt', dtype=self._precision)

        # graph build
        emb2hidden = math_ops.matmul(embs, Wx) + bx
        emb2gates = math_ops.matmul(embs, W) + b

        nlocation = tf.shape(context)[0]
        nsamples = tf.shape(context)[1]
        if state is None:
            raise ValueError("An initial state must be provided.")

        if mask_slice is None:
            mask_slice = tf.ones([nsamples, self._num_units])  # for decoding

        # gates input for first gru layer
        preAct1 = math_ops.matmul(state, U)
        preAct1 += emb2gates
        preAct1 = math_ops.sigmoid(preAct1)
        r1, u1 = array_ops.split(preAct1, 2, 1)

        # hidden input for first gru layer
        preActx1 = math_ops.matmul(state, Ux)
        preActx1 *= r1
        preActx1 += emb2hidden

        h1 = math_ops.tanh(preActx1)

        h1 = u1 * state + (1. - u1) * h1
        h1 = mask_slice * h1 + (1. - mask_slice) * state

        # attention
        pstate_ = math_ops.matmul(h1, W_comb_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        pctx__ = math_ops.tanh(pctx__)

        pctx_2d = tf.reshape(pctx__, [-1, tf.shape(pctx__)[2]])
        alpha = math_ops.matmul(pctx_2d, U_att) + c_tt
        #alpha = math_ops.matmul(pctx__, U_att) + c_tt
        alpha = tf.reshape(alpha, [nlocation, nsamples])
        alpha = math_ops.exp(alpha)

        if context_mask is not None:
            alpha = alpha * context_mask

        alpha = alpha / tf.reduce_sum(alpha, 0, keep_dims=True)
        ctx_ = tf.reduce_sum(context * alpha[:, :, None], 0)

        preAct2 = math_ops.matmul(h1, U_nl) + b_nl
        preAct2 += math_ops.matmul(ctx_, Wc)
        preAct2 = math_ops.sigmoid(preAct2)

        r2, u2 = array_ops.split(preAct2, 2, 1)

        preActx2 = math_ops.matmul(h1, Ux_nl) + bx_nl
        preActx2 *= r2
        preActx2 += math_ops.matmul(ctx_, Wcx)

        h2 = math_ops.tanh(preActx2)

        h2 = u2 * h1 + (1. - u2) * h2
        h2 = mask_slice * h2 + (1. - mask_slice) * h1

        output = tf.concat(axis=1, values=[h2, ctx_])

        return output, h2
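Each of the two GRU layers in __call__ above follows the standard GRU update: the reset and update gates come from W/U/b, and the candidate state from Wx/Ux/bx with the reset gate applied to the recurrent term. A minimal NumPy sketch of one such layer, with the attention and masking left out; the weight names follow the code, shapes are illustrative.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h_prev, W, U, b, Wx, Ux, bx):
    # Reset and update gates, then the candidate hidden state.
    r, u = np.split(sigmoid(x @ W + h_prev @ U + b), 2, axis=1)
    h_tilde = np.tanh(x @ Wx + r * (h_prev @ Ux) + bx)
    return u * h_prev + (1.0 - u) * h_tilde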