Example #1
    def setup_but_because(self):
        # For Erin: this is the MODEL!!!
        # seqA: but, seqB: because, this will learn to differentiate them
        seqA_w_matrix, seqA_c_vec = self.encoder.encode(
            self.seqA_inputs, self.seqA_mask, temp_max=FLAGS.temp_max)
        seqB_w_matrix, seqB_c_vec = self.encoder.encode(
            self.seqB_inputs,
            self.seqB_mask,
            reuse=True,
            temp_max=FLAGS.temp_max)

        self.seqA_rep = seqA_c_vec
        self.seqB_rep = seqB_c_vec

        # for now we just use context vector
        # we create additional perspectives

        # seqA_c_vec: (batch_size, hidden_size)
        persA_B_mul = seqA_c_vec * seqB_c_vec
        persA_B_sub = seqA_c_vec - seqB_c_vec
        persA_B_avg = (seqA_c_vec + seqB_c_vec) / 2.0

        # logits is [batch_size, label_size]
        if FLAGS.abs:
            persA_B_sub = tf.abs(seqA_c_vec - seqB_c_vec)
            self.logits = rnn_cell._linear(
                [seqA_c_vec, seqB_c_vec, persA_B_mul, persA_B_sub],
                self.label_size,
                bias=True)
        else:
            self.logits = rnn_cell._linear([
                seqA_c_vec, seqB_c_vec, persA_B_mul, persA_B_sub, persA_B_avg
            ],
                                           self.label_size,
                                           bias=True)
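
The block above combines the two sentence encodings into extra "perspectives" (element-wise product, (absolute) difference, average) and feeds the concatenation through a single linear layer. The following is a minimal NumPy sketch of that feature construction, not taken from the project; all shapes and weights are made up.

import numpy as np

batch_size, hidden_size, label_size = 4, 8, 3
rng = np.random.default_rng(0)

# Stand-ins for seqA_c_vec and seqB_c_vec produced by the encoder above.
seqA = rng.standard_normal((batch_size, hidden_size))
seqB = rng.standard_normal((batch_size, hidden_size))

# The extra "perspectives" built in setup_but_because().
pers_mul = seqA * seqB                  # element-wise interaction
pers_sub = np.abs(seqA - seqB)          # the FLAGS.abs branch
pers_avg = (seqA + seqB) / 2.0

# rnn_cell._linear concatenates its inputs and applies one dense layer;
# here the same thing with an explicit weight matrix and bias.
features = np.concatenate([seqA, seqB, pers_mul, pers_sub, pers_avg], axis=1)
W = rng.standard_normal((features.shape[1], label_size)) * 0.01
b = np.zeros(label_size)
logits = features @ W + b               # (batch_size, label_size)
print(logits.shape)                     # (4, 3)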
Example #2
    def build_model(self):
        target_v = tf.placeholder(tf.float32, [None])
        conversation_d = tf.placeholder(tf.int32, [None, None])
        conversation_g = tf.placeholder(tf.int32, [None, None])
        conversation_len = tf.placeholder(tf.int32, [None])
        last_description = tf.placeholder(tf.int32, [None])
        target_word = tf.placeholder(tf.int32, [None])
        i_know = tf.placeholder(tf.int32, [None])

        word_embeddings = tf.get_variable('word_embed',
                                          initializer=tf.convert_to_tensor(
                                              self.data_loader.embeddings,
                                              dtype=tf.float32),
                                          trainable=False)
        conv_d = tf.nn.embedding_lookup(word_embeddings, conversation_d)
        conv_g = tf.nn.embedding_lookup(word_embeddings, conversation_g)
        know = tf.nn.embedding_lookup(word_embeddings, i_know)
        conv = tf.concat(2, [conv_g, conv_d])

        with tf.variable_scope('guesser'):
            last_des = tf.nn.embedding_lookup(word_embeddings,
                                              last_description)
            gue_cell = rnn_cell.GRUCell(self.hidden_units)
            _, gue_state = rnn.dynamic_rnn(gue_cell,
                                           conv,
                                           conversation_len,
                                           dtype=tf.float32)
            gue_repr = tf.tanh(
                rnn_cell._linear([gue_state, last_des], self.final_units,
                                 True))
            gue_core = tf.get_variable('gue_core',
                                       [self.final_units, self.embedding_size])
            gue_ready = tf.matmul(gue_repr, gue_core)
            guesser_value = tf.reduce_sum(tf.mul(gue_ready, know), 1)
            gue_pred = tf.matmul(gue_ready, word_embeddings, transpose_b=True)
            _guess_ = tf.nn.top_k(gue_pred, self.vocab_size)

        with tf.variable_scope('describer'):
            target = tf.nn.embedding_lookup(word_embeddings, target_word)
            des_cell = Contextual_GRUCell(self.hidden_units)
            _, des_state = contextual_rnn(des_cell,
                                          conv,
                                          target,
                                          conversation_len,
                                          dtype=tf.float32)
            des_repr = tf.tanh(
                rnn_cell._linear([des_state, target], self.final_units, True))
            des_core = tf.get_variable('des_core',
                                       [self.final_units, self.embedding_size])
            des_ready = tf.matmul(des_repr, des_core)
            describer_value = tf.reduce_sum(tf.mul(des_ready, know), 1)
            des_pred = tf.matmul(des_ready, word_embeddings, transpose_b=True)
            _description_ = tf.nn.top_k(des_pred, self.vocab_size)

        optimizer = tf.train.GradientDescentOptimizer(self.step_size)
        update_guesser_op = optimizer.minimize(
            tf.reduce_sum(tf.square(target_v - guesser_value)))
        update_describer_op = optimizer.minimize(
            tf.reduce_sum(tf.square(target_v - describer_value)))
        return (update_guesser_op, update_describer_op, guesser_value,
                describer_value, target_v, i_know, conversation_d,
                conversation_g, conversation_len, last_description,
                target_word, _guess_, _description_)
Example #3
  def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    
    with vs.variable_scope(scope or type(self).__name__):
      if self._dropMaskInput.get_shape()[1:] != inputs.get_shape()[1:]:
        print("error: "+str(self._dropMaskInput.get_shape()[1:])+" != "+str(inputs.get_shape()[1:]))
        assert(False)
      if self._dropMaskState.get_shape()[1:] != state.get_shape()[1:]:
        print("error: "+str(self._dropMaskState.get_shape()[1:])+" != "+str(state.get_shape()[1:]))
        assert(False)
      dropin = tf.mul(self._dropMaskInput, inputs)
      dropst = tf.mul(self._dropMaskState, state)

      with vs.variable_scope("Gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        concat = rnn_cell._linear([dropin, dropst], 2 * self._num_units, True, 1.0)
        r, u = tf.split(1, 2, concat)
        r, u = tf.sigmoid(r), tf.sigmoid(u)

      with vs.variable_scope("Candidate"):
        htilda = self._activation(rnn_cell._linear([dropin, r * dropst], self._num_units, True))

      new_h = u * dropst + (1 - u) * htilda

    return new_h, new_h
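
Example #3 applies pre-sampled dropout masks to both the input and the previous state before the usual GRU equations. Below is a small NumPy sketch of one such GRU step, not from the original project; sizes, masks, and weight initializations are made up.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

batch, in_dim, units = 2, 5, 4
rng = np.random.default_rng(1)

x = rng.standard_normal((batch, in_dim))
h = np.zeros((batch, units))

# Pre-sampled dropout masks, as in self._dropMaskInput / self._dropMaskState.
keep = 0.8
mask_x = (rng.random((batch, in_dim)) < keep) / keep
mask_h = (rng.random((batch, units)) < keep) / keep
drop_x, drop_h = mask_x * x, mask_h * h

# One weight matrix for both gates, mirroring _linear([dropin, dropst], 2*units, ...).
W_g = rng.standard_normal((in_dim + units, 2 * units)) * 0.1
b_g = np.ones(2 * units)                # bias of 1.0: "not reset and not update"
gates = np.concatenate([drop_x, drop_h], axis=1) @ W_g + b_g
r, u = np.split(sigmoid(gates), 2, axis=1)

W_c = rng.standard_normal((in_dim + units, units)) * 0.1
htilda = np.tanh(np.concatenate([drop_x, r * drop_h], axis=1) @ W_c)
new_h = u * drop_h + (1 - u) * htilda
print(new_h.shape)                      # (2, 4)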
Example #4
    def __call__(self, inputs, state, scope=None):
        gru_out, gru_state = super(GRUAttnCell,
                                   self).__call__(inputs, state, scope)
        with vs.variable_scope(scope or type(self).__name__):
            with vs.variable_scope("Attn"):
                ht = rnn_cell._linear(gru_out, self._num_units, True, 1.0)
                ht = tf.expand_dims(ht, axis=1)
            scores = tf.reduce_sum(self.hs * ht,
                                   reduction_indices=2,
                                   keep_dims=True)

            # New stuff
            scores = tf.exp(
                scores -
                tf.reduce_max(scores, reduction_indices=1, keep_dims=True))
            scores = scores / (1e-6 + tf.reduce_sum(
                scores, reduction_indices=1, keep_dims=True))

            context = tf.reduce_sum(self.hs * scores, reduction_indices=1)
            with vs.variable_scope("AttnConcat"):
                out = tf.nn.relu(
                    rnn_cell._linear([context, gru_out], self._num_units, True,
                                     1.0))

        return out, out
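
The "New stuff" block in Example #4 is a numerically stable softmax over the time axis: subtract the per-example maximum before exponentiating, then normalize with a small epsilon. A minimal NumPy sketch of that attention normalization, with illustrative shapes:

import numpy as np

batch, time, units = 2, 7, 4
rng = np.random.default_rng(2)

hs = rng.standard_normal((batch, time, units))    # encoder states self.hs
ht = rng.standard_normal((batch, 1, units))       # projected decoder state

scores = np.sum(hs * ht, axis=2, keepdims=True)             # (batch, time, 1)
scores = np.exp(scores - scores.max(axis=1, keepdims=True)) # stable exp
scores = scores / (1e-6 + scores.sum(axis=1, keepdims=True))

context = np.sum(hs * scores, axis=1)                       # (batch, units)
print(context.shape)                                        # (2, 4)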
Example #5
  def hyper_norm(self, layer, dimensions, scope="hyper"):
    with tf.variable_scope(scope):
      zw = rnn_cell._linear(self.hyper_output,
                            self.hyper_embedding_size, False, scope=scope+ "z")
      alpha = rnn_cell._linear(zw, dimensions, False, scope=scope+ "alpha")
      result = tf.mul(alpha, layer)

      return result
Example #6
  def hyper_norm(self, layer, dimensions, scope="hyper"):
    with tf.variable_scope(scope):
      zw = rnn_cell._linear(self.hyper_output,
                            self.hyper_embedding_size, False, scope=scope+ "z")
      alpha = rnn_cell._linear(zw, dimensions, False, scope=scope+ "alpha")
      result = tf.mul(alpha, layer)

      return result
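
hyper_norm projects the hypernetwork's output down to a small embedding and back up to a per-unit scale alpha that multiplies the main cell's pre-activations. A rough NumPy sketch of that rescaling, with made-up sizes and weights:

import numpy as np

batch, hyper_units, hyper_embed, dims = 2, 16, 4, 32
rng = np.random.default_rng(3)

hyper_output = rng.standard_normal((batch, hyper_units))
layer = rng.standard_normal((batch, dims))         # e.g. a 4*num_units pre-activation

# Two bias-free linear maps, mirroring the two _linear calls above.
W_z = rng.standard_normal((hyper_units, hyper_embed)) * 0.1
W_alpha = rng.standard_normal((hyper_embed, dims)) * 0.1

zw = hyper_output @ W_z                            # (batch, hyper_embed)
alpha = zw @ W_alpha                               # (batch, dims)
result = alpha * layer                             # element-wise rescaling
print(result.shape)                                # (2, 32)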
Example #7
 def attention(input_t, output_t_minus_1, time):
     with tf.variable_scope('attention'):
         VxS = tf.reshape(rnn_cell._linear(output_t_minus_1, self.attention_judge_size, True),
                          [-1, 1, 1, self.attention_judge_size])  # batch_size x 1 x 1 x attention
     _exp = tf.exp(tf.reduce_sum(attention_V * tf.tanh(WxH + VxS), [3]))  # batch_size x source_len x 1
     _exp = _exp * tf.expand_dims(self.mask, -1)
     attention_weight = _exp / tf.reduce_sum(_exp, [1], keep_dims=True)
     attention_t = tf.reduce_sum(encoder_outputs * attention_weight, [1])
     feed_in_t = tf.tanh(rnn_cell._linear([attention_t, input_t], self.embedding_size, True))
     return feed_in_t
Example #8
 def __call__(self, inputs, state, scope=None):
     gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
     with vs.variable_scope(scope or type(self).__name__):
         with vs.variable_scope("Attn2"):
             gamma_h = tf.nn.tanh(rnn_cell._linear(gru_out, self._num_units, True, 1.0))
         weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
         weights = tf.nn.softmax(weights, dim=1)
         context = tf.reduce_sum(self.hs * weights, reduction_indices=1)
         with vs.variable_scope("AttnConcat"):
             out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0))
         return (out, out)
Example #9
 def __call__(self, inputs, state, scope=None):
   """Gated recurrent unit (GRU) with nunits cells."""
   with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"                 
     with vs.variable_scope("Gates"):  # Reset gate and update gate.                  
       # We start with bias of 1.0 to not reset and not update.                       
       r, u = array_ops.split(1, 2, rnn_cell._linear([inputs, state],
                                           2 * self._num_units, True, 1.0))
       r, u = tf.sigmoid(r), tf.sigmoid(u)
     with vs.variable_scope("Candidate"):
       c = self._activation(rnn_cell._linear([inputs, r * state], self._num_units, True))
     new_h = u * state + (1 - u) * c
   return new_h, new_h
Example #10
 def __call__(self, inputs, state, scope=None):
   gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn2"):
       gamma_h = tanh(rnn_cell._linear(gru_out, self._num_units, True, 1.0))
     weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
     weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
     weights = weights / (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True))
     context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
     with vs.variable_scope("AttnConcat"):
       out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0))
     self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
     return (out, out) 
Example #11
 def __call__(self, inputs, state, scope=None):
   gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn2"):
       gamma_h = tanh(rnn_cell._linear(gru_out, self._num_units, True, 1.0))
     weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True)
     weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
     weights = weights / (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True))
     context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
     with vs.variable_scope("AttnConcat"):
       out = tf.nn.relu(rnn_cell._linear([context, gru_out], self._num_units, True, 1.0))
     self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
     return (out, out)
Example #12
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM) with hypernetworks and layer normalization."""
    with vs.variable_scope(scope or type(self).__name__):
      # Parameters of gates are concatenated into one multiply for efficiency.
      total_h, total_c = tf.split(1, 2, state)
      h = total_h[:, 0:self._num_units]
      c = total_c[:, 0:self._num_units]

      self.hyper_state = tf.concat(1, [total_h[:, self._num_units:], total_c[:, self._num_units:]])
      hyper_input = tf.concat(1, [inputs, h])
      hyper_output, hyper_new_state = self.hyper_cell(hyper_input, self.hyper_state)
      self.hyper_output = hyper_output
      self.hyper_state = hyper_new_state

      input_below_ = rnn_cell._linear([inputs],
                                      4 * self._num_units, False, scope="out_1")
      input_below_ = self.hyper_norm(input_below_, 4 * self._num_units, scope="hyper_x")
      state_below_ = rnn_cell._linear([h],
                                      4 * self._num_units, False, scope="out_2")
      state_below_ = self.hyper_norm(state_below_, 4 * self._num_units, scope="hyper_h")

      if self.is_layer_norm:
        s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
        s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
        s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32)

        b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
        b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
        b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32)


        input_below_ = ln(input_below_, s1, b1)


        state_below_ = ln(state_below_, s2, b2)

      lstm_matrix = tf.add(input_below_, state_below_)
      i, j, f, o = array_ops.split(1, 4, lstm_matrix)
      new_c = (c * sigmoid(f) + sigmoid(i) *
               self._activation(j))

      # Currently normalizing c causes a lot of NaNs in the model, so it is commented out for now.
      # new_c_ = ln(new_c, s3, b3)
      new_c_ = new_c
      new_h = self._activation(new_c_) * sigmoid(o)

      hyper_h, hyper_c = tf.split(1, 2, hyper_new_state)
      new_total_h = tf.concat(1, [new_h, hyper_h])
      new_total_c = tf.concat(1, [new_c, hyper_c])
      new_total_state = tf.concat(1, [new_total_h, new_total_c])
      return new_h, new_total_state
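
Example #12 (and several of the cells that follow) calls a helper ln(x, s, b) that is not shown. A plausible NumPy version is sketched below (the project's actual implementation may differ): normalize each row to zero mean and unit variance over the feature axis, then apply the learned scale s and shift b.

import numpy as np

def ln(x, s, b, eps=1e-5):
    """Layer normalization over the feature axis with scale s and shift b."""
    mean = x.mean(axis=1, keepdims=True)
    var = x.var(axis=1, keepdims=True)
    return s * (x - mean) / np.sqrt(var + eps) + b

x = np.random.default_rng(4).standard_normal((3, 8))
s, b = np.ones(8), np.zeros(8)
y = ln(x, s, b)
print(y.mean(axis=1))   # ~0 per row
print(y.std(axis=1))    # ~1 per row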
Example #13
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM) with hypernetworks and layer normalization."""
    with vs.variable_scope(scope or type(self).__name__):
      # Parameters of gates are concatenated into one multiply for efficiency.
      total_h, total_c = tf.split(1, 2, state)
      h = total_h[:, 0:self._num_units]
      c = total_c[:, 0:self._num_units]

      self.hyper_state = tf.concat(1, [total_h[:, self._num_units:], total_c[:, self._num_units:]])
      hyper_input = tf.concat(1, [inputs, h])
      hyper_output, hyper_new_state = self.hyper_cell(hyper_input, self.hyper_state)
      self.hyper_output = hyper_output
      self.hyper_state = hyper_new_state

      input_below_ = rnn_cell._linear([inputs],
                                      4 * self._num_units, False, scope="out_1")
      input_below_ = self.hyper_norm(input_below_, 4 * self._num_units, scope="hyper_x")
      state_below_ = rnn_cell._linear([h],
                                      4 * self._num_units, False, scope="out_2")
      state_below_ = self.hyper_norm(state_below_, 4 * self._num_units, scope="hyper_h")

      if self.is_layer_norm:
        s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
        s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
        s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32)

        b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
        b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
        b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32)


        input_below_ = ln(input_below_, s1, b1)


        state_below_ = ln(state_below_, s2, b2)

      lstm_matrix = tf.add(input_below_, state_below_)
      i, j, f, o = array_ops.split(1, 4, lstm_matrix)
      new_c = (c * sigmoid(f) + sigmoid(i) *
               self._activation(j))

      # Currently normalizing c causes a lot of NaNs in the model, so it is commented out for now.
      # new_c_ = ln(new_c, s3, b3)
      new_c_ = new_c
      new_h = self._activation(new_c_) * sigmoid(o)

      hyper_h, hyper_c = tf.split(1, 2, hyper_new_state)
      new_total_h = tf.concat(1, [new_h, hyper_h])
      new_total_c = tf.concat(1, [new_c, hyper_c])
      new_total_state = tf.concat(1, [new_total_h, new_total_c])
      return new_h, new_total_state
Example #14
  def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    dim = self._num_units
    with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
      with vs.variable_scope("Gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        with vs.variable_scope( "Layer_Parameters"):

          s1 = vs.get_variable("s1", initializer=tf.ones([2*dim]), dtype=tf.float32)
          s2 = vs.get_variable("s2", initializer=tf.ones([2*dim]), dtype=tf.float32)
          s3 = vs.get_variable("s3", initializer=tf.ones([dim]), dtype=tf.float32)
          s4 = vs.get_variable("s4", initializer=tf.ones([dim]), dtype=tf.float32)
          b1 = vs.get_variable("b1", initializer=tf.zeros([2*dim]), dtype=tf.float32)
          b2 = vs.get_variable("b2", initializer=tf.zeros([2*dim]), dtype=tf.float32)
          b3 = vs.get_variable("b3", initializer=tf.zeros([dim]), dtype=tf.float32)
          b4 = vs.get_variable("b4", initializer=tf.zeros([dim]), dtype=tf.float32)


          # Code below initialized for all cells
          # s1 = tf.Variable(tf.ones([2 * dim]), name="s1")
          # s2 = tf.Variable(tf.ones([2 * dim]), name="s2")
          # s3 = tf.Variable(tf.ones([dim]), name="s3")
          # s4 = tf.Variable(tf.ones([dim]), name="s4")
          # b1 = tf.Variable(tf.zeros([2 * dim]), name="b1")
          # b2 = tf.Variable(tf.zeros([2 * dim]), name="b2")
          # b3 = tf.Variable(tf.zeros([dim]), name="b3")
          # b4 = tf.Variable(tf.zeros([dim]), name="b4")

        input_below_ = rnn_cell._linear([inputs],
                               2 * self._num_units, False, scope="out_1")
        input_below_ = ln(input_below_, s1, b1)
        state_below_ = rnn_cell._linear([state],
                               2 * self._num_units, False, scope="out_2")
        state_below_ = ln(state_below_, s2, b2)
        out = tf.add(input_below_, state_below_)
        r, u = array_ops.split(1, 2, out)
        r, u = sigmoid(r), sigmoid(u)

      with vs.variable_scope("Candidate"):
          input_below_x = rnn_cell._linear([inputs],
                                           self._num_units, False, scope="out_3")
          input_below_x = ln(input_below_x, s3, b3)
          state_below_x = rnn_cell._linear([state],
                                           self._num_units, False, scope="out_4")
          state_below_x = ln(state_below_x, s4, b4)
          c_pre = tf.add(input_below_x, r * state_below_x)
          c = self._activation(c_pre)
      new_h = u * state + (1 - u) * c
    return new_h, new_h
Example #15
  def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    dim = self._num_units
    with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
      with vs.variable_scope("Gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
        with vs.variable_scope( "Layer_Parameters"):

          s1 = vs.get_variable("s1", initializer=tf.ones([2*dim]), dtype=tf.float32)
          s2 = vs.get_variable("s2", initializer=tf.ones([2*dim]), dtype=tf.float32)
          s3 = vs.get_variable("s3", initializer=tf.ones([dim]), dtype=tf.float32)
          s4 = vs.get_variable("s4", initializer=tf.ones([dim]), dtype=tf.float32)
          b1 = vs.get_variable("b1", initializer=tf.zeros([2*dim]), dtype=tf.float32)
          b2 = vs.get_variable("b2", initializer=tf.zeros([2*dim]), dtype=tf.float32)
          b3 = vs.get_variable("b3", initializer=tf.zeros([dim]), dtype=tf.float32)
          b4 = vs.get_variable("b4", initializer=tf.zeros([dim]), dtype=tf.float32)


          # Code below initialized for all cells
          # s1 = tf.Variable(tf.ones([2 * dim]), name="s1")
          # s2 = tf.Variable(tf.ones([2 * dim]), name="s2")
          # s3 = tf.Variable(tf.ones([dim]), name="s3")
          # s4 = tf.Variable(tf.ones([dim]), name="s4")
          # b1 = tf.Variable(tf.zeros([2 * dim]), name="b1")
          # b2 = tf.Variable(tf.zeros([2 * dim]), name="b2")
          # b3 = tf.Variable(tf.zeros([dim]), name="b3")
          # b4 = tf.Variable(tf.zeros([dim]), name="b4")

        input_below_ = rnn_cell._linear([inputs],
                               2 * self._num_units, False, scope="out_1")
        input_below_ = ln(input_below_, s1, b1)
        state_below_ = rnn_cell._linear([state],
                               2 * self._num_units, False, scope="out_2")
        state_below_ = ln(state_below_, s2, b2)
        out = tf.add(input_below_, state_below_)
        r, u = array_ops.split(1, 2, out)
        r, u = sigmoid(r), sigmoid(u)

      with vs.variable_scope("Candidate"):
          input_below_x = rnn_cell._linear([inputs],
                                           self._num_units, False, scope="out_3")
          input_below_x = ln(input_below_x, s3, b3)
          state_below_x = rnn_cell._linear([state],
                                           self._num_units, False, scope="out_4")
          state_below_x = ln(state_below_x, s4, b4)
          c_pre = tf.add(input_below_x, r * state_below_x)
          c = self._activation(c_pre)
      new_h = u * state + (1 - u) * c
    return new_h, new_h
Example #16
 def __call__(self, inputs, context, state, scope=None):
     """Contextual Gated recurrent unit (CGRU) with nunits cells."""
     with vs.variable_scope(scope or type(self).__name__):
         with vs.variable_scope("Gates"):
             r, u = array_ops.split(
                 1, 2,
                 rnn_cell._linear([inputs, context, state],
                                  2 * self._num_units, True, 1.0))
             r, u = sigmoid(r), sigmoid(u)
         with vs.variable_scope("Candidate"):
             c = self._activation(
                 rnn_cell._linear([inputs, context, r * state],
                                  self._num_units, True))
         new_h = u * state + (1 - u) * c
     return new_h, new_h
Example #17
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(1, 2, state)

      s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32)

      b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32)

      # s1 = tf.Variable(tf.ones([4 * self._num_units]), name="s1")
      # s2 = tf.Variable(tf.ones([4 * self._num_units]), name="s2")
      # s3 = tf.Variable(tf.ones([self._num_units]), name="s3")
      #
      # b1 = tf.Variable(tf.zeros([4 * self._num_units]), name="b1")
      # b2 = tf.Variable(tf.zeros([4 * self._num_units]), name="b2")
      # b3 = tf.Variable(tf.zeros([self._num_units]), name="b3")

      input_below_ = rnn_cell._linear([inputs],
                                      4 * self._num_units, False, scope="out_1")
      input_below_ = ln(input_below_, s1, b1)
      state_below_ = rnn_cell._linear([h],
                                      4 * self._num_units, False, scope="out_2")
      state_below_ = ln(state_below_, s2, b2)
      lstm_matrix = tf.add(input_below_, state_below_)

      i, j, f, o = array_ops.split(1, 4, lstm_matrix)

      new_c = (c * sigmoid(f) + sigmoid(i) *
               self._activation(j))

      # Currently normalizing c causes a lot of NaNs in the model, so it is commented out for now.
      # new_c_ = ln(new_c, s3, b3)
      new_c_ = new_c
      new_h = self._activation(new_c_) * sigmoid(o)

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat(1, [new_c, new_h])
      return new_h, new_state
Example #18
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(1, 2, state)

      s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32)

      b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32)

      # s1 = tf.Variable(tf.ones([4 * self._num_units]), name="s1")
      # s2 = tf.Variable(tf.ones([4 * self._num_units]), name="s2")
      # s3 = tf.Variable(tf.ones([self._num_units]), name="s3")
      #
      # b1 = tf.Variable(tf.zeros([4 * self._num_units]), name="b1")
      # b2 = tf.Variable(tf.zeros([4 * self._num_units]), name="b2")
      # b3 = tf.Variable(tf.zeros([self._num_units]), name="b3")

      input_below_ = rnn_cell._linear([inputs],
                                      4 * self._num_units, False, scope="out_1")
      input_below_ = ln(input_below_, s1, b1)
      state_below_ = rnn_cell._linear([h],
                                      4 * self._num_units, False, scope="out_2")
      state_below_ = ln(state_below_, s2, b2)
      lstm_matrix = tf.add(input_below_, state_below_)

      i, j, f, o = array_ops.split(1, 4, lstm_matrix)

      new_c = (c * sigmoid(f) + sigmoid(i) *
               self._activation(j))

      # Currently normalizing c causes a lot of NaNs in the model, so it is commented out for now.
      # new_c_ = ln(new_c, s3, b3)
      new_c_ = new_c
      new_h = self._activation(new_c_) * sigmoid(o)

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat(1, [new_c, new_h])
      return new_h, new_state
Example #19
 def attention(query, use_attention=False):
     """Put attention masks on hidden using hidden_features and query."""
     attn_weights = []
     ds = []  # Results of attention reads will be stored here.
     for i in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % i):
             y = rnn_cell._linear(query, attention_vec_size, True)
             y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             s = math_ops.reduce_sum(
                 v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
             if use_attention is False:  # apply mean pooling
                 weights = tf.tile(sequence_length,
                                   tf.pack([attn_length]))
                 weights = array_ops.reshape(weights, tf.shape(s))
                 a = array_ops.ones(
                     tf.shape(s),
                     dtype=dtype) / math_ops.to_float(weights)
                 # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
             else:
                 a = nn_ops.softmax(s)
             attn_weights.append(a)
             # Now calculate the attention-weighted vector d.
             d = math_ops.reduce_sum(
                 array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                 [1, 2])
             ds.append(array_ops.reshape(d, [-1, attn_size]))
     return attn_weights, ds
Example #20
 def attention(query, use_attention=False):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       if use_attention is False: # apply mean pooling
           weights = tf.tile(sequence_length, tf.pack([attn_length]))
           weights = array_ops.reshape(weights, tf.shape(s))
           a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
           # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
       else:
         a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
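
When use_attention is False, Examples #19 and #20 fall back to mean pooling: every position gets weight 1/sequence_length instead of a softmax of the scores. The contrast, in a small NumPy sketch with made-up scores and lengths (masking of padded steps is handled elsewhere in the original code):

import numpy as np

scores = np.array([[0.2, 1.5, -0.3, 0.0],
                   [2.0, 0.1,  0.4, 1.0]])   # s: (batch, attn_length)
seq_len = np.array([3, 4])                   # valid positions per example

# use_attention=True: ordinary softmax over the scores.
e = np.exp(scores - scores.max(axis=1, keepdims=True))
softmax_w = e / e.sum(axis=1, keepdims=True)

# use_attention=False: uniform 1/length weights (mean pooling), as in the
# ones(...) / to_float(weights) line above.
mean_w = np.ones_like(scores) / seq_len[:, None].astype(float)

print(softmax_w.sum(axis=1))   # [1. 1.]
print(mean_w[0])               # [0.3333 0.3333 0.3333 0.3333]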
Example #21
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           scope=None,
           squeeze=False,
           wd=0.0,
           input_keep_prob=1.0,
           is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    flat_out = _linear(flat_args,
                       output_size,
                       bias,
                       bias_start=bias_start,
                       scope=scope)
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_wd(wd)

    return out
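
Example #21 wraps _linear so it works on tensors with arbitrary leading dimensions: flatten everything but the last axis, apply the dense layer, then restore the leading shape (flatten, reconstruct, and add_wd are project helpers not shown). Below is a rough NumPy sketch of that flatten/apply/reconstruct pattern; the helper names here are hypothetical.

import numpy as np

def flatten_keep_last(x):
    """Collapse all leading axes so a dense layer can be applied row-wise."""
    lead_shape = x.shape[:-1]
    return x.reshape(-1, x.shape[-1]), lead_shape

def reconstruct(flat, lead_shape):
    return flat.reshape(*lead_shape, -1)

rng = np.random.default_rng(5)
x = rng.standard_normal((2, 7, 5))           # (batch, time, features)
W = rng.standard_normal((5, 3)) * 0.1
b = np.zeros(3)

flat, lead = flatten_keep_last(x)            # (14, 5)
out = reconstruct(flat @ W + b, lead)        # (2, 7, 3)
print(out.shape)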
Example #22
 def __call__(self, inputs, state, scope=None):
   """Most basic RNN: output = new_state = activation(W * input + U * state + B + noise)."""
   with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"
     z = rnn_cell._linear([inputs, state], self._num_units, True)
     z += random_normal(shape(z), stddev=self._stddev)
     output = self._activation(z)
   return output, output
Example #23
 def __call__(self, inputs, state, scope=None):
     """Gated recurrent unit (GRU) with nunits cells."""
     with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
         with vs.variable_scope("Gates"):  # Reset gate and update gate.
             # We start with bias of 1.0 to not reset and not update.
             r, u = array_ops.split(
                 1, 2,
                 rnn_cell._linear([inputs, state], 2 * self._num_units,
                                  True, 1.0))
             r, u = tf.sigmoid(r), tf.sigmoid(u)
         with vs.variable_scope("Candidate"):
             c = self._activation(
                 rnn_cell._linear([inputs, r * state], self._num_units,
                                  True))
         new_h = u * state + (1 - u) * c
     return new_h, new_h
Example #24
    def __call__(self, inputs, state, d_act, scope=None):
        """Long short-term memory cell (LSTM)."""
        with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
            if self._state_is_tuple:
                c, h = state
            else:
                try:
                    c, h = array_ops.split(1, 2, state)
                except:
                    c, h = array_ops.split(state, 2, 1)
            concat = _linear([inputs, h], 4 * self._num_units, True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            try:
                i, j, f, o = array_ops.split(1, 4, concat)
            except:
                i, j, f, o = array_ops.split(concat, 4, 1)
            
            w_d = vs.get_variable('w_d', [self.key_words_voc_size, self._num_units])
            
            new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
                    self._activation(j)) + tf.tanh(tf.matmul(d_act, w_d))
            new_h = self._activation(new_c) * sigmoid(o)

            if self._state_is_tuple:
                new_state = LSTMStateTuple(new_c, new_h)
            else:
                try:
                    new_state = array_ops.concat(1, [new_c, new_h])
                except:
                    new_state = array_ops.concat([new_c, new_h], 1)
            return new_h, new_state
Example #25
    def _multi_head(self, queries, keys, query_mask, key_mask, num_heads, block_feature=False, scope='multihead', reuse=None):
        with vs.variable_scope(scope, reuse=reuse):
            # batch_size * seq_size_q * num_units
            Q = rnn_cell._linear(tf.reshape(queries,
                                            [-1, self.num_units]),
                                 self.num_units, True, 1.0, scope='Q')
            Q = tf.reshape(Q, tf.shape(queries))
            # batch_size * seq_size_k * num_units
            K = rnn_cell._linear(tf.reshape(keys,
                                            [-1, self.num_units]),
                                 self.num_units, True, 1.0, scope='K')
            K = tf.reshape(K, tf.shape(keys))
            V = rnn_cell._linear(tf.reshape(keys,
                                            [-1, self.num_units]),
                                 self.num_units, True, 1.0, scope='V')
            V = tf.reshape(V, tf.shape(keys))
            Q_ = tf.pack(tf.split(2, num_heads, Q))  # num_heads *  batch_size * seq_size_q *num_units/num_heads
            K_ = tf.pack(tf.split(2, num_heads, K))  # num_heads * batch_size * seq_size_k * num_units/num_heads
            V_ = tf.pack(tf.split(2, num_heads, V))  # num_heads * batch_size * seq_size_k * num_units/num_heads
            len_q = tf.shape(queries)[1]
            len_k = tf.shape(keys)[1]

            # Compute weight
            weights = tf.batch_matmul(Q_, tf.transpose(K_, [0,1,3,2])) \
                      / ((self.num_units/num_heads) ** 0.5)    # num_heads * batch_size * seq_size_q * seq_size_k
            key_mask = tf.tile(tf.reshape(key_mask, [1, -1, 1, len_k]), [num_heads, 1, len_q, 1])
            weights = tf.select(key_mask, weights, tf.ones_like(weights) * (-2**32 + 1))

            if block_feature:
                diag_vals = tf.ones_like(weights[0, 0, :, :]) # seq_size_q * seq_size_k
                mask = tf.cast(tf.batch_matrix_band_part(diag_vals, -1, 0), tf.bool)
                mask = tf.tile(tf.reshape(mask, [1, 1, len_q, len_k]), [num_heads, tf.shape(queries)[0], 1, 1])
                weights = tf.select(mask, weights, tf.ones_like(weights) * (-2 ** 32 + 1))

            weights = tf.reshape(tf.nn.softmax(tf.reshape(weights, [-1, len_k])),
                                 [num_heads, -1, len_q, len_k])
            # num_heads * batch_size * seq_size_q * num_units/num_heads
            ctx = tf.batch_matmul(weights,  V_)

            ctx *= tf.reshape(tf.cast(query_mask, tf.float32), [-1, len_q, 1]) # num_heads * batch_size * seq_size_q * num_units/num_heads
            ctx = tf.concat(2, tf.unpack(ctx))  # batch_size * seq_size_q * num_units
            ctx = rnn_cell._linear(tf.reshape(ctx, [-1, self.num_units]), self.num_units, True, 1.0, scope='context')
            ctx = tf.reshape(ctx, [-1, len_q, self.num_units])
            drop_ctx = tf.nn.dropout(ctx, keep_prob=self.keep_prob)
            # Add and Normalization
            res = layer_normalization(drop_ctx + queries)
        return  res, weights
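
_multi_head above is scaled dot-product attention with masking, written against the pre-1.0 TensorFlow API (tf.batch_matmul, tf.select, tf.pack). A minimal single-head, single-example NumPy sketch of the core computation; shapes and the mask are illustrative.

import numpy as np

def scaled_dot_attention(Q, K, V, key_mask):
    """Q: (len_q, d), K/V: (len_k, d), key_mask: (len_k,) boolean."""
    d = Q.shape[-1]
    weights = Q @ K.T / np.sqrt(d)                        # (len_q, len_k)
    weights = np.where(key_mask[None, :], weights, -2.0 ** 32 + 1)
    weights = np.exp(weights - weights.max(axis=1, keepdims=True))
    weights = weights / weights.sum(axis=1, keepdims=True)
    return weights @ V                                    # (len_q, d)

rng = np.random.default_rng(9)
len_q, len_k, d = 3, 5, 8
Q, K, V = (rng.standard_normal((n, d)) for n in (len_q, len_k, len_k))
mask = np.array([True, True, True, False, False])         # last two keys are padding
print(scaled_dot_attention(Q, K, V, mask).shape)          # (3, 8)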
Example #26
def cross_attention_rnn(config, cell,
												inputs,
												padding_mask,
												xvector):
	""" Input a list of tensors and get back the embedded vector for this list.

	NOTE: the difference between this function and the one above is that this one
				takes a vector from another source into consideration when calculating
				attention weights. See Tan et al., 2015, "LSTM-based deep learning models
				for non-factoid answer selection" for details.
	"""
	num_steps = len(inputs)
	hidden_size = cell.output_size * 2
	batch_size = inputs[0].get_shape()[0].value
	embed_size = inputs[0].get_shape()[1].value
	assert(cell.output_size == config.rnn_hidden_size)
	assert(batch_size == config.batch_size)
	assert(embed_size == config.word_embed_size)

	with tf.variable_scope("attention_RNN"):
		input_length = tf.reduce_sum(tf.pack(padding_mask, axis=1), 1)
		# input_length = tf.Print(input_length, [padding_mask, input_length],
		#													message='input length', summarize=50)
		outputs, state_fw, state_bw = \
				tf.nn.bidirectional_rnn(cell, cell, inputs, dtype=config.data_type,
																sequence_length=input_length)

		# RESHAPE THE OUTPUTS, JUST IN CASE NONE DIM
		shaped_outputs = [tf.reshape(o, [batch_size, hidden_size]) for o in outputs]
		outputs = shaped_outputs
		outputs_for_attention = [tf.concat(1, [o, xvector]) # [batch_size, 2*hidden_size]
														 for o in outputs]

		# OVERALL SEQUENCE REPRESENTAION
		hidden_outputs = []
		attention_weights = []
		outputs_concat = tf.pack(outputs, axis=1) # [batch_size, num_step, hidden_size]
		with tf.variable_scope("attention_computation"):
			context_vector = tf.get_variable("context_vector", [2*hidden_size, 1])
			# Calculate attention
			attention_weights = []
			for i in xrange(len(outputs)):
				if i > 0: tf.get_variable_scope().reuse_variables()
				hidden_output = tf.tanh(rnn_cell._linear(outputs_for_attention[i],
																								 2*hidden_size,
																								 True # If add bias
																								 ))
				hidden_outputs.append(hidden_output)
				attention_weights.append(tf.matmul(hidden_output, context_vector)) # [batch_size, 1]
			attention_weights = tf.concat(1, attention_weights)
			attention_weights = tf.nn.softmax(attention_weights) * \
													tf.pack(padding_mask, axis=1) # [batch_size, num_steps]
			attention_weights = tf.div(attention_weights,
																 1e-12 + tf.reduce_sum(attention_weights, 1, keep_dims=True))
			# Attention weighted sum
			weighted_sum = tf.reduce_sum(outputs_concat * tf.expand_dims(attention_weights, 2),
																	 1) # [batch_size, hidden_size]

	return weighted_sum, outputs_concat, hidden_outputs, attention_weights
Example #27
 def __init__(self, num_units, encoder_output, scope=None):
     self.hs = encoder_output
     with vs.variable_scope(scope or type(self).__name__):
         with vs.variable_scope("Attn1"):
             hs2d = tf.reshape(self.hs, [-1, num_units])
             phi_hs2d = tanh(rnn_cell._linear(hs2d, num_units, True, 1.0))
             self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
     super(GRUCellAttn, self).__init__(num_units)
Example #28
 def __init__(self, num_units, encoder_output, scope=None):
   self.hs = encoder_output
   with vs.variable_scope(scope or type(self).__name__):
     with vs.variable_scope("Attn1"):
       hs2d = tf.reshape(self.hs, [-1, num_units])
       phi_hs2d = tanh(rnn_cell._linear(hs2d, num_units, True, 1.0))
       self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
   super(GRUCellAttn, self).__init__(num_units)
Example #29
 def attention(query):
     """Point on hidden using hidden_features and query."""
     with vs.variable_scope("Attention"):
         y = rnn_cell._linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v^T * tanh(...).
         s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                 [2, 3])
         return s
Example #30
    def decode(self, h_q, h_p):
        """
								takes in a knowledge representation
								and output a probability estimation over
								all paragraph tokens on which token should be
								the start of the answer span, and which should be
								the end of the answer span.

								:param knowledge_rep: it is a representation of the paragraph and question,
																														decided by how you choose to implement the encoder
								:return:
								"""
        with vs.variable_scope("answer_start"):
            a_s = rnn_cell._linear([h_q, h_p], self.output_size, True, 1.0)
        with vs.variable_scope("answer_end"):
            a_e = rnn_cell._linear([h_q, h_p], self.output_size, True, 1.0)

        return (a_s, a_e)
Example #31
 def attention(query):
     """Point on hidden using hidden_features and query."""
     with vs.variable_scope("Attention"):
         y = rnn_cell._linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v^T * tanh(...).
         s = math_ops.reduce_sum(
             v * math_ops.tanh(hidden_features + y), [2, 3])
         return s
Example #32
def setup_actor_update(actor):

    with tf.variable_scope("rl"):
        actor.critic_output = tf.placeholder(tf.float32, [None, None, actor.vocab_size], name='critic_output')
        # action_gradients is passed in by Q_network...
        # and in DDPG, it's the gradients of Q w.r.t. policy's chosen actions
        # but in AC, it's the output of Q network w.r.t. all actions
        opt = nlc_model.get_optimizer(FLAGS.optimizer)(actor.learning_rate)

        # update
        params = tf.trainable_variables()

        # TODO: hope this would work
        with tf.variable_scope("Loss"):
            doshape = tf.shape(actor.decoder_output)
            T, batch_size = doshape[0], doshape[1]
            do2d = tf.reshape(actor.decoder_output, [-1, actor.size])
            logits2d = rnn_cell._linear(do2d, actor.vocab_size, True, 1.0)
            # outputs2d = tf.nn.log_softmax(logits2d)

            # apply Q-network's score here (similar to advantage function)
            # 1. reshape critic_output like decoder_output (same shape anyway)
            # TODO: hope this is correct
            critic_do2d = tf.reshape(actor.critic_output, [-1, actor.vocab_size])  # should reshape according to critic
            # 2. multiply this with the actor's logits
            rl_logits2d = logits2d * critic_do2d

            # actor.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, actor.vocab_size]))

            targets_no_GO = tf.slice(actor.target_tokens, [1, 0], [-1, -1])
            masks_no_GO = tf.slice(actor.target_mask, [1, 0], [-1, -1])
            # easier to pad target/mask than to split decoder input since tensorflow does not support negative indexing
            labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
            mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
            losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(rl_logits2d, labels1d) * tf.to_float(mask1d)
            losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))

            actor.rl_losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)

        # http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html (DDPG update)
        gradients = tf.gradients(actor.rl_losses, params)  # step 7: update
        # Not sure if I understood this part lol

        clipped_gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.max_gradient_norm)

        # clip, then multiply, otherwise we are not learning the signals from critic
        # clipped_gradients: [T, batch_size, vocab_size]

        # updated_gradients = clipped_gradients * actor.critic_output
        # pass in as input

        actor.rl_gradient_norm = tf.global_norm(clipped_gradients)
        actor.rl_param_norm = tf.global_norm(params)

        actor.rl_updates = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=actor.global_step)
Example #33
        def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs,
                      *states):
            batch_size = tf.shape(beam_probs)[0]
            inputs = tf.reshape(
                tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
            decoder_input = embedding_ops.embedding_lookup(
                self.L_pred, inputs)  # self.L_env
            decoder_output, state_output = self.decoder_graph(
                decoder_input, states)

            with vs.variable_scope("Logistic", reuse=True):
                do2d = tf.reshape(decoder_output, [-1, self.size])
                logits2d = rnn_cell._linear(do2d, self.tgt_vocab_size, True,
                                            1.0)
                logprobs2d = tf.nn.log_softmax(logits2d)

            total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
            total_probs_noEOS = tf.concat(1, [
                tf.slice(total_probs, [0, 0], [batch_size, EOS_ID]),
                tf.tile([[-3e38]], [batch_size, 1]),
                tf.slice(total_probs, [0, EOS_ID + 1],
                         [batch_size, self.tgt_vocab_size - EOS_ID - 1])
            ])

            flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
            beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
            next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs,
                                                       k=beam_k)

            next_bases = tf.floordiv(top_indices, self.tgt_vocab_size)
            next_mods = tf.mod(top_indices, self.tgt_vocab_size)

            next_states = [
                tf.gather(state, next_bases) for state in state_output
            ]
            next_beam_seqs = tf.concat(1, [
                tf.gather(beam_seqs, next_bases),
                tf.reshape(next_mods, [-1, 1])
            ])

            cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
            beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
            new_cand_seqs = tf.concat(0, [cand_seqs_pad, beam_seqs_EOS])
            EOS_probs = tf.slice(total_probs, [0, EOS_ID], [batch_size, 1])
            new_cand_probs = tf.concat(
                0, [cand_probs, tf.reshape(EOS_probs, [-1])])

            cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
            next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs,
                                                             k=cand_k)
            next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

            return [
                time + 1, next_beam_probs, next_beam_seqs, next_cand_probs,
                next_cand_seqs
            ] + next_states
Example #34
    def __call__(self, inputs, parent_state, cyc_state, scope=None):
        """Modified Long short-term memory for tree structure"""
        with vs.variable_scope(scope or type(self).__name__):   # "BasicTreeLSTMCell"
            # parameters of gates are concatenated into one multiply for efficiency
            parent_c, parent_h = parent_state
            cyc_c, cyc_h = cyc_state
            c = rnn_cell._linear([parent_c, cyc_c], self._num_units, True)
            concat = rnn_cell._linear([inputs, parent_h, cyc_h], 4 * self._num_units, True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(1, 4, concat)

            new_c = (c * rnn_cell.sigmoid(f + self._forget_bias) + rnn_cell.sigmoid(i) *
                     self._activation(j))
            new_h = self._activation(new_c) * rnn_cell.sigmoid(o)

            new_state = rnn_cell.LSTMStateTuple(new_c, new_h)

            return new_h, new_state
Example #35
  def __call__(self, inputs, state, scope=None):
    """Most basic RNN: output = new_state = activation(W * input + U * state + B)."""

    with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"
      assert(self._dropMaskInput.get_shape()[1:] == inputs.get_shape()[1:])
      assert(self._dropMaskState.get_shape()[1:] == state.get_shape()[1:])
      dropin = tf.mul(self._dropMaskInput, inputs)
      dropst = tf.mul(self._dropMaskState, state)

      output = self._activation(rnn_cell._linear([dropin, dropst], self._num_units, True))

    return output, output
Example #36
    def __call__(self, inputs, state, scope=None):
        """Does the stuff"""
        with tf.variable_scope(scope or type(self).__name__):
            h, mem = state

            concat = _linear(tf.concat(1, [h, inputs]),
                             4 * self._width,
                             True,
                             scope='keys')
            concat = tf.nn.tanh(concat)
            in_key, out_key = tf.split(1, 2, concat)

            in_val = _linear(inputs, self._width * 2, True, scope='input')
            in_val = in_val

            updated_mem = hrr.store(in_key, in_val, mem)
            output = hrr.retrieve(out_key, updated_mem)

            output = tf.nn.tanh(output)

            return output, (output, updated_mem)
Example #37
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"           
      # Parameters of gates are concatenated into one multiply for efficiency.         
      c, h = array_ops.split(1, 2, state)
      concat = rnn_cell._linear([inputs, h], 4 * self._num_units, True)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate                
      i, j, f, o = array_ops.split(1, 4, concat)

      new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * self._activation(j)
      new_h = self._activation(new_c) * tf.sigmoid(o)

      return new_h, array_ops.concat(1, [new_c, new_h])
Example #38
def rnn_linear(all_states, dim, output_size, scope, reuse=False, return_param=False):
    with tf.variable_scope(scope, reuse=reuse) as v_s:
        # all_states: (batch_size, time, hidden_size)
        doshape = tf.shape(all_states)
        batch_size, unroll = doshape[0], doshape[1]

        flattened = tf.reshape(all_states, [-1, dim])
        result2d = rnn_cell._linear(flattened, output_size=output_size, bias=True)
        result3d = tf.reshape(result2d, tf.pack([batch_size, unroll, -1]))

    if return_param:
        linear_params = [v for v in tf.global_variables() if v.name.startswith(v_s.name)]
        return result3d, linear_params

    return result3d
Example #39
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        with vs.variable_scope(scope
                               or type(self).__name__):  # "BasicLSTMCell"
            # Parameters of gates are concatenated into one multiply for efficiency.
            c, h = array_ops.split(1, 2, state)
            concat = rnn_cell._linear([inputs, h], 4 * self._num_units, True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(1, 4, concat)

            new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(
                i) * self._activation(j)
            new_h = self._activation(new_c) * tf.sigmoid(o)

            return new_h, array_ops.concat(1, [new_c, new_h])
Example #40
        def attention(query):
            with tf.variable_scope("Attention"):
                # attention on query (decoder states)
                query_feature = rnn_cell._linear(query,
                                                 bias=True,
                                                 output_size=attention_size,
                                                 scope="Att_W2")

                # reshape query_feature to (-1, 1, 1, attention_size) in order to do summation
                query_feature = tf.reshape(query_feature,
                                           (-1, 1, 1, attention_size))

                # compute attention vector u, should be (batch_size, attention_len)
                s = tf.reduce_sum(v *
                                  tf.nn.tanh(query_feature + hidden_feature),
                                  reduction_indices=[2, 3])
            return s
Example #41
  def setup_loss(self):
    with vs.variable_scope("Logistic"):
      doshape = tf.shape(self.decoder_output)
      T, batch_size = doshape[0], doshape[1]
      do2d = tf.reshape(self.decoder_output, [-1, self.size])
      logits2d = rnn_cell._linear(do2d, self.vocab_size, True, 1.0)
      outputs2d = tf.nn.log_softmax(logits2d)
      self.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, self.vocab_size]))

      targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
      masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
      # easier to pad target/mask than to split decoder input since tensorflow does not support negative indexing
      labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
      mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
      losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(logits2d, labels1d) * tf.to_float(mask1d)
      losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
      self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
Example #42
  def setup_loss(self):
    with vs.variable_scope("Logistic"):
      doshape = tf.shape(self.decoder_output)
      T, batch_size = doshape[0], doshape[1]
      do2d = tf.reshape(self.decoder_output, [-1, self.size])
      logits2d = rnn_cell._linear(do2d, self.vocab_size, True, 1.0)
      outputs2d = tf.nn.log_softmax(logits2d)
      self.outputs = tf.reshape(outputs2d, tf.pack([T, batch_size, self.vocab_size]))

      targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
      masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
      # easier to pad target/mask than to split decoder input since tensorflow does not support negative indexing
      labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
      mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
      losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(logits2d, labels1d) * tf.to_float(mask1d)
      losses2d = tf.reshape(losses1d, tf.pack([T, batch_size]))
      self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
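
Examples #41 and #42 compute a masked sequence cross-entropy: per-token sparse softmax cross-entropy, zeroed out at padded positions, summed and divided by the batch size. A tiny NumPy sketch of that loss with a made-up vocabulary and mask:

import numpy as np

T, batch, vocab = 3, 2, 5
rng = np.random.default_rng(6)

logits = rng.standard_normal((T * batch, vocab))
labels = rng.integers(0, vocab, size=T * batch)
mask = np.array([1, 1, 1, 1, 0, 0])           # padded positions get weight 0

# sparse softmax cross-entropy per token
shifted = logits - logits.max(axis=1, keepdims=True)
log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
losses1d = -log_probs[np.arange(T * batch), labels] * mask

losses2d = losses1d.reshape(T, batch)
loss = losses2d.sum() / batch                 # matches reduce_sum(...) / batch_size
print(loss)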
Example #43
File: vrnn.py Project: mrhaz/VRNN
    def __call__(self, inputs, state, scope=None):
        """Variational recurrent neural network cell (VRNN)."""
        with tf.variable_scope(scope or type(self).__name__):
            # Update the hidden state.
            z_t, z_mean_t, z_log_sigma_sq_t = state
            h_t_1 = self._activation(
                _linear([inputs, z_t, z_mean_t, z_log_sigma_sq_t],
                        2 * self._num_units, True))
            z_mean_t_1, z_log_sigma_sq_t_1 = tf.split(1, 2, h_t_1)

            # Sample.
            eps = tf.random_normal((tf.shape(inputs)[0], self._num_units),
                                   0.0,
                                   1.0,
                                   dtype=tf.float32)
            z_t_1 = tf.add(z_mean_t_1,
                           tf.mul(tf.sqrt(tf.exp(z_log_sigma_sq_t_1)), eps))

            return z_t_1, VRNNStateTuple(z_t_1, z_mean_t_1, z_log_sigma_sq_t_1)
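
The VRNN cell samples its next latent state with the reparameterization trick, z = mean + sqrt(exp(log_sigma_sq)) * eps with eps ~ N(0, 1), so gradients can flow through the sampling step. A one-step NumPy sketch with illustrative shapes:

import numpy as np

batch, num_units = 4, 6
rng = np.random.default_rng(7)

z_mean = rng.standard_normal((batch, num_units))
z_log_sigma_sq = rng.standard_normal((batch, num_units)) * 0.1

eps = rng.standard_normal((batch, num_units))           # N(0, 1) noise
z = z_mean + np.sqrt(np.exp(z_log_sigma_sq)) * eps      # reparameterized sample
print(z.shape)                                          # (4, 6)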
Example #44
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
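Each head scores every encoder position with s = sum_k v_k * tanh(W1*h + W2*q)_k and normalizes the scores with a softmax over time. A minimal NumPy sketch of that scoring step (shapes and names are illustrative assumptions):

import numpy as np

def attention_weights(hidden_features, y, v):
    # hidden_features: (batch, attn_length, 1, attn_vec); y: (batch, 1, 1, attn_vec)
    s = np.sum(v * np.tanh(hidden_features + y), axis=(2, 3))  # (batch, attn_length)
    e = np.exp(s - s.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)                    # softmax over positions

batch, attn_length, attn_vec = 2, 6, 5
hf = np.random.randn(batch, attn_length, 1, attn_vec)
y = np.random.randn(batch, 1, 1, attn_vec)
v = np.random.randn(attn_vec)
a = attention_weights(hf, y, v)
print(a.shape, a.sum(axis=1))   # (2, 6); each row sums to ~1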
Example #45
0
  def downscale(self, inp, mask):
    with vs.variable_scope("Downscale"):
      inshape = tf.shape(inp)
      T, batch_size, dim = inshape[0], inshape[1], inshape[2]
      inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
      out2d = rnn_cell._linear(inp2d, self.size, True, 1.0)
      out3d = tf.reshape(out2d, tf.pack((batch_size, tf.to_int32(T/2), dim)))
      out3d = tf.transpose(out3d, perm=[1, 0, 2])
      out3d.set_shape([None, None, self.size])
      out = tanh(out3d)

      mask = tf.transpose(mask)
      mask = tf.reshape(mask, [-1, 2])
      mask = tf.cast(mask, tf.bool)
      mask = tf.reduce_any(mask, reduction_indices=1)
      mask = tf.to_int32(mask)
      mask = tf.reshape(mask, tf.pack([batch_size, -1]))
      mask = tf.transpose(mask)
    return out, mask
Example #47
0
    def attention_encode(self):
        # (length, batch_size, dim)
        query_w_matrix = self.normal_encode(self.encoder_inputs,
                                            self.source_mask)
        context_w_matrix = self.normal_encode(self.ctx_inputs,
                                              self.ctx_mask,
                                              reuse=True)

        # can add a query variation here (optional)
        # the coattention mix could be taken out, but experiments suggest it is better than no coattention

        # in PA4 it was also time-major

        # batch, p, size
        p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
        # batch, q, size
        q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
        # batch, size, q
        q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

        # 2). Q->P Attention
        # [256,25,125] vs [128,125,11]
        A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
        A_p = tf.nn.softmax(A)

        # 3). Paragraph's context vectors
        C_p = batch_matmul(A_p, q_encoding)

        # 4). Linear mix of paragraph's context vectors and paragraph states
        flat_C_p = tf.reshape(C_p, [-1, self.FLAGS.size])
        flat_p_enc = tf.reshape(p_encoding, [-1, self.FLAGS.size])
        doshape = tf.shape(context_w_matrix)
        T, batch_size = doshape[0], doshape[1]

        # mixed_p: (batch * p_len, size)
        mixed_p = rnn_cell._linear([flat_C_p, flat_p_enc],
                                   self.FLAGS.size,
                                   bias=True)
        mixed_p = tf.reshape(mixed_p, tf.pack([T, -1, self.FLAGS.size]))

        # no extra layer of RNN on top of coattention result
        return mixed_p
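The coattention above is, in batch-major form, A = P Q^T, A_p = softmax(A) over query positions, and C_p = A_p Q. A minimal NumPy sketch (illustrative shapes, not the project's code):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, p_len, q_len, size = 2, 7, 4, 8
P = np.random.randn(batch, p_len, size)   # paragraph encodings
Q = np.random.randn(batch, q_len, size)   # query encodings

A = P @ Q.transpose(0, 2, 1)              # (batch, p, q) affinity matrix
A_p = softmax(A, axis=-1)                 # attend over query positions
C_p = A_p @ Q                             # (batch, p, size) context vectors
print(C_p.shape)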
Example #48
0
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
           is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
                     for arg in flat_args]
    flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope)
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1])
    if wd:
        add_wd(wd)

    return out
Example #49
0
 def attention(query):
     """Put attention masks on hidden using hidden_features and query."""
     attn_weights = []
     ds = []  # Results of attention reads will be stored here.
     for i in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % i):
             y = rnn_cell._linear(query, attention_vec_size, True)
             y = array_ops.reshape(y,
                                   [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             s = math_ops.reduce_sum(
                 v[i] * math_ops.tanh(hidden_features[i] + y),
                 [2, 3])
             a = nn_ops.softmax(s)
             attn_weights.append(a)
             # Now calculate the attention-weighted vector d.
             d = math_ops.reduce_sum(
                 array_ops.reshape(a, [-1, attn_length, 1, 1]) *
                 hidden, [1, 2])
             ds.append(array_ops.reshape(d, [-1, attn_size]))
     return attn_weights, ds
Example #50
0
					def attention(query):
						"""Put attention masks on hidden using hidden_features and query."""
						ds = []  # Results of attention reads will be stored here.
						# if nest.is_sequence(query):  # If the query is a tuple, flatten it.
						# 	query_list = nest.flatten(query)
						# 	for q in query_list:  # Check that ndims == 2 if specified.
						# 		ndims = q.get_shape().ndims
						# 		if ndims:
						# 			assert ndims == 2
						# 	query = array_ops.concat(1, query_list)
						for a in xrange(num_heads):
							with variable_scope.variable_scope("Attention_%d" % a):
								y = rnn_cell._linear(query, attention_vec_size, True)
								y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
								# Attention mask is a softmax of v^T * tanh(...).
								s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
								a = nn_ops.softmax(s)
								# Now calculate the attention-weighted vector d.
								d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
								ds.append(array_ops.reshape(d, [-1, attn_size]))
						return ds
Example #51
0
    def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs, *states):
      batch_size = tf.shape(beam_probs)[0]
      inputs = tf.reshape(tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
      decoder_input = embedding_ops.embedding_lookup(self.L_dec, inputs)
      decoder_output, state_output = self.decoder_graph(decoder_input, states)

      with vs.variable_scope("Logistic", reuse=True):
        do2d = tf.reshape(decoder_output, [-1, self.size])
        logits2d = rnn_cell._linear(do2d, self.vocab_size, True, 1.0)
        logprobs2d = tf.nn.log_softmax(logits2d)

      total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
      total_probs_noEOS = tf.concat(1, [tf.slice(total_probs, [0, 0], [batch_size, nlc_data.EOS_ID]),
                                        tf.tile([[-3e38]], [batch_size, 1]),
                                        tf.slice(total_probs, [0, nlc_data.EOS_ID + 1],
                                                 [batch_size, self.vocab_size - nlc_data.EOS_ID - 1])])

      flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
      beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
      next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs, k=beam_k)

      next_bases = tf.floordiv(top_indices, self.vocab_size)
      next_mods = tf.mod(top_indices, self.vocab_size)

      next_states = [tf.gather(state, next_bases) for state in state_output]
      next_beam_seqs = tf.concat(1, [tf.gather(beam_seqs, next_bases),
                                     tf.reshape(next_mods, [-1, 1])])

      cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
      beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
      new_cand_seqs = tf.concat(0, [cand_seqs_pad, beam_seqs_EOS])
      EOS_probs = tf.slice(total_probs, [0, nlc_data.EOS_ID], [batch_size, 1])
      new_cand_probs = tf.concat(0, [cand_probs, tf.reshape(EOS_probs, [-1])])

      cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
      next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs, k=cand_k)
      next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

      return [time + 1, next_beam_probs, next_beam_seqs, next_cand_probs, next_cand_seqs] + next_states
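Each beam step adds the running beam log-probabilities to the per-token log-probabilities, takes a flat top-k, and decodes the flat indices back into (parent beam, token). A minimal NumPy sketch of that expansion (illustrative only):

import numpy as np

def expand_beam(beam_logprobs, token_logprobs, beam_size):
    # beam_logprobs: (k,); token_logprobs: (k, vocab)
    total = token_logprobs + beam_logprobs[:, None]
    flat = total.reshape(-1)
    k = min(beam_size, flat.size)
    top = np.argsort(flat)[::-1][:k]                  # indices of the k best continuations
    vocab = token_logprobs.shape[1]
    return flat[top], top // vocab, top % vocab       # scores, parent beams, tokens

scores, parents, tokens = expand_beam(np.log([0.6, 0.4]),
                                      np.log(np.full((2, 5), 0.2)), 3)
print(scores, parents, tokens)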
Example #52
0
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "pointer_decoder".
    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element of
        decoder_inputs or, when feed_prev is set, a softmax-weighted mix of the inputs).
        First, we run the cell
        on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      states: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = rnn_cell._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)

        attns.set_shape([None, attn_size])
        inps = []
        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.pack(decoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally

            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell._linear([inp, attns], cell.output_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
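The pointer scores follow the docstring: s_j = v . tanh(W1*e_j + W2*q) for every encoder position j, returned unnormalized as the decoder output. A minimal NumPy sketch for a single example (weights and shapes are illustrative assumptions):

import numpy as np

def pointer_scores(enc, query, W1, W2, v):
    # enc: (attn_length, attn_size); query: (state_size,)
    return np.array([v @ np.tanh(W1 @ e + W2 @ query) for e in enc])

attn_length, attn_size, state_size, vec = 6, 8, 8, 8
enc = np.random.randn(attn_length, attn_size)
query = np.random.randn(state_size)
W1 = np.random.randn(vec, attn_size)
W2 = np.random.randn(vec, state_size)
v = np.random.randn(vec)
print(pointer_scores(enc, query, W1, W2, v).shape)   # (6,) -- one score per input position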
Example #53
0
def attention_RNN(encoder_outputs, 
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
  if use_attention:
    print ('Use the attention RNN model')
    if num_heads < 1:
      raise ValueError("With less than 1 heads, use a non-attention decoder.")
  
    with variable_scope.variable_scope(scope or "attention_RNN"):
      output_size = encoder_outputs[0].get_shape()[1].value
      top_states = [array_ops.reshape(e, [-1, 1, output_size])
                  for e in encoder_outputs]
      attention_states = array_ops.concat(1, top_states)
      if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                       % attention_states.get_shape())
  
      batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value
  
      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a,
                                             [attention_vec_size]))
  
      def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        attn_weights = []
        ds = []  # Results of attention reads will be stored here.
        for i in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return attn_weights, ds
  
      batch_attn_size = array_ops.pack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]
      for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])
  
      # loop through the encoder_outputs
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      for i in xrange(len(encoder_outputs)):
        if i > 0:
          variable_scope.get_variable_scope().reuse_variables()
        if i == 0:
          with variable_scope.variable_scope("Initial_Decoder_Attention"):
            initial_state = rnn_cell._linear(encoder_state, output_size, True)
          attn_weights, ds = attention(initial_state)
        else:
          attn_weights, ds = attention(encoder_outputs[i])
        output = array_ops.concat(1, [ds[0], encoder_outputs[i]]) # NOTE: here we temporarily assume num_head = 1
        with variable_scope.variable_scope("AttnRnnOutputProjection"):
          logit = rnn_cell._linear(output, num_decoder_symbols, True)
        attention_encoder_outputs.append(logit) # NOTE: here we temporarily assume num_head = 1
        sequence_attention_weights.append(attn_weights[0]) # NOTE: here we temporarily assume num_head = 1
  else:
    print ('Use the NON attention RNN model')
    with variable_scope.variable_scope(scope or "non-attention_RNN"):
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      
      # copy over logits once out of sequence_length
      if encoder_outputs[0].get_shape().ndims != 1:
        (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
      else:
        fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]

      if fixed_batch_size.value: 
        batch_size = fixed_batch_size.value
      else:
        batch_size = array_ops.shape(encoder_outputs[0])[0]
      if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length is not None:  # Prepare variables
        zero_logit = array_ops.zeros(
            array_ops.pack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
        zero_logit.set_shape(
            tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)
    
      for time, input_ in enumerate(encoder_outputs):
        if time > 0: variable_scope.get_variable_scope().reuse_variables()
        # pylint: disable=cell-var-from-loop
        # call_cell = lambda: cell(input_, state)
        generate_logit = lambda: rnn_cell._linear(encoder_outputs[time], num_decoder_symbols, True)
        # pylint: enable=cell-var-from-loop
        if sequence_length is not None:
          logit = _step(
              time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit)
        else:
          logit = generate_logit()
        attention_encoder_outputs.append(logit)   
        
  return attention_encoder_outputs, sequence_attention_weights
Example #54
0
 def __call__(self, inputs, state, scope=None):
   """Most basic RNN: output = new_state = tanh(W * input + U * state + B)."""
   with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"            
     output = self._activation(rnn_cell._linear([inputs, state], self._num_units, True))
   return output, output
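The cell computes output = new_state = tanh(W*x + U*h + b). A one-step NumPy sketch (illustrative weights, not the library's variables):

import numpy as np

def basic_rnn_step(x, h, W, U, b):
    # One BasicRNNCell update: the output and the new state are the same tensor.
    return np.tanh(x @ W + h @ U + b)

num_units, input_size, batch = 8, 5, 2
x = np.random.randn(batch, input_size)
h = np.zeros((batch, num_units))
W = np.random.randn(input_size, num_units)
U = np.random.randn(num_units, num_units)
b = np.zeros(num_units)
print(basic_rnn_step(x, h, W, U, b).shape)   # (2, 8)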
Example #55
0
		def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
			# return attention_seq2seq(encoder_inputs, decoder_inputs, cell,  self.input_size, self.hidden_size, self.output_size, feed_previous=do_decode)
			# return basic_seq2seq(encoder_inputs, decoder_inputs, cell, self.input_size, self.hidden_size, self.output_size)

			with variable_scope.variable_scope("my_seq2seq"):

				wrapper_cell = tf.nn.rnn_cell.InputProjectionWrapper(cell, self.hidden_size, self.input_size)

				encoder_outputs, enc_state = rnn.rnn(wrapper_cell, encoder_inputs, dtype=dtypes.float32)


				if do_decode:
					def simple_loop_function(prev, _):
						_next = tf.greater_equal(prev, 0.5)
						_next = tf.to_float(_next)
						return _next	
					loop_function = simple_loop_function
				else:
					loop_function = None

				#################
				# ATTENTION DECODER
				#################
				# First calculate a concatenation of encoder outputs to put attention on.
				top_states = [array_ops.reshape(e, [-1, 1, wrapper_cell.output_size]) for e in encoder_outputs]
				attention_states = array_ops.concat(1, top_states)


				# return tf.nn.seq2seq.attention_decoder(decoder_inputs, enc_state, attention_states, wrapper_cell, output_size=self.output_size,loop_function=loop_function)

				initial_state = enc_state
				output_size = self.output_size
				num_heads = 1
				dtype = dtypes.float32
				scope = None
				initial_state_attention = False

				if not decoder_inputs:
					raise ValueError("Must provide at least 1 input to attention decoder.")
				if num_heads < 1:
					raise ValueError("With less than 1 heads, use a non-attention decoder.")
				if not attention_states.get_shape()[1:2].is_fully_defined():
					raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
							% attention_states.get_shape())
				if output_size is None:
					output_size = wrapper_cell.output_size

				with variable_scope.variable_scope(scope or "attention_decoder") as scope:
					# dtype = scope.dtype

					batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
					attn_length = attention_states.get_shape()[1].value
					attn_size = attention_states.get_shape()[2].value

					# To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
					hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
					hidden_features = []
					v = []
					attention_vec_size = attn_size  # Size of query vectors for attention.
					for a in xrange(num_heads):
						k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
						hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
						v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

					state = initial_state

					def attention(query):
						"""Put attention masks on hidden using hidden_features and query."""
						ds = []  # Results of attention reads will be stored here.
						# if nest.is_sequence(query):  # If the query is a tuple, flatten it.
						# 	query_list = nest.flatten(query)
						# 	for q in query_list:  # Check that ndims == 2 if specified.
						# 		ndims = q.get_shape().ndims
						# 		if ndims:
						# 			assert ndims == 2
						# 	query = array_ops.concat(1, query_list)
						for a in xrange(num_heads):
							with variable_scope.variable_scope("Attention_%d" % a):
								y = rnn_cell._linear(query, attention_vec_size, True)
								y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
								# Attention mask is a softmax of v^T * tanh(...).
								s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
								a = nn_ops.softmax(s)
								# Now calculate the attention-weighted vector d.
								d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
								ds.append(array_ops.reshape(d, [-1, attn_size]))
						return ds

					outputs = []
					prev = None
					batch_attn_size = array_ops.pack([batch_size, attn_size])
					attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
								for _ in xrange(num_heads)]
					for a in attns:  # Ensure the second shape of attention vectors is set.
						a.set_shape([None, attn_size])
					if initial_state_attention:
						attns = attention(initial_state)
					for i, inp in enumerate(decoder_inputs):
						if i > 0:
							variable_scope.get_variable_scope().reuse_variables()
						# If loop_function is set, we use it instead of decoder_inputs.
						if loop_function is not None and prev is not None:
							with variable_scope.variable_scope("loop_function", reuse=True):
								inp = loop_function(prev, i)
						# Merge input and previous attentions into one vector of the right size.
						input_size = inp.get_shape().with_rank(2)[1]
						if input_size.value is None:
							raise ValueError("Could not infer input size from input: %s" % inp.name)
						x = rnn_cell._linear([inp] + attns, input_size, True)
						# Run the RNN.
						cell_output, state = wrapper_cell(x, state)
						# Run the attention mechanism.
						if i == 0 and initial_state_attention:
							with variable_scope.variable_scope(variable_scope.get_variable_scope(),
					                                       reuse=True):
								attns = attention(state)
						else:
							attns = attention(state)

						with variable_scope.variable_scope("AttnOutputProjection"):
							output = rnn_cell._linear([cell_output] + attns, output_size, True)
							output = tf.nn.sigmoid(output)
						if loop_function is not None:
							prev = output
						outputs.append(output)

					return outputs, state
Example #56
0
  def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.
    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".
    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.
    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"

      s1 = vs.get_variable("s1", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s2 = vs.get_variable("s2", initializer=tf.ones([4 * self._num_units]), dtype=tf.float32)
      s3 = vs.get_variable("s3", initializer=tf.ones([self._num_units]), dtype=tf.float32)

      b1 = vs.get_variable("b1", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b2 = vs.get_variable("b2", initializer=tf.zeros([4 * self._num_units]), dtype=tf.float32)
      b3 = vs.get_variable("b3", initializer=tf.zeros([self._num_units]), dtype=tf.float32)

      # s1 = tf.Variable(tf.ones([4 * self._num_units]), name="s1")
      # s2 = tf.Variable(tf.ones([4 * self._num_units]), name="s2")
      # s3 = tf.Variable(tf.ones([self._num_units]), name="s3")
      #
      # b1 = tf.Variable(tf.zeros([4 * self._num_units]), name="b1")
      # b2 = tf.Variable(tf.zeros([4 * self._num_units]), name="b2")
      # b3 = tf.Variable(tf.zeros([self._num_units]), name="b3")

      input_below_ = rnn_cell._linear([inputs],
                                      4 * self._num_units, False, scope="out_1")
      input_below_ = ln(input_below_, s1, b1)
      state_below_ = rnn_cell._linear([m_prev],
                                      4 * self._num_units, False, scope="out_2")
      state_below_ = ln(state_below_, s2, b2)
      lstm_matrix = tf.add(input_below_, state_below_)

      i, j, f, o = array_ops.split(1, 4, lstm_matrix)

      c = (sigmoid(f) * c_prev + sigmoid(i) *
             self._activation(j))

      # Currently, normalizing c causes a lot of NaNs in the model, so it is commented out for now.
      # c_ = ln(c, s3, b3)
      c_ = c
      m = sigmoid(o) * self._activation(c_)

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
                 else array_ops.concat(1, [c, m]))
    return m, new_state
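The ln helper used above is not shown; a minimal NumPy sketch of what per-feature layer normalization with a learned scale s and shift b typically looks like (an assumption about ln, not the project's definition):

import numpy as np

def ln(x, s, b, eps=1e-5):
    # x: (batch, features); normalize each row, then rescale and shift.
    mean = x.mean(axis=1, keepdims=True)
    std = x.std(axis=1, keepdims=True)
    return s * (x - mean) / (std + eps) + b

x = np.random.randn(3, 16)
s = np.ones(16)
b = np.zeros(16)
print(ln(x, s, b).mean(axis=1))   # each row has mean ~0 after normalization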