Example #1
    def __call__(self, inputs, state, scope=None):
        """Simplified Gating LSTM."""
        with vs.variable_scope(scope or "simplified_gating_lstm_cell"):
            c, h = state
            with vs.variable_scope("gates_0") as gate_scope:
                if self._architecture in ['LS1', 'LS2']:
                    concat = _linear(
                        [h],
                        3 * self._num_units,
                        self._architecture == 'LS1',  # bias term only for LS1
                        scope=scope)
                    i, f, o = array_ops.split(value=concat,
                                              num_or_size_splits=3,
                                              axis=1)
                elif self._architecture == 'LS3':
                    dtype = inputs.dtype
                    bias = vs.get_variable("bias",
                                           shape=[3 * self._num_units],
                                           dtype=dtype)
                    i, f, o = array_ops.split(value=bias,
                                              num_or_size_splits=3,
                                              axis=0)

            with vs.variable_scope("gates_1"):
                j = _linear([inputs, h], self._num_units, True, scope=scope)

            new_c = c * sigmoid(f + self._forget_bias) + sigmoid(
                i) * self._activation(j)
            new_h = self._activation(new_c) * sigmoid(o)

            new_state = LSTMStateTuple(new_c, new_h)
            return new_h, new_state
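For context, this cell follows the standard TF 1.x RNNCell interface, so it can be dropped into tf.nn.dynamic_rnn. A minimal, hypothetical wiring sketch (the SimplifiedGatingLSTMCell class name and its constructor arguments are assumptions, not shown above):

import tensorflow as tf

# Hypothetical wiring; the cell class name and constructor arguments are assumptions.
cell = SimplifiedGatingLSTMCell(num_units=128, architecture='LS1')
inputs = tf.placeholder(tf.float32, [None, None, 64])   # [batch, time, features]
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)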
Example #2
 def __call__(self, inputs, state, scope=None):
     """JZS with num_units cells."""
     with vs.variable_scope(scope or "jzs_cell"):
         # We start with bias of 1.0 to not reset and not update.
         if self._architecture == 'JZS1':
             r, u = [inputs, state], [inputs]
         elif self._architecture == 'JZS2':
             r, u = [state], [inputs, state]
         elif self._architecture == 'JZS3':
             r, u = [inputs, state], [inputs, tanh(state)]
         with vs.variable_scope("gates_0"):
             r = _linear(r, self._num_units, True, 1.0, scope=scope)
             if self._architecture == 'JZS2':
                 r = r + inputs
         with vs.variable_scope("gates_1"):
             u = _linear(u, self._num_units, True, 1.0, scope=scope)
         r, u = sigmoid(r), sigmoid(u)
         with vs.variable_scope("candidate"):
             if self._architecture == 'JZS1':
                 c = _linear(
                     [r * state], self._num_units, True,
                     scope=scope) + tanh(inputs)
             elif self._architecture in ['JZS2', 'JZS3']:
                 c = _linear([inputs, r * state],
                             self._num_units,
                             True,
                             scope=scope)
         c = self._activation(c)
         new_h = u * c + (1 - u) * state
     return new_h, new_h
Example #3
    def call(self, inputs, state):
        """Gated recurrent unit (GRU) with nunits cells."""
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            bias_ones = self._bias_initializer
            if self._bias_initializer is None:
                dtype = [a.dtype for a in [inputs, state]][0]
                bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
            #value = sigmoid(_linear([inputs, state], 2 * self._num_units, True,
            #    bias_ones, self._kernel_initializer))
            #r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

            value = _linear([inputs, state], 2 * self._num_units, True,
                            bias_ones, self._kernel_initializer)
            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
            r = ln(r, scope='r/')
            u = ln(u, scope='u/')
            r, u = sigmoid(r), sigmoid(u)

        with vs.variable_scope("candidate"):
            #      c = self._activation(_linear([inputs, r * state], self._num_units, True,
            #          self._bias_initializer, self._kernel_initializer))
            #    new_h = u * state + (1 - u) * c

            Cand = _linear([inputs, r * state], self._num_units, True)
            c_pre = ln(Cand, scope='new_h/')
            c = self._activation(c_pre)
        new_h = u * state + (1 - u) * c
        return new_h, new_h
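This layer-normalized GRU (and the similar cells in Examples #11 and #13) calls an ln() helper that is not included in the snippets. A minimal sketch consistent with how it is invoked, ln(tensor, scope='r/'), assuming TensorFlow 1.x and rank-2 [batch, units] inputs; the variable names and epsilon are assumptions:

import tensorflow as tf

def ln(tensor, scope=None, epsilon=1e-5):
    """Layer-normalize a [batch, units] tensor with a learned gain and shift."""
    mean, var = tf.nn.moments(tensor, axes=[1], keep_dims=True)
    if not isinstance(scope, str):
        scope = ''
    with tf.variable_scope(scope + 'layer_norm'):
        scale = tf.get_variable('scale', shape=[tensor.get_shape()[1]],
                                initializer=tf.constant_initializer(1.0))
        shift = tf.get_variable('shift', shape=[tensor.get_shape()[1]],
                                initializer=tf.constant_initializer(0.0))
    return scale * (tensor - mean) / tf.sqrt(var + epsilon) + shift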
Example #4
    def __call__(self, inputs, state, scope=None):
        """Gated recurrent unit (GRU) with nunits cells."""
        dtype = inputs.dtype
        batch_size, feature_size = inputs.get_shape().as_list()
        if self._use_tgate:
            # Time gate
            feature_size = feature_size - 1
            tvscope = vs.get_variable_scope()
            with vs.variable_scope(tvscope, initializer=None) as unit_scope:
                with vs.variable_scope(unit_scope) as time_gate_scope:
                    w_t1 = vs.get_variable(
                        "w_t1", shape=[1, self._num_units], dtype=dtype)
                    bias_t1 = vs.get_variable(
                        "bias_t1", [self._num_units], dtype=dtype,
                        initializer=init_ops.constant_initializer(0.0, dtype=dtype))
                    w_tx1 = vs.get_variable(
                        "w_tx1", shape=[feature_size, self._num_units], dtype=dtype)
                seq = tf.slice(inputs, begin=[0, 0], size=[batch_size, feature_size])
                # the time delta is stored in the last input column
                delta_t = tf.slice(inputs, begin=[0, feature_size], size=[batch_size, 1])


                t1_act = (self._activation(math_ops.matmul(delta_t, w_t1)) +
                          math_ops.matmul(seq, w_tx1) + bias_t1)
                t1 = sigmoid(t1_act)
                inputs = seq
        # for initial state
        (state, state_decay) = state
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            value = sigmoid(_linear(
                [inputs, state], 2 * self._num_units, True, 1.0))
            r, u = array_ops.split(value=value,
                                   num_or_size_splits=2,
                                   axis=1)
        with vs.variable_scope("candidate"):
            c = self._activation(_linear([inputs, r * state],
                                         self._num_units, True))
        new_h = u * state + (1 - u) * c

        if self._use_tgate:
            new_h_decay = u * t1 * state_decay + (1 - u * t1) * c
            new_state = TGRUStateTuple(new_h, new_h_decay)
            new_h = tf.concat([new_h, new_h_decay], axis=1)
        else:
            new_state = TGRUStateTuple(new_h, new_h)

        return new_h, new_state
Example #5
 def attention(query, use_attention=False):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
        if not use_attention:  # apply mean pooling
          weights = tf.tile(sequence_length, tf.stack([attn_length]))
          weights = array_ops.reshape(weights, tf.shape(s))
          a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
          # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
        else:
          a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
Example #6
    def getMetaResults(self, meta_output, input, dimensions, scope="meta"):
        """Calculate the gate results of the basic LSTM using the meta-LSTM network."""
        # with tf.variable_scope('z_trans'):
        #     meta_output = rnn_cell._linear(meta_output, self._meta_num_units, False)

        with tf.variable_scope(scope):
            W_matrix_list = []
            input_shape = int(input.get_shape()[-1])

            # generate the parameters of the basic LSTM from the meta-LSTM output
            for i in range(4):
                P = tf.get_variable('P{}'.format(i), shape=[self._meta_num_units, dimensions],
                                    initializer=tf.uniform_unit_scaling_initializer(), dtype=tf.float32)
                Q = tf.get_variable('Q{}'.format(i), shape=[self._meta_num_units, input_shape],
                                    initializer=tf.uniform_unit_scaling_initializer(), dtype=tf.float32)
                
                _W_matrix = tf.matmul(tf.reshape(tf.matrix_diag(meta_output),[-1, self._meta_num_units]), P)
                _W_matrix = tf.reshape(_W_matrix, [-1, self._meta_num_units, dimensions])
                _W_matrix = tf.matmul(tf.reshape(tf.transpose(_W_matrix, [0,2,1]), [-1, self._meta_num_units]), Q)
                _W_matrix = tf.reshape(_W_matrix, [-1, dimensions, input_shape])
                W_matrix_list.append(_W_matrix)
            W_matrix = tf.concat(values=W_matrix_list, axis=1)
            Bias = rnn_cell._linear(meta_output, 4*dimensions, False)

            result = tf.matmul(W_matrix, tf.expand_dims(input, -1))
            result = tf.add(tf.reshape(result, [-1, 4*dimensions]), Bias)
            return result
    def __call__(self, inputs, state, d_act, scope=None):
        """Long short-term memory cell (LSTM)."""
        with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
            if self._state_is_tuple:
                c, h = state
            else:
                # c, h = array_ops.split(1, 2, state)
                c, h = array_ops.split(state, 2, 1)
            concat = _linear([inputs, h], 4 * self._num_units, True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            # i, j, f, o = array_ops.split(1, 4, concat)
            i, j, f, o = array_ops.split(concat, 4, 1)
            
            w_d = vs.get_variable('w_d', [self.key_words_voc_size, self._num_units])
            
            new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
                    self._activation(j)) + tf.tanh(tf.matmul(d_act, w_d))
            new_h = self._activation(new_c) * sigmoid(o)

            if self._state_is_tuple:
                new_state = LSTMStateTuple(new_c, new_h)
            else:
                # new_state = array_ops.concat(1, [new_c, new_h])
                new_state = array_ops.concat([new_c, new_h], 1)
            return new_h, new_state
Example #9
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=True, wd=0.0, input_keep_prob=1.0, is_train=True):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("args must be specified")
    if not nest.is_sequence(args):
        args = [args]
    flat_args = [flatten(arg, 1) for arg in args]
    is_train = tf.convert_to_tensor(is_train, dtype=tf.bool)
    # apply dropout to the flattened inputs only at training time
    flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
                 for arg in flat_args]
    flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope)
    out = reconstruct(flat_out, args[0], 1)

    if squeeze:
        # assumes a rank-4 input, so the projected (size-1) dimension is axis 3
        out = tf.squeeze(out, axis=[3])

    if wd:
        add_wd(wd)
    return out
Example #10
 def __call__(self, inputs, state, context, scope=None):
   """Gated recurrent unit (GRU) with nunits cells."""
   with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
     with vs.variable_scope("gates"):  # Reset gate and update gate.
       # We start with bias of 1.0 to not reset and not update.
       value = sigmoid(_linear(
         [inputs, state, context], 2 * self._num_units, True, 1.0))
       r, u = array_ops.split(
           value=value,
           num_or_size_splits=2,
           axis=1)
     with vs.variable_scope("candidate"):
       c = self._activation(_linear([inputs, r * state],
                                    self._num_units, True))
     new_h = u * state + (1 - u) * c
   return new_h, new_h
Example #11
    def __call__(self, inputs, state):
        """Gated recurrent unit (GRU) with nunits cells."""
        with vs.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            value = _linear([inputs, state], 2 * self._num_units, True, 1.0)
            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
            r = ln(r, scope='r/')
            u = ln(u, scope='u/')
            r, u = sigmoid(r), sigmoid(u)
        with vs.variable_scope("Candidate"):
            Cand = _linear([inputs, r * state], self._num_units, True)
            c_pre = ln(Cand, scope='new_h/')
            c = self._activation(c_pre)
        new_h = u * state + (1 - u) * c
        return new_h, new_h
Example #12
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           scope=None,
           squeeze=False,
           wd=0.0,
           input_keep_prob=1.0,
           is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    with tf.variable_scope(scope or 'Linear'):
        flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start)
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_wd(wd)

    return out
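A hedged usage sketch for this linear() wrapper (assuming TensorFlow 1.x; flatten, reconstruct, and add_wd come from the surrounding project, and the placeholder shapes are illustrative):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 40, 100])   # [batch, passage_len, hidden]
is_train = tf.placeholder(tf.bool, [])
# Project each position to a single logit and squeeze the trailing axis -> [batch, passage_len].
logits = linear([x], 1, bias=True, squeeze=True,
                input_keep_prob=0.8, is_train=is_train, scope='start_logits')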
Example #13
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM).
        @param inputs: (batch, n)
        @param state: the states and hidden units of the two cells
        """
        with tf.variable_scope(scope or type(self).__name__):
            c1, c2, h1, h2 = state

            # change bias argument to False since LN will add bias via shift
            concat = _linear([inputs, h1, h2], 5 * self._num_units, False)

            i, j, f1, f2, o = tf.split(value=concat,
                                       num_or_size_splits=5,
                                       axis=1)

            # add layer normalization to each gate
            i = ln(i, scope='i/')
            j = ln(j, scope='j/')
            f1 = ln(f1, scope='f1/')
            f2 = ln(f2, scope='f2/')
            o = ln(o, scope='o/')

            new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) +
                     c2 * tf.nn.sigmoid(f2 + self._forget_bias) +
                     tf.nn.sigmoid(i) * self._activation(j))

            # add layer_normalization in calculation of new hidden state
            new_h = self._activation(ln(new_c,
                                        scope='new_h/')) * tf.nn.sigmoid(o)
            new_state = LSTMStateTuple(new_c, new_h)

            return new_h, new_state
Example #14
    def decode(self, h_q, h_p, scope=None, reuse=None):
        """
        Takes in a knowledge representation
        and outputs a probability estimate over
        all paragraph tokens of which token should be
        the start of the answer span and which should be
        the end of the answer span.

        :param knowledge_rep: it is a representation of the paragraph and question,
                              decided by how you choose to implement the encoder
        :return:
        """
        # Linear mix: h_q * W1 + h_p * W2 + b
        with vs.variable_scope('a_s'):
            a_s = _linear([h_q, h_p], self.output_size, True)
        with vs.variable_scope('a_e'):
            a_e = _linear([h_q, h_p], self.output_size, True)
        return a_s, a_e
Example #15
 def attention(query):
     """Point on hidden using hidden_features and query."""
     with vs.variable_scope("Attention"):
         y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v^T * tanh(...).
         s = math_ops.reduce_sum(
             v * math_ops.tanh(hidden_features + y), [2, 3])
         return s
 def __init__(self, num_units, encoder_output, scope=None):
     self.hs = encoder_output
     with tf.variable_scope(scope or type(self).__name__):
         with tf.variable_scope("Attn1"):
             hs2d = tf.reshape(self.hs, [-1, num_units])
             phi_hs2d = tf.tanh(
                 core_rnn_cell_impl._linear(hs2d, num_units, True, 1.0))
             self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
     super(GRUCellAttn, self).__init__(num_units)
        def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs,
                      *states):
            batch_size = tf.shape(beam_probs)[0]
            inputs = tf.reshape(
                tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
            decoder_input = tf.nn.embedding_lookup(self.L_dec, inputs)
            decoder_output, state_output = self.decoder_graph(
                decoder_input, states)

            with tf.variable_scope("Logistic", reuse=True):
                do2d = tf.reshape(decoder_output, [-1, self.size])
                logits2d = core_rnn_cell_impl._linear(do2d, self.vocab_size,
                                                      True, 1.0)
                logprobs2d = tf.nn.log_softmax(logits2d)

            total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
            total_probs_noEOS = tf.concat([
                tf.slice(total_probs, [0, 0], [batch_size, nlc_data.EOS_ID]),
                tf.tile([[-3e38]], [batch_size, 1]),
                tf.slice(total_probs, [0, nlc_data.EOS_ID + 1],
                         [batch_size, self.vocab_size - nlc_data.EOS_ID - 1])
            ], 1)

            flat_total_probs = tf.reshape(total_probs_noEOS, [-1])
            beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
            next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs,
                                                       k=beam_k)

            next_bases = tf.floordiv(top_indices, self.vocab_size)
            next_mods = tf.mod(top_indices, self.vocab_size)

            next_states = [
                tf.gather(state, next_bases) for state in state_output
            ]
            next_beam_seqs = tf.concat([
                tf.gather(beam_seqs, next_bases),
                tf.reshape(next_mods, [-1, 1])
            ], 1)

            cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
            beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
            new_cand_seqs = tf.concat([cand_seqs_pad, beam_seqs_EOS], 0)
            EOS_probs = tf.slice(total_probs, [0, nlc_data.EOS_ID],
                                 [batch_size, 1])
            new_cand_probs = tf.concat(
                [cand_probs, tf.reshape(EOS_probs, [-1])], 0)

            cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
            next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs,
                                                             k=cand_k)
            next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

            return [
                time + 1, next_beam_probs, next_beam_seqs, next_cand_probs,
                next_cand_seqs
            ] + next_states
Example #18
def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
    """
    output = input_
    for idx in range(layer_size):
        with tf.variable_scope('output_lin_%d' % idx):
            output = f(core_rnn_cell_impl._linear(output, size, 0))

        with tf.variable_scope('transform_lin_%d' % idx):
            transform_gate = tf.sigmoid(
                core_rnn_cell_impl._linear(input_, size, 0) + bias)
            carry_gate = 1. - transform_gate

        output = transform_gate * output + carry_gate * input_

    return output
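A hedged usage sketch for the highway block (assuming TensorFlow 1.x and the imports used above; the input width must equal size because of the carry connection, and the feature size here is an assumption):

import tensorflow as tf

char_features = tf.placeholder(tf.float32, [None, 128])   # e.g. char-CNN features
with tf.variable_scope('char_highway'):
    hw_out = highway(char_features, size=128, layer_size=2)   # two stacked highway layers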
Example #19
    def __call__(self, inputs, state, scope=None):
        """GRU with attention."""

        with tf.variable_scope(scope or 'attention_cell_wrapper'):
            output, _ = self._cell(inputs, state)
            att = _linear([output, self._attn_vec],
                          self.output_size,
                          bias=True)
            output = output * tf.sigmoid(att)

        return output, output
Example #20
 def __call__(self, inputs, state, scope=None):
     """Long short-term memory cell (LSTM)."""
     with vs.variable_scope(scope or type(self).__name__):
         # Parameters of gates are concatenated into one multiply for efficiency.
         c, h = state
         concat = _linear([inputs, h], 4 * self._num_units, True)
         # i = input_gate, j = new_input, f = forget_gate, o = output_gate
         i, j, f, o = array_ops.split(value=concat,
                                      num_or_size_splits=4,
                                      axis=1)
         new_c = (c * sigmoid(f) + sigmoid(i) * self._activation(j))
         new_h = self._activation(new_c) * sigmoid(o)
         new_state = (new_c, new_h)
         return new_h, new_state
Example #21
            def body(previous_finished, time_step, previous_state,
                     running_output, running_state, ponder_steps, remainders,
                     running_p_sum):

                current_inputs = tf.where(tf.equal(time_step, 1),
                                          inputs_and_one, inputs_and_zero)
                current_output, current_state = self._cell(
                    current_inputs, previous_state)

                if state_is_tuple:
                    joint_current_state = tf.concat(current_state, 1)
                else:
                    joint_current_state = current_state

                current_h = tf.nn.sigmoid(
                    tf.squeeze(
                        _linear([joint_current_state], 1, True,
                                self._init_halting_bias), 1))

                current_h_sum = running_p_sum + current_h

                limit_condition = time_step >= self._ponder_limit
                halting_condition = current_h_sum >= 1.0 - self._epsilon
                current_finished = tf.logical_or(halting_condition,
                                                 limit_condition)
                just_finished = tf.logical_xor(current_finished,
                                               previous_finished)

                current_p = tf.where(current_finished, 1.0 - running_p_sum,
                                     current_h)
                expanded_current_p = tf.expand_dims(current_p, 1)

                running_output += expanded_current_p * current_output

                if state_is_tuple:
                    running_state += tf.expand_dims(expanded_current_p,
                                                    0) * current_state
                else:
                    running_state += expanded_current_p * current_state

                ponder_steps = tf.where(just_finished,
                                        tf.fill([batch_size], time_step),
                                        ponder_steps)
                remainders = tf.where(just_finished, current_p, remainders)
                running_p_sum += current_p

                return (current_finished, time_step + 1, current_state,
                        running_output, running_state, ponder_steps,
                        remainders, running_p_sum)
 def __call__(self, inputs, state, scope=None):
     gru_out, gru_state = super(GRUCellAttn,
                                self).__call__(inputs, state, scope)
     with tf.variable_scope(scope or type(self).__name__):
         with tf.variable_scope("Attn2"):
             gamma_h = tf.tanh(
                 core_rnn_cell_impl._linear(gru_out, self._num_units, True,
                                            1.0))
         weights = tf.reduce_sum(self.phi_hs * gamma_h,
                                 reduction_indices=2,
                                 keep_dims=True)
         weights = tf.exp(
             weights -
             tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
         weights = weights / (1e-6 + tf.reduce_sum(
             weights, reduction_indices=0, keep_dims=True))
         context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
         with tf.variable_scope("AttnConcat"):
             out = tf.nn.relu(
                 core_rnn_cell_impl._linear([context, gru_out],
                                            self._num_units, True, 1.0))
         self.attn_map = tf.squeeze(
             tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
         return (out, out)
Example #23
	def __call__(self, inputs, state, scope=None):
		"""Long short-term memory cell (LSTM)."""
		with _checked_scope(self, scope or "basic_lstm_cell", 
			reuse=self._reuse):
			# Parameters of gates are concatenated into one multiply for efficiency.
			c, h = state 
			# i = input_gate, j = new_input, f = forget_gate, o = output_gate
			concat = _linear([inputs, h], 4 * self._num_units, True)
			i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

			new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + \
				tf.nn.sigmoid(i) * self._activation(j))

			new_h = self._activation(new_c) * tf.nn.sigmoid(o)

			new_state = (new_c, new_h) 

			return new_h, new_state
Example #24
 def attention(query):
   """Put attention masks on hidden using hidden_features and query."""
   attn_weights = []
   ds = []  # Results of attention reads will be stored here.
   for i in xrange(num_heads):
     with variable_scope.variable_scope("Attention_%d" % i):
       y = rnn_cell._linear(query, attention_vec_size, True)
       y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = math_ops.reduce_sum(
           v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
       a = nn_ops.softmax(s)
       attn_weights.append(a)
       # Now calculate the attention-weighted vector d.
       d = math_ops.reduce_sum(
           array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
           [1, 2])
       ds.append(array_ops.reshape(d, [-1, attn_size]))
   return attn_weights, ds
Example #25
 def attention(query):
     """Put attention masks on hidden using hidden_features and query."""
     attn_weights = []
     ds = []  # Results of attention reads will be stored here.
     for i in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % i):
             y = rnn_cell._linear(query, attention_vec_size, True)
             y = array_ops.reshape(y,
                                   [-1, 1, 1, attention_vec_size])
             # Attention mask is a softmax of v^T * tanh(...).
             s = math_ops.reduce_sum(
                 v[i] * math_ops.tanh(hidden_features[i] + y),
                 [2, 3])
             a = nn_ops.softmax(s)
             attn_weights.append(a)
             # Now calculate the attention-weighted vector d.
             d = math_ops.reduce_sum(
                 array_ops.reshape(a, [-1, attn_length, 1, 1]) *
                 hidden, [1, 2])
             ds.append(array_ops.reshape(d, [-1, attn_size]))
     return attn_weights, ds
    def setup_loss(self):
        with tf.variable_scope("Logistic"):
            doshape = tf.shape(self.decoder_output)
            T, batch_size = doshape[0], doshape[1]
            do2d = tf.reshape(self.decoder_output, [-1, self.size])
            logits2d = core_rnn_cell_impl._linear(do2d, self.vocab_size, True,
                                                  1.0)
            outputs2d = tf.nn.log_softmax(logits2d)
            self.outputs = tf.reshape(
                outputs2d, tf.stack([T, batch_size, self.vocab_size]))

            targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
            masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
            # easier to pad target/mask than to split decoder input since
            # tensorflow does not support negative indexing
            labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]),
                                  [-1])
            mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
            losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits2d, labels=labels1d) * tf.to_float(mask1d)
            losses2d = tf.reshape(losses1d, tf.stack([T, batch_size]))
            self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
    def downscale(self, inp, mask):
        #return inp, mask

        with tf.variable_scope("Downscale"):
            inshape = tf.shape(inp)
            T, batch_size, dim = inshape[0], inshape[1], inshape[2]
            inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]),
                               [-1, 2 * self.size])
            out2d = core_rnn_cell_impl._linear(inp2d, self.size, True, 1.0)
            out3d = tf.reshape(out2d,
                               tf.stack((batch_size, tf.to_int32(T / 2), dim)))
            out3d = tf.transpose(out3d, perm=[1, 0, 2])
            out3d.set_shape([None, None, self.size])
            out = tf.tanh(out3d)

            mask = tf.transpose(mask)
            mask = tf.reshape(mask, [-1, 2])
            mask = tf.cast(mask, tf.bool)
            mask = tf.reduce_any(mask, reduction_indices=1)
            mask = tf.to_int32(mask)
            mask = tf.reshape(mask, tf.stack([batch_size, -1]))
            mask = tf.transpose(mask)
        return out, mask
Example #28
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
  if use_attention:
    print('Use the attention RNN model')
    if num_heads < 1:
      raise ValueError("With less than 1 heads, use a non-attention decoder.")

    with variable_scope.variable_scope(scope or "attention_RNN"):
      output_size = encoder_outputs[0].get_shape()[1].value
      top_states = [array_ops.reshape(e, [-1, 1, output_size])
                  for e in encoder_outputs]
      attention_states = array_ops.concat(axis=1, values=top_states)
      if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                       % attention_states.get_shape())

      batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value

      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a,
                                             [attention_vec_size]))

      def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        attn_weights = []
        ds = []  # Results of attention reads will be stored here.
        for i in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return attn_weights, ds

      batch_attn_size = array_ops.stack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]
      for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])

      # loop through the encoder_outputs
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      for i in xrange(len(encoder_outputs)):
        if i > 0:
          variable_scope.get_variable_scope().reuse_variables()
        if i == 0:
          with variable_scope.variable_scope("Initial_Decoder_Attention"):
            initial_state = rnn_cell._linear(encoder_state, output_size, True)
          attn_weights, ds = attention(initial_state)
        else:
          attn_weights, ds = attention(encoder_outputs[i])
        output = array_ops.concat(axis=1, values=[ds[0], encoder_outputs[i]]) # NOTE: here we temporarily assume num_head = 1
        with variable_scope.variable_scope("AttnRnnOutputProjection"):
          logit = rnn_cell._linear(output, num_decoder_symbols, True)
        attention_encoder_outputs.append(logit) # NOTE: here we temporarily assume num_head = 1
        sequence_attention_weights.append(attn_weights[0]) # NOTE: here we temporarily assume num_head = 1
  else:
    print('Use the NON attention RNN model')
    with variable_scope.variable_scope(scope or "non-attention_RNN"):
      attention_encoder_outputs = list()
      sequence_attention_weights = list()

      # copy over logits once out of sequence_length
      if encoder_outputs[0].get_shape().ndims != 1:
        (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
      else:
        fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]

      if fixed_batch_size.value:
        batch_size = fixed_batch_size.value
      else:
        batch_size = array_ops.shape(encoder_outputs[0])[0]
      if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length is not None:  # Prepare variables
        zero_logit = array_ops.zeros(
            array_ops.stack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
        zero_logit.set_shape(
            tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)

      for time, input_ in enumerate(encoder_outputs):
        if time > 0: variable_scope.get_variable_scope().reuse_variables()
        # pylint: disable=cell-var-from-loop
        # call_cell = lambda: cell(input_, state)
        generate_logit = lambda: rnn_cell._linear(encoder_outputs[time], num_decoder_symbols, True)
        # pylint: enable=cell-var-from-loop
        if sequence_length is not None:
          logit = _step(
              time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit)
        else:
          logit = generate_logit()
        attention_encoder_outputs.append(logit)

  return attention_encoder_outputs, sequence_attention_weights
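A hedged usage sketch for attention_RNN (assuming TensorFlow 1.x; the encoder that produces encoder_outputs and encoder_state is not shown, and all sizes are illustrative):

import tensorflow as tf

seq_len, hidden, n_labels = 20, 128, 64
encoder_outputs = [tf.placeholder(tf.float32, [None, hidden]) for _ in range(seq_len)]
encoder_state = tf.placeholder(tf.float32, [None, hidden])
sequence_length = tf.placeholder(tf.int32, [None])
logits, attn_weights = attention_RNN(encoder_outputs, encoder_state,
                                     num_decoder_symbols=n_labels,
                                     sequence_length=sequence_length)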
Example #29
    def __call__(self, inputs, state, scope=None):
        """Run one step of simplified LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: This must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
      scope: VariableScope for the created subgraph; defaults to "simplified_lstm_cell".

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via static shape inference.
    """
        num_proj = self._num_units if self._num_proj is None else self._num_proj

        (c_prev, m_prev) = state

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")
        with vs.variable_scope(scope or "simplified_lstm_cell",
                               initializer=self._initializer) as unit_scope:
            n_eqs = 3 if self._architecture in ['NOG', 'NFG', 'NIG', 'CIFG'] else 4
            lstm_matrix = _linear([inputs, m_prev],
                                  n_eqs * self._num_units,
                                  bias=True,
                                  scope=scope)

            if self._architecture == 'NOG':
                i, j, f = array_ops.split(value=lstm_matrix,
                                          num_or_size_splits=3,
                                          axis=1)
            elif self._architecture in ['NFG', 'CIFG']:
                i, j, o = array_ops.split(value=lstm_matrix,
                                          num_or_size_splits=3,
                                          axis=1)
            elif self._architecture == 'NIG':
                j, f, o = array_ops.split(value=lstm_matrix,
                                          num_or_size_splits=3,
                                          axis=1)
            else:
                i, j, f, o = array_ops.split(value=lstm_matrix,
                                             num_or_size_splits=4,
                                             axis=1)

            # Diagonal connections
            if self._use_peepholes:
                with vs.variable_scope(unit_scope) as projection_scope:
                    if self._num_unit_shards is not None:
                        projection_scope.set_partitioner(None)
                    if self._architecture not in ['NFG', 'CIFG']:
                        w_f_diag = vs.get_variable("w_f_diag",
                                                   shape=[self._num_units],
                                                   dtype=dtype)
                    if self._architecture != 'NIG':
                        w_i_diag = vs.get_variable("w_i_diag",
                                                   shape=[self._num_units],
                                                   dtype=dtype)
                    if self._architecture != 'NOG':
                        w_o_diag = vs.get_variable("w_o_diag",
                                                   shape=[self._num_units],
                                                   dtype=dtype)

            if self._use_peepholes:
                if self._architecture == 'NIG':
                    c = sigmoid(f + self._forget_bias + w_f_diag *
                                c_prev) * c_prev + self._activation(j)
                elif self._architecture == 'NFG':
                    c = c_prev + sigmoid(i + w_i_diag *
                                         c_prev) * self._activation(j)
                elif self._architecture == 'NIAF':
                    c = sigmoid(f + self._forget_bias + w_f_diag * c_prev
                                ) * c_prev + sigmoid(i + w_i_diag * c_prev) * j
                elif self._architecture == 'CIFG':
                    _i = sigmoid(i + w_i_diag * c_prev)
                    c = (1 - _i) * c_prev + _i * self._activation(j)
                else:
                    c = sigmoid(f + self._forget_bias + w_f_diag * c_prev
                                ) * c_prev + sigmoid(i + w_i_diag * c_prev
                                                     ) * self._activation(j)
            else:
                if self._architecture == 'NIG':
                    c = sigmoid(
                        f + self._forget_bias) * c_prev + self._activation(j)
                elif self._architecture == 'NFG':
                    c = c_prev + sigmoid(i) * self._activation(j)
                elif self._architecture == 'NIAF':
                    c = sigmoid(f +
                                self._forget_bias) * c_prev + sigmoid(i) * j
                elif self._architecture == 'CIFG':
                    _i = sigmoid(i)
                    c = (1 - _i) * c_prev + _i * self._activation(j)
                else:
                    c = sigmoid(f + self._forget_bias) * c_prev + sigmoid(
                        i) * self._activation(j)

            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = clip_ops.clip_by_value(c, -self._cell_clip,
                                           self._cell_clip)
                # pylint: enable=invalid-unary-operand-type

            if self._use_peepholes:
                if self._architecture == 'NOG':
                    m = self._activation(c)
                elif self._architecture == 'NOAF':
                    m = sigmoid(o + w_o_diag * c) * c
                else:
                    m = sigmoid(o + w_o_diag * c) * self._activation(c)
            else:
                if self._architecture == 'NOG':
                    m = self._activation(c)
                elif self._architecture == 'NOAF':
                    m = sigmoid(o) * c
                else:
                    m = sigmoid(o) * self._activation(c)

            if self._num_proj is not None:
                with vs.variable_scope("projection") as proj_scope:
                    m = _linear(m, self._num_proj, bias=False, scope=scope)

                if self._proj_clip is not None:
                    # pylint: disable=invalid-unary-operand-type
                    m = clip_ops.clip_by_value(m, -self._proj_clip,
                                               self._proj_clip)
                    # pylint: enable=invalid-unary-operand-type

        new_state = LSTMStateTuple(c, m)
        return m, new_state
Example #30
  def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to "lstm_cell".

    Returns:
      A tuple containing:

      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`.  Same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    with _checked_scope(self, scope or "lstm_cell",
                        initializer=self._initializer,
                        reuse=self._reuse) as unit_scope:
      if self._num_unit_shards is not None:
        unit_scope.set_partitioner(
            partitioned_variables.fixed_size_partitioner(
                self._num_unit_shards))
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True)
      i, j, f, o = array_ops.split(
          value=lstm_matrix, num_or_size_splits=4, axis=1)
      # Diagonal connections
      if self._use_peepholes:
        with vs.variable_scope(unit_scope) as projection_scope:
          if self._num_unit_shards is not None:
            projection_scope.set_partitioner(None)
          w_f_diag = vs.get_variable(
              "w_f_diag", shape=[self._num_units], dtype=dtype)
          w_i_diag = vs.get_variable(
              "w_i_diag", shape=[self._num_units], dtype=dtype)
          w_o_diag = vs.get_variable(
              "w_o_diag", shape=[self._num_units], dtype=dtype)

      if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
             sigmoid(i + w_i_diag * c_prev) * self._activation(j))
      else:
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
             self._activation(j))

      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type
      if self._use_peepholes:
        m = sigmoid(o + w_o_diag * c) * self._activation(c)
      else:
        m = sigmoid(o) * self._activation(c)

      if self._num_proj is not None:
        with vs.variable_scope("projection") as proj_scope:
          if self._num_proj_shards is not None:
            proj_scope.set_partitioner(
                partitioned_variables.fixed_size_partitioner(
                    self._num_proj_shards))
          m = _linear(m, self._num_proj, bias=False)

        if self._proj_clip is not None:
          # pylint: disable=invalid-unary-operand-type
          m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
          # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
Example #31
    def __call__(self, inputs, state, scope=None):
        """Run one step of TLSTM.
        """
        sigmoid = math_ops.sigmoid
        tanh = math_ops.tanh

        (c_prev, m_prev) = state

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

        batch_size, feature_size = inputs.get_shape().as_list()
        feature_size = feature_size - 1

        seq = tf.slice(inputs, begin=[0, 0], size=[batch_size, feature_size])
        # the time delta is stored in the last input column
        delta_t = tf.slice(inputs, begin=[0, feature_size], size=[batch_size, 1])

        scope = scope or vs.get_variable_scope()
        with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate

            lstm_matrix = _linear([seq, m_prev], output_size=4 * self._num_units, bias=True)

            # Time gate
            with vs.variable_scope(unit_scope) as time_gate_scope:
                w_t1 = vs.get_variable(
                    "w_t1", shape=[1, self._num_units], dtype=dtype)
                bias_t1 = vs.get_variable(
                    "bias_t1", [self._num_units], dtype=dtype,
                    initializer=init_ops.constant_initializer(0.0, dtype=dtype))
                w_tx1 = vs.get_variable(
                    "w_tx1", shape=[feature_size, self._num_units], dtype=dtype)
                w_tx2 = vs.get_variable(
                    "w_tx2", shape=[feature_size, self._num_units], dtype=dtype)
                w_t2 = vs.get_variable(
                    "w_t2", shape=[1, self._num_units], dtype=dtype)
                bias_t2 = vs.get_variable(
                    "bias_t2", [self._num_units], dtype=dtype,
                    initializer=init_ops.constant_initializer(0.0, dtype=dtype))
                w_to = vs.get_variable(
                    "w_to", shape=[1, self._num_units], dtype=dtype)

            w_t1_with_constraint = tf.minimum(w_t1, 0)
            t1_act = (self._activation(math_ops.matmul(delta_t, w_t1_with_constraint)) +
                      math_ops.matmul(seq, w_tx1) + bias_t1)
            t2_act = (self._activation(math_ops.matmul(delta_t, w_t2)) +
                      math_ops.matmul(seq, w_tx2) + bias_t2)
            t1 = sigmoid(t1_act)
            t2 = sigmoid(t2_act)


            i, j, f, o = array_ops.split(
                value=lstm_matrix, num_or_size_splits=4, axis=1)

            # Diagonal connections
            if self._use_peepholes:
                with vs.variable_scope(unit_scope) as projection_scope:
                    w_f_diag = vs.get_variable(
                        "w_f_diag", shape=[self._num_units], dtype=dtype)
                    w_i_diag = vs.get_variable(
                        "w_i_diag", shape=[self._num_units], dtype=dtype)
                    w_o_diag = vs.get_variable(
                        "w_o_diag", shape=[self._num_units], dtype=dtype)

            if self._use_peepholes:
                c_hat = ((1 - sigmoid(i + w_i_diag * c_prev)*t1) * c_prev +
                         sigmoid(i + w_i_diag * c_prev)*t1 * self._activation(j))
                c = ((1 - sigmoid(i + w_i_diag * c_prev)) * c_prev +
                     sigmoid(i + w_i_diag * c_prev)*t2 * self._activation(j))
            else:
                c_hat = ((1 - sigmoid(i)) * c_prev +
                         sigmoid(i) * t1 * self._activation(j))
                c = ((1 - sigmoid(i)) * c_prev +
                     sigmoid(i) * t2 * self._activation(j))

            if self._cell_clip is not None:
                c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            if self._use_peepholes:
                m = (sigmoid(o + math_ops.matmul(delta_t, w_to) + w_o_diag * c) *
                     self._activation(c_hat))
            else:
                m = sigmoid(o + math_ops.matmul(delta_t, w_to)) * self._activation(c_hat)

        new_state = LSTMStateTuple(c, m)
        return m, new_state
Example #32
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "pointer_decoder".
    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (the i-th element of decoder_inputs).
        First, we run the cell
        on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      states: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)

        attns.set_shape([None, attn_size])
        inps = []
        for i in range(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.stack(decoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, ordered internally.

            # Merge input and previous attentions into one vector of the right size.
            x = core_rnn_cell_impl._linear([inp, attns], cell.output_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
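A hedged usage sketch for pointer_decoder (assuming TensorFlow 1.x; the GRU cell and the fixed shapes are illustrative assumptions):

import tensorflow as tf

batch, seq_len, hidden = 32, 10, 64
cell = tf.contrib.rnn.GRUCell(hidden)
decoder_inputs = [tf.placeholder(tf.float32, [batch, hidden]) for _ in range(seq_len)]
attention_states = tf.placeholder(tf.float32, [batch, seq_len, hidden])
initial_state = cell.zero_state(batch, tf.float32)
outputs, states, inps = pointer_decoder(decoder_inputs, initial_state,
                                        attention_states, cell, feed_prev=False)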
Example #33
    def __call__(self, inputs, state, scope=None):
        """ Phased long short-term memory cell (P-LSTM)."""
        with vs.variable_scope(scope or type(self).__name__):
            # Parameters of gates are concatenated into one multiply for efficiency.
            c_prev, h_prev = state

            # (batch_size, seq_len, 2)
            # NB: here we explicitly give t as input.
            x = tf.reshape(inputs[:, 0], (-1, 1))
            t = inputs[:, 1][-1]  # For now we only accept a single time id; handling a full batch is more complex.

            # maybe the information should come from the outside. To be defined later.

            concat = _linear([x, h_prev], 4 * self._num_units, True)
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(value=concat,
                                         num_or_size_splits=4,
                                         axis=1)

            dtype = inputs.dtype
            tau = vs.get_variable('tau',
                                  shape=[self._num_units],
                                  initializer=random_exp_initializer(
                                      0, self.tau_init),
                                  dtype=dtype)

            r_on = vs.get_variable('r_on',
                                   shape=[self._num_units],
                                   initializer=init_ops.constant_initializer(
                                       self.r_on_init),
                                   dtype=dtype)

            s = vs.get_variable(
                's',
                shape=[self._num_units],
                initializer=init_ops.random_uniform_initializer(
                    0., tau.initialized_value()),
                dtype=dtype)

            times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])
            phase = phi(times, s, tau)
            kappa = time_gate_fast(phase, r_on, self._leak_rate,
                                   self._training_phase)

            w_o_peephole = None
            if self._use_peepholes:
                w_i_peephole = vs.get_variable('W_I_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)
                w_f_peephole = vs.get_variable('W_F_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)
                w_o_peephole = vs.get_variable('W_O_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)
                f += w_f_peephole * c_prev
                i += w_i_peephole * c_prev

            new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation(
                j)
            if self._use_peepholes:
                o += w_o_peephole * new_c_tilde

            new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)
            """
            Hi all,
            Yes, Philippe, you are correct in that Equation 4 should reference c_tilde and not c.
            I can add a point to the paper to mention that, and will update Figure 1 so the line is
            correctly drawn to c_tilde instead. The intuition here is that the gates should be blind
            to the effect of the khronos gate; input, forget and output gate should all operate as if
            the cell were a normal LSTM cell, while the khronos gate allows it to either operate or
            not operate (and then linearly interpolates between these two states). If the output gate
            is influenced by the khronos gate (if the peepholes reference c instead of c_tilde), then
            the PLSTM would no longer be a gated LSTM cell, but somehow be self-dependent on the time gate's actual operation.
            I think everyone's right in that it wouldn't influence much -- but it should be updated in
            the paper. Thanks very much for pointing out the issue, Philippe!
            -Danny"""

            # Apply Khronos gate
            new_h = kappa * new_h_tilde + (1 - kappa) * h_prev
            new_c = kappa * new_c_tilde + (1 - kappa) * c_prev
            new_state = (new_c, new_h)
            return new_h, new_state
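The Phased LSTM cell above relies on phi(), time_gate_fast(), and random_exp_initializer() helpers that are not shown. As a hedged sketch, the phase computation from the Phased LSTM paper (Neil et al., 2016) can be written as follows; the project's actual helper may differ in details:

import tensorflow as tf

def phi(times, s, tau):
    """Cyclic phase in [0, 1): position of each unit inside its period tau, offset by shift s."""
    # Double modulo keeps the result non-negative even when times < s.
    return tf.mod(tf.mod(times - s, tau) + tau, tau) / tau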