Example No. 1
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
                # We start with bias of 1.0 to not reset and not update.
                ru = core_rnn_cell._linear([inputs, state],
                                           2 * self._num_units, True, 1.0)
                ru = tf.nn.sigmoid(ru)
                r, u = tf.split(ru, 2, 1)
            with tf.variable_scope("Candidate"):
                lambdas = core_rnn_cell._linear([inputs, state],
                                                self._num_weights, True)
                lambdas = tf.split(tf.nn.softmax(lambdas), self._num_weights,
                                   1)

                Ws = tf.get_variable("Ws",
                                     shape=[
                                         self._num_weights,
                                         inputs.get_shape()[1], self._num_units
                                     ])
                Ws = [
                    tf.squeeze(i) for i in tf.split(Ws, self._num_weights, 0)
                ]

                candidate_inputs = []

                for idx, W in enumerate(Ws):
                    candidate_inputs.append(
                        tf.matmul(inputs, W) * lambdas[idx])

                Wx = tf.add_n(candidate_inputs)

                c = tf.nn.tanh(Wx + core_rnn_cell._linear(
                    [r * state], self._num_units, True, scope="second"))
            new_h = u * state + (1 - u) * c
        return new_h, new_h
Example No. 2
  def call(self, inputs, state, att_score=None):
    if self._gate_linear is None:
      bias_ones = self._bias_initializer
      if self._bias_initializer is None:
        bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
      with vs.variable_scope("gates"):  # Reset gate and update gate.
        self._gate_linear = _linear(
            [inputs, state],
            2 * self._num_units,
            True,
            bias_initializer=bias_ones,
            kernel_initializer=self._kernel_initializer)

    value = math_ops.sigmoid(self._gate_linear([inputs, state]))
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

    r_state = r * state
    if self._candidate_linear is None:
      with vs.variable_scope("candidate"):
        self._candidate_linear = _linear(
            [inputs, r_state],
            self._num_units,
            True,
            bias_initializer=self._bias_initializer,
            kernel_initializer=self._kernel_initializer)
    c = self._activation(self._candidate_linear([inputs, r_state]))
    u = (1.0 - att_score) * u
    new_h = u * state + (1 - u) * c
    return new_h, new_h
Example No. 3
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        with tf.variable_scope(scope or type(self).__name__):
            c, h = state

            # change bias argument to False since LN will add bias via shift
            concat = core_rnn_cell._linear([inputs, h, c], 2 * self._num_units,
                                           False)
            i, f = tf.split(concat, 2, 1)
            j = core_rnn_cell._linear([inputs, h], self._num_units, False)

            # add layer normalization to each gate
            i = ln(i, scope='i/')
            j = ln(j, scope='j/')
            f = ln(f, scope='f/')

            new_c = (c * tf.nn.sigmoid(f + self._forget_bias) +
                     tf.nn.sigmoid(i) * self._activation(j))
            o = core_rnn_cell._linear([inputs, h, new_c], self._num_units,
                                      False)
            o = ln(o, scope='o/')

            # add layer_normalization in calculation of new hidden state
            new_h = self._activation(ln(new_c,
                                        scope='new_h/')) * tf.nn.sigmoid(o)
            new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)

            return new_h, new_state
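The `ln` helper used above is not shown in this example. A minimal layer-normalization sketch of the kind such cells typically pair with (the epsilon value and variable names are assumptions, not the original helper), assuming TensorFlow 1.x and a [batch, units] input:

import tensorflow as tf

def ln(x, scope='ln', epsilon=1e-5):
    # Layer normalization over the feature axis with a learned gain and shift.
    with tf.variable_scope(scope):
        size = x.get_shape().as_list()[-1]
        gain = tf.get_variable('gain', [size], initializer=tf.ones_initializer())
        shift = tf.get_variable('shift', [size], initializer=tf.zeros_initializer())
        mean, var = tf.nn.moments(x, axes=[1], keep_dims=True)
        return gain * (x - mean) / tf.sqrt(var + epsilon) + shift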
Example No. 4
                    def get_next_input():
                        # compute Badhanau style attention
                        # Performing a convolution, or reshaping the input to (-1, 2*d) and then doing a matmul, is essentially the same operation.
                        # See matrix_mult.py; conv2d might be faster.
                        #https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
                        encoder_features = tf.nn.conv2d(
                            encoder_output, W_att_enc, [1, 1, 1, 1], "SAME"
                        )  # shape (batch_size,max_enc_steps,1,attention_vec_size)
                        dec_portion = tf.matmul(previous_state.h, W_att_dec)
                        decoder_features = tf.expand_dims(
                            tf.expand_dims(dec_portion, 1), 1
                        )  # reshape to (batch_size, 1, 1, attention_vec_size)
                        # Python broadcasting will allow the two features to be added.
                        e_not_masked = tf.reduce_sum(
                            v_blend *
                            tf.nn.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e, (batch_size, max_enc_steps)
                        #The shape of output of a softmax is the same as the input: it just normalizes the values.
                        attn_dist = tf.nn.softmax(
                            e_not_masked)  # (batch_size, max_enc_steps)
                        attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)],
                                             message="attn_dist",
                                             first_n=5,
                                             summarize=200)

                        # Multiply every 2-D vector by the same attn_dist value, keeping one 2-D vector per batch example.
                        context_vector = tf.reduce_sum(
                            tf.reshape(attn_dist, [N, -1, 1, 1]) *
                            encoder_output,
                            [1, 2])  # shape (batch_size, attn_size).
                        context_vector = tf.reshape(context_vector,
                                                    [-1, 2 * nodes])
                        #next_input = tf.cond(self.is_train, lambda: tf.concat(
                        #    [tf.reshape(decoder_emb_inp[:, time], (N, dw)), context_vector], 1),
                        #                     lambda: tf.concat([tf.nn.embedding_lookup(word_emb_mat, prediction), context_vector], 1))
                        #output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense)
                        prediction = tf.cond(
                            self.pointer_gen,
                            lambda: execute_pointer_network(attn_dist),
                            lambda: execute_normal_decoder(
                                previous_output, W_dense, b_dense))

                        with tf.variable_scope("modified_dec_inputs",
                                               reuse=tf.AUTO_REUSE):
                            next_input = tf.cond(
                                self.is_train,
                                lambda: _linear(args=[context_vector] + [
                                    tf.reshape(decoder_emb_inp[:, time],
                                               (N, dw))
                                ],
                                                output_size=dw,
                                                bias=True),
                                lambda: _linear([context_vector] + [
                                    tf.nn.embedding_lookup(
                                        word_emb_mat, prediction)
                                ], dw, True))

                        return next_input, attn_dist
Example No. 5
 def __call__(self, inputs, state):
     with vs.variable_scope('Gates'):
         value = _linear([inputs, state], 2 * self._num_units, True, 1.0)
         r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
         r = ln(r, scope='r/')
         u = ln(u, scope='u/')
         r, u = sigmoid(r), sigmoid(u)
     with vs.variable_scope('Candidate'):
         Cand = _linear([inputs, r * state], self._num_units, True)
         c_pre = ln(Cand, scope='new_h/')
         c = self._activation(c_pre)
     new_h = u * state + (1 - u) * c
     return new_h, new_h
Example No. 6
    def __build_encoder_state_computer(self, emb_encoder_inputs, encoder_mask):
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=None):
            with variable_scope.variable_scope("seq2seq_Encoder"):
                encoder_cell_fw = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
                encoder_cell_bw = tf.nn.rnn_cell.LSTMCell(self.hidden_size)

                encoder_cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                    encoder_cell_fw, output_keep_prob=self.keep_prob)
                encoder_cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                    encoder_cell_bw, output_keep_prob=self.keep_prob)

                (outputs, encoder_state_fw,
                 encoder_state_bw) = rnn.static_bidirectional_rnn(
                     encoder_cell_fw,
                     encoder_cell_bw,
                     emb_encoder_inputs,
                     dtype=tf.float32)

                encoder_outputs = outputs

                encoder_state_c = encoder_state_bw[0]
                encoder_state_m = encoder_state_bw[1]

                with variable_scope.variable_scope("initial_transfor_c"):
                    final_state_c = core_rnn_cell._linear(
                        encoder_state_c, self.hidden_size, True)
                    final_state_c = tf.tanh(final_state_c)

                with variable_scope.variable_scope("initial_transfor_m"):
                    final_state_m = core_rnn_cell._linear(
                        encoder_state_m, self.hidden_size, True)
                    final_state_m = tf.tanh(final_state_m)

                final_state = tf.nn.rnn_cell.LSTMStateTuple(
                    final_state_c, final_state_m)

                # First calculate a concatenation of encoder outputs to put attention on.
                # cell.output_size is embedding_size
                top_states = [
                    array_ops.reshape(e,
                                      [-1, 1, encoder_cell_fw.output_size * 2])
                    for e in encoder_outputs
                ]

                attention_states = array_ops.concat(top_states, 1)

                final_attention_states = tf.multiply(encoder_mask,
                                                     attention_states)
                return final_state, final_attention_states
Example No. 7
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM).
        @param inputs: (batch, n)
        @param state: the cell states and hidden units of the two cells
        """
        with tf.variable_scope(scope or type(self).__name__):
            c1, c2, h1, h2 = state

            # change bias argument to False since LN will add bias via shift
            concat = _linear([inputs, h1, h2], 5 * self._num_units, False)

            i, j, f1, f2, o = tf.split(value=concat,
                                       num_or_size_splits=5,
                                       axis=1)

            # add layer normalization to each gate
            i = ln(i, scope='i/')
            j = ln(j, scope='j/')
            f1 = ln(f1, scope='f1/')
            f2 = ln(f2, scope='f2/')
            o = ln(o, scope='o/')

            new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) +
                     c2 * tf.nn.sigmoid(f2 + self._forget_bias) +
                     tf.nn.sigmoid(i) * self._activation(j))

            # add layer_normalization in calculation of new hidden state
            new_h = self._activation(ln(new_c,
                                        scope='new_h/')) * tf.nn.sigmoid(o)
            new_state = LSTMStateTuple(new_c, new_h)

            return new_h, new_state
Example No. 8
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           scope=None,
           squeeze=False,
           wd=0.0,
           input_keep_prob=1.0,
           is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg) for arg in flat_args
        ]
    with tf.variable_scope(scope or 'Linear'):
        flat_out = _linear(
            flat_args,
            output_size,
            bias,
            bias_initializer=tf.constant_initializer(bias_start))
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_wd(wd)

    return out
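The `flatten`, `reconstruct`, and `add_wd` helpers are defined elsewhere in that code base. A self-contained sketch of the core idea (apply one shared linear map to the last axis of a higher-rank tensor), using `tf.layers.dense` instead of the private `_linear` helper and assuming TensorFlow 1.x:

import tensorflow as tf

def linear_last_axis(x, output_size, scope='Linear'):
    # Flatten leading dims, apply W x + b to the last axis, then restore the shape.
    with tf.variable_scope(scope):
        in_dim = x.get_shape().as_list()[-1]
        flat = tf.reshape(x, [-1, in_dim])
        flat_out = tf.layers.dense(flat, output_size)
        out_shape = tf.concat([tf.shape(x)[:-1], [output_size]], axis=0)
        return tf.reshape(flat_out, out_shape)

x = tf.placeholder(tf.float32, [None, 10, 32])   # [batch, time, features]
y = linear_last_axis(x, 64)                      # -> [batch, 10, 64]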
Example No. 9
    def __call__(self, inputs, state, scope=None):
        """MGU with nunits cells."""
        with tf.variable_scope(scope or type(self).__name__):  # "MGUCell"
            with tf.variable_scope("forget_gate"):
                arg = _linear([state, inputs], self._num_units, True)
                f = math_ops.sigmoid(arg)

            print(f)

            with tf.variable_scope("candidate"):
                h_tilde = tf.tanh(
                    _linear([inputs, f * state], self._num_units, True))

            h = (1 - f) * state + f * h_tilde

        return h, h
Example No. 10
def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells."""
    with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
        with vs.variable_scope("gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            # Compute the values of both gates in a single linear op.
            value = sigmoid(
                _linear([inputs, state], 2 * self._num_units, True, 1.0))
            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
        with vs.variable_scope("candidate"):
            c = self._activation(
                _linear([inputs, r * state], self._num_units, True))

        new_h = u * state + (1 - u) * c
    # In a GRU the output and the state are the same tensor h.
    return new_h, new_h
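This mirrors the stock GRU update (r and u gates, then the candidate computed on r * state). For reference, a minimal usage sketch driving the built-in cell over a batch of sequences, assuming TensorFlow 1.x:

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 20, 32])        # [batch, time, features]
cell = tf.nn.rnn_cell.GRUCell(num_units=64)
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
# outputs: [batch, 20, 64]; final_state: [batch, 64] -- for a GRU the state is the last output.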
Example No. 11
    def call(self, input, states):
        h = states.h
        c = states.c
        z = states.z
        ha, hb, z_b = tf.split(input,
                               [self.h_above_size, self.h_below_size, 1], 1)
        s_rec = h
        s_td = z * ha
        s_bu = z_b * hb
        bias_init = tf.constant_initializer(0, dtype=tf.float32)
        concat = core_rnn_cell._linear(
            [s_rec, s_td, s_bu],
            4 * self.hstate_size + 1,
            bias=True,
            bias_initializer=bias_init)  # [B, 4d+1], where d is the state size
        pre_f, pre_i, pre_o, pre_g, pre_z_next = tf.split(
            concat, [
                self.hstate_size, self.hstate_size, self.hstate_size,
                self.hstate_size, 1
            ], 1)

        i = tf.sigmoid(pre_i)  # [B, h_l]
        g = tf.tanh(pre_g)  # [B, h_l]
        f = tf.sigmoid(pre_f)  # [B, h_l]
        o = tf.sigmoid(pre_o)  # [B, h_l]

        z = tf.squeeze(z, axis=[1])
        z_b = tf.squeeze(z_b, axis=[1])

        c_next = tf.where(
            tf.equal(z, tf.constant(1, dtype=tf.float32)),
            tf.multiply(i, g),  #flush
            tf.where(
                tf.equal(z_b, tf.constant(1, dtype=tf.float32)),
                tf.add(tf.multiply(c, f), tf.multiply(i, g)),  #update
                tf.identity(c)  #copy
            ))

        h_next = tf.where(
            tf.equal(z, tf.constant(1, dtype=tf.float32)),
            tf.multiply(o, tf.tanh(c_next)),  #flush
            tf.where(
                tf.equal(z_b, tf.constant(1, dtype=tf.float32)),
                tf.multiply(o, tf.tanh(c_next)),  #update
                tf.identity(h)  #copy
            ))

        slope_multiplier = 1
        pre_z_next = tf.sigmoid(pre_z_next * slope_multiplier)
        graph = tf.get_default_graph()
        with graph.gradient_override_map({"Round": "Identity"}):
            z_next = tf.round(pre_z_next)

        out_state = HMLSTMStateTuple(c=c_next, h=h_next, z=z_next)

        h_next = tf.nn.dropout(h_next, keep_prob=self.keep_p)
        output = tf.concat([h_next, z_next], axis=1)
        return output, out_state, concat
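The `gradient_override_map({"Round": "Identity"})` block above is a straight-through estimator: the forward pass uses the hard 0/1 boundary decision, while the backward pass treats the rounding as the identity so gradients still flow. A compact stand-alone illustration of that trick (shapes are illustrative), assuming TensorFlow 1.x:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 1])
z_soft = tf.sigmoid(x)
graph = tf.get_default_graph()
with graph.gradient_override_map({"Round": "Identity"}):
    z_hard = tf.round(z_soft)          # forward: hard 0/1 decision

grad = tf.gradients(z_hard, x)[0]      # backward: gradient of sigmoid, not zero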
Example No. 12
 def attention(query):
     """Point on hidden using hidden_features and query."""
     with vs.variable_scope("Attention"):
         y = core_rnn_cell._linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v^T * tanh(...).
         s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                 [2, 3])
         return s
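Here `v`, `hidden_features`, and `attention_vec_size` come from the enclosing decoder. A self-contained sketch of the same additive (Bahdanau-style) scoring with illustrative shapes, using `tf.layers.dense` in place of `_linear`, assuming TensorFlow 1.x:

import tensorflow as tf

batch, attn_length, attn_size, attention_vec_size = 8, 15, 128, 128
hidden = tf.placeholder(tf.float32, [batch, attn_length, 1, attn_size])   # encoder states
query = tf.placeholder(tf.float32, [batch, 256])                          # decoder state

k = tf.get_variable('AttnW', [1, 1, attn_size, attention_vec_size])
hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], 'SAME')           # W1 * h_i
v = tf.get_variable('AttnV', [attention_vec_size])

y = tf.layers.dense(query, attention_vec_size)                            # W2 * d_t
y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])               # [batch, attn_length]
attn_dist = tf.nn.softmax(s)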
Example No. 13
    def call(self, inputs, state):
        """
        Hierarchical multi-scale long short-term memory cell (HMLSTM)

        inputs: [B, hb_l + 1 + ha_l]
        state: (c=[B, h_l], h=[B, h_l], z=[B, 1])

        output: [B, h_l + 1]
        new_state: (c=[B, h_l], h=[B, h_l], z=[B, 1])
        """
        c = state.c  # [B, h_l]
        h = state.h  # [B, h_l]
        z = state.z  # [B, 1]

        in_splits = tf.constant([self._h_below_size, 1, self._h_above_size])

        hb, zb, ha = array_ops.split(
            value=inputs, num_or_size_splits=in_splits, axis=1,
            name='split')  # [B, hb_l], [B, 1], [B, ha_l]

        s_recurrent = h  # [B, h_l]

        expanded_z = z  # [B, 1]
        s_above = tf.multiply(expanded_z, ha)  # [B, ha_l]
        s_below = tf.multiply(zb, hb)  # [B, hb_l]

        length = 4 * self._num_units + 1
        states = [s_recurrent, s_above, s_below]

        bias_init = tf.constant_initializer(-1e5, dtype=tf.float32)

        # [B, 4 * h_l + 1]
        concat = core_rnn_cell._linear(states,
                                       length,
                                       bias=False,
                                       bias_initializer=bias_init)

        gate_splits = tf.constant(([self._num_units] * 4) + [1],
                                  dtype=tf.int32)

        i, g, f, o, z_tilde = array_ops.split(value=concat,
                                              num_or_size_splits=gate_splits,
                                              axis=1)

        i = tf.sigmoid(i)  # [B, h_l]
        g = tf.tanh(g)  # [B, h_l]
        f = tf.sigmoid(f)  # [B, h_l]
        o = tf.sigmoid(o)  # [B, h_l]

        new_c = self.calculate_new_cell_state(c, g, i, f, z, zb)
        new_h = self.calculate_new_hidden_state(h, o, new_c, z, zb)
        new_z = tf.expand_dims(self.calculate_new_indicator(z_tilde), -1)

        output = array_ops.concat((new_h, new_z), axis=1)  # [B, h_l + 1]
        new_state = HMLSTMState(c=new_c, h=new_h, z=new_z)

        return output, new_state
Example No. 14
 def __call__(self, inputs, state):
     '''Gated recurrent unit (GRU) with nunits cells'''
     with vs.variable_scope('Gates'):
         value = _linear([inputs, state],
                         2 * self._num_units,
                         True,
                         kernel_initializer=tf.constant_initializer(1.0))
         r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
         r = ln(r, scope='r/')
         u = ln(u, scope='u/')
         r, u = sigmoid(r), sigmoid(u)
     with vs.variable_scope('Candidate'):
         Cand = _linear([inputs, r * state], self._num_units, True)
         c_pre = ln(Cand, scope='new_h/')
         c = self._activation(c_pre)
     new_h = u * state + (1 - u) * c
     return new_h, new_h
Example No. 15
    def call(self, inputs, state):
        """
            Conditional GRU operations
            inputs: [batch_size, num_units]
            state: (h=[batch_size, num_units], c=[batch_size, num_units])
            output: [batch_size, num_units]
            new_state: (h=[batch_size, num_units], c=[batch_size, num_units])
        """

        h = state.h
        c = state.c

        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
        with vs.variable_scope('gates'):
            val_concat = core_rnn_cell._linear(
                [inputs, h, c],
                2 * self._num_units,
                bias=False,
                bias_initializer=self._bias_initializer,
                kernel_initializer=self._kernel_initializer)

        val = math_ops.sigmoid(val_concat)
        r, z = array_ops.split(value=val, num_or_size_splits=2, axis=1)

        r_state = r * h

        with vs.variable_scope('candidate'):
            hbar_out = core_rnn_cell._linear(
                [inputs, r_state, c],
                self._num_units,
                bias=False,
                bias_initializer=self._bias_initializer,
                kernel_initializer=self._kernel_initializer)

        hbar = self._activation(hbar_out)
        output = (1 - z) * h + z * hbar

        new_state = ConditionalGRUState(h=output, c=c)

        return output, new_state
Example No. 16
def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
    """
    output = input_
    for idx in range(layer_size):
        with tf.variable_scope('output_lin_%d' % idx):
            output = f(core_rnn_cell._linear(output, size, False))

        with tf.variable_scope('transform_lin_%d' % idx):
            transform_gate = tf.sigmoid(
                core_rnn_cell._linear(input_, size, False) + bias)
            carry_gate = 1. - transform_gate

        output = transform_gate * output + carry_gate * input_

    return output
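A self-contained variant of the same highway layer, with `tf.layers.dense` standing in for the private `_linear` helper (an assumption, not the original code), assuming TensorFlow 1.x and that `size` matches the input width so the carry path lines up:

import tensorflow as tf

def highway_dense(x, size, num_layers=1, carry_bias=-2.0, f=tf.nn.relu):
    # y = t * g(W1 x + b1) + (1 - t) * x, with transform gate t = sigmoid(W2 x + b2 + carry_bias).
    out = x
    for idx in range(num_layers):
        with tf.variable_scope('highway_%d' % idx):
            transform = f(tf.layers.dense(out, size, name='transform'))
            gate = tf.sigmoid(tf.layers.dense(x, size, name='gate') + carry_bias)
            out = gate * transform + (1.0 - gate) * x
    return out

x = tf.placeholder(tf.float32, [None, 128])
y = highway_dense(x, 128)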
Example No. 17
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell with attention (LSTMA)."""
    if self._state_is_tuple:
        # Split the state into three parts: the LSTM state, attns (the attention vector), and the attention states.
        state, attns, attn_states = state
    else:
        # If the state is not a tuple, slice it by length.
        states = state
        state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
        attns = array_ops.slice(
            states, [0, self._cell.state_size], [-1, self._attn_size])
        attn_states = array_ops.slice(
            states, [0, self._cell.state_size + self._attn_size],
            [-1, self._attn_size * self._attn_length])

    # The attention states are reshaped to [batch, attention window length, attention vector size].
    attn_states = array_ops.reshape(
        attn_states, [-1, self._attn_length, self._attn_size])
    input_size = self._input_size

    if input_size is None:
        input_size = inputs.get_shape().as_list()[1]
    # Project the concatenated inputs and attention vector back to input_size.
    inputs = _linear([inputs, attns], input_size, True)
    lstm_output, new_state = self._cell(inputs, state)
    if self._state_is_tuple:
        new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
    else:
        new_state_cat = new_state

    # Use the attention mechanism to compute the context vector c_t and the
    # attention (hidden) states needed at the next step.
    new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
    with vs.variable_scope("attn_output_projection"):
        # Combine the LSTM output with the context vector c_t to produce the
        # output s_t at this step.
        output = _linear([lstm_output, new_attns], self._attn_size, True)
    # Append the current output s_t to the attention states for the next step.
    new_attn_states = array_ops.concat(
        [new_attn_states, array_ops.expand_dims(output, 1)], 1)
    new_attn_states = array_ops.reshape(
        new_attn_states, [-1, self._attn_length * self._attn_size])
    new_state = (new_state, new_attns, new_attn_states)
    if not self._state_is_tuple:
        new_state = array_ops.concat(list(new_state), 1)
    return output, new_state
Example No. 18
    def __call__(self, lm_inputs, seq_len):
        """Runs RNN and returns the logits."""
        params = self.params
        emb_inputs = self.prepare_decoder_input(lm_inputs[:-1, :])
        outputs, _ = \
            tf.nn.dynamic_rnn(self.cell, emb_inputs,
                              sequence_length=seq_len,
                              dtype=tf.float32, time_major=True)
        # T x B x H => (T x B) x H
        outputs = tf.reshape(outputs, [-1, self.cell.output_size])

        with tf.variable_scope("rnn"):
            # Additional variable scope required to mimic the attention
            # decoder scope so that variable initialization is hassle free
            if params.lm_hidden_size != params.proj_size:
                with tf.variable_scope("SimpleProjection"):
                    outputs = _linear([outputs], params.proj_size, True)

            with tf.variable_scope("OutputProjection"):
                outputs = _linear([outputs], params.vocab_size, True)

        return outputs
Example No. 19
        def attention(query):
            '''
            Point on hidden using hidden_features and query
            :param query:shape:[batch_size,attention_size]
            :return:
            '''
            with vs.variable_scope('Attention'):
                # y shape: [batch_size, attention_size]
                # Equivalent to computing W2 * D_j
                y = core_rnn_cell._linear(query, attention_vec_size, True)
                # y shape: [batch_size, 1, 1, attention_size]
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

                # Attention mask is a softmax of v^T * tanh(...)
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s
Example No. 20
            def attention(query):
                """Put attention masks on hidden using hidden_features and query."""
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(
                        query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)

                for a in range(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % a):
                        y = core_rnn_cell._linear(query, attention_vec_size,
                                                  True)
                        y = array_ops.reshape(y,
                                              [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[a] * math_ops.tanh(hidden_features[a] + y),
                            [2, 3])
                        a = nn_ops.softmax(s)
                        #a = a + 1e-5

                        a1 = tf.multiply(a, encoder_mask)
                        #print (mask_a.get_shape())
                        floor = math_ops.reduce_sum(a1, axis=1)
                        floor = tf.stack([floor], axis=1)

                        #print (floor.get_shape())
                        a2 = tf.truediv(a1, floor)
                        nan_bool = tf.is_nan(a2)
                        #mask_a = tf.select(nan_bool, a1+0.1, a2)
                        mask_a = a2

                        #print (mask_a.get_shape())
                        #print ("_____________")

                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(mask_a, [-1, attn_length, 1, 1])
                            * hidden, [1, 2])

                        ds.append(array_ops.reshape(
                            d, [-1, attn_size]))  #remember this size
                return ds, mask_a
Example No. 21
                def attention(query, prev_alpha):
                    """Put attention masks on hidden using hidden_features and query."""
                    with tf.variable_scope("Attention"):
                        y = _linear(query, params.attention_vec_size, True)
                        y = tf.reshape(y, [-1, 1, 1, params.attention_vec_size])
                        s = tf.reduce_sum(
                            v * tf.tanh(hidden_features + y), [2, 3])

                        alpha = tf.nn.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha, reduction_indices=[1], keepdims=True)
                        norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term

                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        context_vec = tf.reduce_sum(alpha * hidden, [1, 2])
                    return tuple([context_vec, alpha])
Example No. 22
    def __call__(self, inputs, state, scope=None):
        """Run one step of minimal RNN.
          Args:
            inputs: input Tensor, 2D, batch x num_units.
            state: a state Tensor, `2-D, batch x state_size`.
          Returns:
            A tuple containing:
            - A `2-D, [batch x num_units]`, Tensor representing the output of the
              cell after reading `inputs` when previous state was `state`.
            - A `2-D, [batch x num_units]`, Tensor representing the new state of cell after reading `inputs` when
              the previous state was `state`.  Same type and shape(s) as `state`.
          Raises:
            ValueError:
            - If input size cannot be inferred from inputs via
              static shape inference.
            - If state is not `2D`.
        """
        # Phi projection to a latent space / candidate
        #z = inputs
        z = self._activation(inputs)
        """for i, layer_size in enumerate(self._num_units):
          with tf.variable_scope("phi_" + str(i)):
            z = self._activation(_linear(
                z,
                layer_size,
                True,
                bias_initializer=self._bias_initializer,
                kernel_initializer=self._kernel_initializer))"""

        # Update gate
        bias_ones = self._bias_initializer
        if self._bias_initializer is None:
            bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
        with tf.variable_scope("update_gate"):
            arg = _linear([state, z],
                          self._num_units[-1],
                          True,
                          bias_initializer=bias_ones,
                          kernel_initializer=self._kernel_initializer)
            u = math_ops.sigmoid(arg)

        # Activation step
        new_h = u * state + (1 - u) * z

        return new_h, new_h
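As a reference for the update rule above, a small NumPy sketch of one MinimalRNN step; the shapes, the initialization, and the choice of tanh as the Phi projection are illustrative assumptions:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def minimal_rnn_step(x, h, U_h, U_z, b):
    # z = Phi(x); u = sigmoid(h U_h + z U_z + b); h' = u * h + (1 - u) * z
    z = np.tanh(x)
    u = sigmoid(h @ U_h + z @ U_z + b)
    return u * h + (1.0 - u) * z

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))                  # batch of 2, 4 units
h = np.zeros((2, 4))
U_h, U_z = rng.standard_normal((4, 4)), rng.standard_normal((4, 4))
b = np.ones(4)                                   # bias of 1.0, matching the update gate above
h = minimal_rnn_step(x, h, U_h, U_z, b)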
Example No. 23
         def attention(query, attn_size, V, hidden_features,
                       attn_length, attn_states, name, mask):

             if nest.is_sequence(query):
                 query_list = nest.flatten(query)
                 query = tf.concat(query_list, 1)

             with tf.variable_scope("Attention" + name) as scope:
                 y = _linear(
                     args=query, output_size=attn_size, bias=True,
                     bias_initializer=self.initializer,
                     kernel_initializer=self.initializer)

                 y = tf.reshape(y, [-1, 1, 1, attn_size])

                 s = tf.reduce_sum(V * tf.nn.tanh(hidden_features + y), [2, 3])
                 a_masked = masked_attention(s, mask)
                 c = tf.reduce_sum(tf.reshape(
                     a_masked, [-1, attn_length, 1, 1]) * attn_states, [1, 2])
                 cs = tf.reshape(c, [-1, attn_size])

             return cs, a_masked
Example No. 24
                hidden_conv = tf.expand_dims(state_outputs, 2)
                # k: [filter_height, filter_width, in_channels, out_channels]
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                # [bs, nstep, 1, embed size * 2]
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1], "SAME")  
                # [bs, nstep, embed size * 2]
                hidden_features = tf.reshape(hidden_features, origin_shape)
                # [bs, 1, nstep, embed size * 2]
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])

                slot_inputs_shape = tf.shape(slot_inputs)
                # [bs * nstep, embed size * 2]
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                # [bs * nstep, embed size * 2]
                y = core_rnn_cell._linear(slot_inputs, attn_size, True) 
                # [bs , nstep, embed size * 2]
                y = tf.reshape(y, slot_inputs_shape)
                # [bs , nstep, 1, embed size * 2]
                y = tf.expand_dims(y, 2)
                # [bs , nstep, nstep] = [bs, 1, nstep, hidden size] + [bs , nstep, 1, embed size * 2]
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [bs, nstep, nstep, 1]
                a = tf.expand_dims(a, -1)
                # slot_d shape = [bs, nstep, embed size * 2]
                slot_d = tf.reduce_sum(a * hidden, [2])
                slot_output = tf.reshape(slot_d, [-1, attn_size])
        else:
            attn_size = state_shape[2].value
            slot_d = state_outputs
Example No. 25
def createModel(
    input_data,
    input_size,
    sequence_length,
    slot_size,
    intent_size,
    layer_size=128,
    isTraining=True,
):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)

    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)

    embedding = tf.get_variable("embedding", [input_size, layer_size])
    inputs = tf.nn.embedding_lookup(embedding, input_data)

    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw,
        cell_bw,
        inputs,
        sequence_length=sequence_length,
        dtype=tf.float32)

    final_state = tf.concat([
        final_state[0][0], final_state[0][1], final_state[1][0],
        final_state[1][1]
    ], 1)
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope("attention"):
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope("slot_attn"):
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                hidden = tf.expand_dims(state_outputs, 1)
                hidden_conv = tf.expand_dims(state_outputs, 2)
                # hidden_conv shape = [batch, sentence length, 1, hidden size]
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                hidden_features = tf.reshape(hidden_features, origin_shape)
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])

                slot_inputs_shape = tf.shape(slot_inputs)
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                y = tf.reshape(y, slot_inputs_shape)
                y = tf.expand_dims(y, 2)
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [batch, sentence length, sentence length, 1]
                a = tf.expand_dims(a, -1)
                slot_d = tf.reduce_sum(a * hidden, [2])
        else:
            attn_size = state_shape[2].value
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope("intent_attn"):
            attn_size = state_shape[2].value
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])

            y = core_rnn_cell._linear(intent_input, attn_size, True)
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            a = tf.expand_dims(a, -1)
            a = tf.expand_dims(a, -1)
            d = tf.reduce_sum(a * hidden, [1, 2])

            if add_final_state_to_intent:
                intent_output = tf.concat([d, intent_input], 1)
            else:
                intent_output = d

        with tf.variable_scope("slot_gated"):
            intent_gate = core_rnn_cell._linear(intent_output, attn_size, True)
            intent_gate = tf.reshape(
                intent_gate, [-1, 1, intent_gate.get_shape()[1].value])
            v1 = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                slot_gate = v1 * tf.tanh(slot_d + intent_gate)
            else:
                slot_gate = v1 * tf.tanh(state_outputs + intent_gate)
            slot_gate = tf.reduce_sum(slot_gate, [2])
            slot_gate = tf.expand_dims(slot_gate, -1)
            if not remove_slot_attn:
                slot_gate = slot_d * slot_gate
            else:
                slot_gate = state_outputs * slot_gate
            slot_gate = tf.reshape(slot_gate, [-1, attn_size])
            slot_output = tf.concat([slot_gate, slot_inputs], 1)

    with tf.variable_scope("intent_proj"):
        intent = core_rnn_cell._linear(intent_output, intent_size, True)

    with tf.variable_scope("slot_proj"):
        slot = core_rnn_cell._linear(slot_output, slot_size, True)

    outputs = [slot, intent]
    return outputs
Example No. 26
  def call(self, inputs, state, att_score=None):
    time_now_score = tf.expand_dims(inputs[:,-1], -1)
    time_last_score = tf.expand_dims(inputs[:,-2], -1)
    inputs = inputs[:,:-2]
    inputs = inputs * att_score
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
            
    if self._time_kernel_w1 is None:
      scope = vs.get_variable_scope()
      with vs.variable_scope(
          scope, initializer=self._initializer) as unit_scope:
        with vs.variable_scope(unit_scope):
          self._time_input_w1 = vs.get_variable(
              "_time_input_w1", shape=[self._num_units], dtype=dtype)
          self._time_input_bias1 = vs.get_variable(
              "_time_input_bias1", shape=[self._num_units], dtype=dtype)
          self._time_input_w2 = vs.get_variable(
              "_time_input_w2", shape=[self._num_units], dtype=dtype)
          self._time_input_bias2 = vs.get_variable(
              "_time_input_bias2", shape=[self._num_units], dtype=dtype)
          self._time_kernel_w1 = vs.get_variable(
              "_time_kernel_w1", shape=[input_size, self._num_units], dtype=dtype)
          self._time_kernel_t1 = vs.get_variable(
              "_time_kernel_t1", shape=[self._num_units, self._num_units], dtype=dtype)
          self._time_bias1 = vs.get_variable(
              "_time_bias1", shape=[self._num_units], dtype=dtype)
          self._time_kernel_w2 = vs.get_variable(
              "_time_kernel_w2", shape=[input_size, self._num_units], dtype=dtype)
          self._time_kernel_t2 = vs.get_variable(
              "_time_kernel_t2", shape=[self._num_units, self._num_units], dtype=dtype)
          self._time_bias2 = vs.get_variable(
              "_time_bias2", shape=[self._num_units], dtype=dtype)
          self._o_kernel_t1 = vs.get_variable(
              "_o_kernel_t1", shape=[self._num_units, self._num_units], dtype=dtype)    
          self._o_kernel_t2 = vs.get_variable(
              "_o_kernel_t2", shape=[self._num_units, self._num_units], dtype=dtype)  
                
    time_now_input = tf.nn.tanh(time_now_score * self._time_input_w1 + self._time_input_bias1)
    time_last_input = tf.nn.tanh(time_last_score * self._time_input_w2 + self._time_input_bias2)      

    time_now_state = math_ops.matmul(inputs, self._time_kernel_w1) + math_ops.matmul(time_now_input, self._time_kernel_t1) + self._time_bias1
    time_last_state = math_ops.matmul(inputs, self._time_kernel_w2) + math_ops.matmul(time_last_input, self._time_kernel_t2) + self._time_bias2
    
    if self._linear1 is None:
      scope = vs.get_variable_scope()
      with vs.variable_scope(
          scope, initializer=self._initializer) as unit_scope:
        if self._num_unit_shards is not None:
          unit_scope.set_partitioner(
              partitioned_variables.fixed_size_partitioner(
                  self._num_unit_shards))
        self._linear1 = _linear([inputs, m_prev], 4 * self._num_units, True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    lstm_matrix = self._linear1([inputs, m_prev])
    i, j, f, o = array_ops.split(
        value=lstm_matrix, num_or_size_splits=4, axis=1)
    o = o + math_ops.matmul(time_now_input, self._o_kernel_t1) + math_ops.matmul(time_last_input, self._o_kernel_t2)   
    # Diagonal connections
    if self._use_peepholes and not self._w_f_diag:
      scope = vs.get_variable_scope()
      with vs.variable_scope(
          scope, initializer=self._initializer) as unit_scope:
        with vs.variable_scope(unit_scope):
          self._w_f_diag = vs.get_variable(
              "w_f_diag", shape=[self._num_units], dtype=dtype)
          self._w_i_diag = vs.get_variable(
              "w_i_diag", shape=[self._num_units], dtype=dtype)
          self._w_o_diag = vs.get_variable(
              "w_o_diag", shape=[self._num_units], dtype=dtype)

    if self._use_peepholes:
      c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * sigmoid(time_last_state) * c_prev +
           sigmoid(i + self._w_i_diag * c_prev) * sigmoid(time_now_state) * self._activation(j))
    else:
      c = (sigmoid(f + self._forget_bias) * sigmoid(time_last_state) * c_prev + sigmoid(i) * sigmoid(time_now_state) * self._activation(j))

    if self._cell_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
      # pylint: enable=invalid-unary-operand-type
    if self._use_peepholes:
      m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    else:
      m = sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
      if self._linear2 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope, initializer=self._initializer):
          with vs.variable_scope("projection") as proj_scope:
            if self._num_proj_shards is not None:
              proj_scope.set_partitioner(
                  partitioned_variables.fixed_size_partitioner(
                      self._num_proj_shards))
            self._linear2 = _linear(m, self._num_proj, False)
      m = self._linear2(m)

      if self._proj_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
        # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
Example No. 27
    def step(self, time, inputs, state, name=None):
        """Perform a decoding step.
        Args:
        time: scalar `int32` tensor.
        inputs: A (structure of) input tensors.
        state: A (structure of) state tensors and TensorArrays.
        name: Name scope for any created operations.
        Returns:
        `(outputs, next_state, next_inputs, finished)`.
        """
        with ops.name_scope(name, 'PGDecoderStep', (time, inputs, state)):
            cell_outputs, cell_state = self._cell(inputs, state)
            # the first cell state contains attention, which is context
            attention = cell_state[0].attention
            att_cell_state = cell_state[0].cell_state
            alignments = cell_state[0].alignments

            with tf.variable_scope('calculate_pgen'):
                p_gen = _linear([attention, inputs, att_cell_state], 1, True)
                p_gen = tf.sigmoid(p_gen)

            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)

            vocab_dist = tf.nn.softmax(cell_outputs) * p_gen

            # z = tf.reduce_sum(alignments,axis=1)
            # z = tf.reduce_sum(tf.cast(tf.less_equal(alignments, 0),tf.int32))
            alignments = alignments * (1 - p_gen)

            # x = tf.reduce_sum(tf.cast(tf.less_equal((1-p_gen), 0),tf.int32))
            # y = tf.reduce_sum(tf.cast(tf.less_equal(alignments[3], 0),tf.int32))

            # this is only for debug
            # alignments2 =  tf.Print(alignments2,[tf.shape(inputs),x,y,alignments[2][9:12]],message="zeros in vocab dist and alignments")

            # Since we have OOV words, we need to expand the vocab dist.
            vocab_size = tf.shape(vocab_dist)[-1]
            extended_vsize = vocab_size + self.source_oov_words
            batch_size = tf.shape(vocab_dist)[0]
            extra_zeros = tf.zeros((batch_size, self.source_oov_words))
            # batch * extend vocab size
            vocab_dists_extended = tf.concat(axis=-1,
                                             values=[vocab_dist, extra_zeros])
            # vocab_dists_extended = tf.Print(vocab_dists_extended,[tf.shape(vocab_dists_extended),self.source_oov_words],message='vocab_dists_extended size')

            batch_nums = tf.range(0, limit=batch_size)  # shape (batch_size)
            batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)
            attn_len = tf.shape(self.source_extend_tokens)[
                1]  # number of states we attend over
            batch_nums = tf.tile(batch_nums,
                                 [1, attn_len])  # shape (batch_size, attn_len)
            indices = tf.stack((batch_nums, self.source_extend_tokens),
                               axis=2)  # shape (batch_size, enc_t, 2)
            shape = [batch_size, extended_vsize]
            attn_dists_projected = tf.scatter_nd(indices, alignments, shape)

            final_dists = attn_dists_projected + vocab_dists_extended
            # final_dists = tf.Print(final_dists,[tf.reduce_sum(tf.cast(tf.less_equal(final_dists[0],0),tf.int32))],message='final dist')
            # note: sample_ids will contains OOV words
            sample_ids = self._helper.sample(time=time,
                                             outputs=final_dists,
                                             state=cell_state)

            (finished, next_inputs, next_state) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                sample_ids=sample_ids,
            )

            outputs = tf.contrib.seq2seq.BasicDecoderOutput(
                final_dists, sample_ids)
            return (outputs, next_state, next_inputs, finished)
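The `tf.scatter_nd` call above projects the attention weights over source positions onto the extended vocabulary, summing weights that fall on the same token id. A tiny stand-alone illustration with made-up shapes and token ids, assuming TensorFlow 1.x:

import tensorflow as tf

# Batch of 2, attention over 3 source positions, extended vocab of 5 ids.
alignments = tf.constant([[0.2, 0.5, 0.3],
                          [0.6, 0.1, 0.3]])
source_extend_tokens = tf.constant([[4, 2, 2],    # token id at each source position
                                    [0, 3, 4]])
batch_nums = tf.tile(tf.expand_dims(tf.range(2), 1), [1, 3])    # (batch, attn_len)
indices = tf.stack((batch_nums, source_extend_tokens), axis=2)  # (batch, attn_len, 2)
attn_dists_projected = tf.scatter_nd(indices, alignments, [2, 5])
# Row 0 becomes [0, 0, 0.8, 0, 0.2]: the two weights on id 2 are summed.

with tf.Session() as sess:
    print(sess.run(attn_dists_projected))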
Example No. 28
def createModel(input_data, input_size, sequence_length, slot_size, intent_size, layer_size=128, isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)

    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=0.5,
                                                output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=0.5,
                                                output_keep_prob=0.5)
    # embedding layer, [word size, embed size] 724, 64
    if arg.embedding_path:
        embedding_weight = np.load(arg.embedding_path)
        embedding = tf.Variable(embedding_weight, name='embedding', dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    # [bs, nstep, embed size]
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    # state_outputs: [bs, nstep, embed size], final_state: [4, bs, embed size] include cell state * 2, hidden state * 2
    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs,
                                                                 sequence_length=sequence_length, dtype=tf.float32)
    # [bs, embed size * 4]
    final_state = tf.concat([final_state[0][0], final_state[0][1], final_state[1][0], final_state[1][1]], 1)
    # [bs, nstep, embed size * 2]
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope('attention'):
        # [bs, nstep, embed size * 2]
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope('slot_attn'):
                # embed size * 2
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                # [bs, 1, nstep, embed size * 2]
                hidden = tf.expand_dims(state_outputs, 1)
                # [bs, nstep, 1, embed size * 2]
                hidden_conv = tf.expand_dims(state_outputs, 2)
                # k: [filter_height, filter_width, in_channels, out_channels]
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                # [bs, nstep, 1, embed size * 2]
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1], "SAME")
                # [bs, nstep, embed size * 2]
                hidden_features = tf.reshape(hidden_features, origin_shape)
                # [bs, 1, nstep, embed size * 2]
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])

                slot_inputs_shape = tf.shape(slot_inputs)
                # [bs * nstep, embed size * 2]
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                # [bs * nstep, embed size * 2]
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                # [bs , nstep, embed size * 2]
                y = tf.reshape(y, slot_inputs_shape)
                # [bs , nstep, 1, embed size * 2]
                y = tf.expand_dims(y, 2)
                # [bs , nstep, nstep] = [bs, 1, nstep, hidden size] + [bs , nstep, 1, embed size * 2]
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                # a shape = [bs, nstep, nstep, 1]
                a = tf.expand_dims(a, -1)
                # slot_d shape = [bs, nstep, embed size * 2]
                slot_d = tf.reduce_sum(a * hidden, [2])
        else:
            attn_size = state_shape[2].value
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            attn_size = state_shape[2].value
            # [bs, nstep, 1, embed size * 2]
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            # [bs, nstep, 1, embed size * 2]
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])

            # [bs, embed size * 4]
            y = core_rnn_cell._linear(intent_input, attn_size, True)
            # [bs, 1, 1, embed size * 4]
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            # [bs, nstep]
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            # [bs, nstep, 1]
            a = tf.expand_dims(a, -1)
            # [bs, nstep, 1, 1]
            a = tf.expand_dims(a, -1)
            # [bs, embed size * 2]
            d = tf.reduce_sum(a * hidden, [1, 2])

            if add_final_state_to_intent:
                # [bs, embed size * 2 + embed size * 4]
                intent_output = tf.concat([d, intent_input], 1)
            else:
                intent_output = d

        with tf.variable_scope('slot_gated'):
            # [bs, embed size * 2]
            intent_gate = core_rnn_cell._linear(intent_output, attn_size, True)
            # [bs, 1,embed size * 2]
            intent_gate = tf.reshape(intent_gate, [-1, 1, intent_gate.get_shape()[1].value])
            v1 = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:
                # [bs, nstep, embed size * 2]
                slot_gate = v1 * tf.tanh(slot_d + intent_gate)
            else:
                # [bs, nstep, embed size * 2]
                slot_gate = v1 * tf.tanh(state_outputs + intent_gate)
            # [bs, nstep]
            slot_gate = tf.reduce_sum(slot_gate, [2])
            # [bs, nstep, 1]
            slot_gate = tf.expand_dims(slot_gate, -1)
            if not remove_slot_attn:
                # [bs, nstep, embed size * 2]
                slot_gate = slot_d * slot_gate
            else:
                slot_gate = state_outputs * slot_gate
            # [bs * nstep, embed size * 2]
            slot_gate = tf.reshape(slot_gate, [-1, attn_size])
            # [bs * nstep, embed size * 4]
            slot_output = tf.concat([slot_gate, slot_inputs], 1)

    with tf.variable_scope('intent_proj'):
        # [bs, intent_size]
        intent = core_rnn_cell._linear(intent_output, intent_size, True)
    with tf.variable_scope('slot_proj'):
        # [bs * nstep, slot_size]
        slot = core_rnn_cell._linear(slot_output, slot_size, True)
        if arg.use_crf:
            nstep = tf.shape(state_outputs)[1]
            slot = tf.reshape(slot, [-1, nstep, slot_size])

    outputs = [slot, intent]
    return outputs
Example No. 29
def createModel(input_data,
                input_size,
                sequence_length,
                slots,
                slot_size,
                intent_size,
                layer_size=128,
                isTraining=True):
    cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)

    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)
    if arg.embedding_path:
        print("Loading embedding with numpy!")
        embedding_weight = np.load(arg.embedding_path)
        embedding = tf.Variable(embedding_weight,
                                name='embedding',
                                dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    state_outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw,
        cell_bw,
        inputs,
        sequence_length=sequence_length,
        dtype=tf.float32)
    final_state = tf.concat([
        final_state[0][0], final_state[0][1], final_state[1][0],
        final_state[1][1]
    ], 1)
    state_outputs = tf.concat([state_outputs[0], state_outputs[1]], 2)
    state_shape = state_outputs.get_shape()

    with tf.variable_scope('attention'):
        slot_inputs = state_outputs
        if not remove_slot_attn:
            with tf.variable_scope('slot_attn'):
                attn_size = state_shape[2].value
                origin_shape = tf.shape(state_outputs)
                hidden = tf.expand_dims(state_outputs, 1)
                hidden_conv = tf.expand_dims(state_outputs, 2)
                k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
                hidden_features = tf.nn.conv2d(hidden_conv, k, [1, 1, 1, 1],
                                               "SAME")
                hidden_features = tf.reshape(hidden_features, origin_shape)
                hidden_features = tf.expand_dims(hidden_features, 1)
                v = tf.get_variable("AttnV", [attn_size])
                slot_inputs_shape = tf.shape(slot_inputs)
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                y = core_rnn_cell._linear(slot_inputs, attn_size, True)
                y = tf.reshape(y, slot_inputs_shape)
                y = tf.expand_dims(y, 2)
                s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [3])
                a = tf.nn.softmax(s)
                a = tf.expand_dims(a, -1)
                slot_d = tf.reduce_sum(a * hidden, [2])
                slot_reinforce_state = tf.expand_dims(slot_d, 2)
        else:
            attn_size = state_shape[2].value
            slot_d = slot_inputs
            slot_reinforce_state = tf.expand_dims(slot_inputs, 2)
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            attn_size = state_shape[2].value
            hidden = tf.expand_dims(state_outputs, 2)
            k = tf.get_variable("AttnW", [1, 1, attn_size, attn_size])
            hidden_features = tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [attn_size])

            y = core_rnn_cell._linear(intent_input, attn_size, True)
            y = tf.reshape(y, [-1, 1, 1, attn_size])
            s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
            a = tf.nn.softmax(s)
            a = tf.expand_dims(a, -1)
            a = tf.expand_dims(a, -1)
            d = tf.reduce_sum(a * hidden, [1, 2])
            r_intent = d
            intent_context_states = d

        if arg.priority_order == 'intent_first':
            for n in range(arg.iteration_num):
                with tf.variable_scope('intent_subnet' + str(n - 1)):
                    attn_size = state_shape[2].value
                    hidden = tf.expand_dims(state_outputs, 2)
                    k1 = tf.get_variable("W1", [1, 1, attn_size, attn_size])
                    k2 = tf.get_variable('W2', [1, 1, attn_size, attn_size])
                    slot_reinforce_features = tf.nn.conv2d(
                        slot_reinforce_state, k1, [1, 1, 1, 1], "SAME")
                    hidden_features = tf.nn.conv2d(hidden, k2, [1, 1, 1, 1],
                                                   "SAME")
                    v1 = tf.get_variable("AttnV", [attn_size])
                    bias = tf.get_variable("Bias", [attn_size])
                    s = tf.reduce_sum(
                        v1 * tf.tanh(hidden_features +
                                     slot_reinforce_features + bias), [2, 3])
                    a = tf.nn.softmax(s)
                    a = tf.expand_dims(a, -1)
                    a = tf.expand_dims(a, -1)
                    r = tf.reduce_sum(a * slot_reinforce_state, [1, 2])

                    r_intent = r + intent_context_states

                    intent_output = tf.concat([r_intent, intent_input], 1)

                with tf.variable_scope('slot_subnet' + str(n - 1)):
                    intent_gate = core_rnn_cell._linear(
                        r_intent, attn_size, True)
                    intent_gate = tf.reshape(
                        intent_gate,
                        [-1, 1, intent_gate.get_shape()[1].value])
                    v1 = tf.get_variable("gateV", [attn_size])
                    relation_factor = v1 * tf.tanh(slot_d + intent_gate)
                    relation_factor = tf.reduce_sum(relation_factor, [2])
                    relation_factor = tf.expand_dims(relation_factor, -1)
                    slot_reinforce_state1 = slot_d * relation_factor
                    slot_reinforce_state = tf.expand_dims(
                        slot_reinforce_state1, 2)
                    slot_reinforce_vector = tf.reshape(slot_reinforce_state1,
                                                       [-1, attn_size])
                    slot_output = tf.concat(
                        [slot_reinforce_vector, slot_inputs], 1)

        else:
            for n in range(arg.iteration_num):
                with tf.variable_scope('slot_subnet' + str(n - 1)):
                    intent_gate = core_rnn_cell._linear(
                        r_intent, attn_size, True)
                    intent_gate = tf.reshape(
                        intent_gate,
                        [-1, 1, intent_gate.get_shape()[1].value])
                    v1 = tf.get_variable("gateV", [attn_size])
                    relation_factor = v1 * tf.tanh(slot_d + intent_gate)
                    relation_factor = tf.reduce_sum(relation_factor, [2])
                    relation_factor = tf.expand_dims(relation_factor, -1)
                    slot_reinforce_state = slot_d * relation_factor
                    slot_reinforce_vector = tf.reshape(slot_reinforce_state,
                                                       [-1, attn_size])
                    slot_output = tf.concat(
                        [slot_reinforce_vector, slot_inputs], 1)

                with tf.variable_scope('intent_subnet' + str(n - 1)):
                    attn_size = state_shape[2].value
                    hidden = tf.expand_dims(state_outputs, 2)
                    slot_reinforce_output = tf.expand_dims(
                        slot_reinforce_state, 2)
                    k1 = tf.get_variable("W1", [1, 1, attn_size, attn_size])
                    k2 = tf.get_variable('W2', [1, 1, attn_size, attn_size])
                    slot_features = tf.nn.conv2d(slot_reinforce_output, k1,
                                                 [1, 1, 1, 1], "SAME")
                    hidden_features = tf.nn.conv2d(hidden, k2, [1, 1, 1, 1],
                                                   "SAME")
                    v1 = tf.get_variable("AttnV", [attn_size])
                    bias = tf.get_variable("Bias", [attn_size])
                    s = tf.reduce_sum(
                        v1 * tf.tanh(hidden_features + slot_features + bias),
                        [2, 3])
                    a = tf.nn.softmax(s)
                    a = tf.expand_dims(a, -1)
                    a = tf.expand_dims(a, -1)
                    r = tf.reduce_sum(a * slot_reinforce_output, [1, 2])

                    r_intent = r + intent_context_states

                    intent_output = tf.concat([r_intent, intent_input], 1)

    with tf.variable_scope('intent_proj'):
        intent = core_rnn_cell._linear(intent_output, intent_size, True)
    with tf.variable_scope('slot_proj'):
        slot = core_rnn_cell._linear(slot_output, slot_size, True)
        if arg.use_crf:
            nstep = tf.shape(state_outputs)[1]
            slot = tf.reshape(slot, [-1, nstep, slot_size])
    outputs = [slot, intent]
    return outputs
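
This listing reads arg and remove_slot_attn from the enclosing module instead of taking them as arguments. A hedged argparse sketch of the configuration it appears to expect is shown below; the flag names are inferred from the attributes used and the defaults are assumptions.

# Assumed module-level configuration (inferred, not from the original repo).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--embedding_path', type=str, default=None,
                    help='optional .npy file with pretrained word embeddings')
parser.add_argument('--priority_order', type=str, default='slot_first',
                    choices=['slot_first', 'intent_first'],
                    help='which subnet of the iteration runs first')
parser.add_argument('--iteration_num', type=int, default=1,
                    help='number of slot/intent interaction iterations')
parser.add_argument('--use_crf', action='store_true',
                    help='decode slots with a CRF instead of a per-token softmax')
arg = parser.parse_args()

remove_slot_attn = False  # module-level switch read inside createModel
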
Ejemplo n.º 30
0
def createModel(input_data,
                input_size,
                sequence_length,
                slot_size,
                intent_size,
                remove_slot_attn,
                add_final_state_to_intent,
                use_crf,
                layer_size=128,
                isTraining=True,
                embedding_path=None,
                use_batch_crossent=True):

    #cell_fw = tf.contrib.rnn.BasicLSTMCell(layer_size)
    cell_fw = tf.nn.rnn_cell.LSTMCell(layer_size)
    cell_bw = tf.nn.rnn_cell.LSTMCell(layer_size)
    #cell_bw = tf.contrib.rnn.BasicLSTMCell(layer_size)

    if isTraining:
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw,
                                                input_keep_prob=0.5,
                                                output_keep_prob=0.5)
    # embedding layer: [vocab size, embed size], e.g. 724 x 64
    if embedding_path:
        embedding_weight = np.load(embedding_path)
        embedding = tf.Variable(embedding_weight,
                                name='embedding',
                                dtype=tf.float32)
    else:
        embedding = tf.get_variable('embedding', [input_size, layer_size])
    # embedding:[vocab_size, embedding_size]
    # input_data:[batch, input_sequence_length]
    # inputs:[batch, input_sequence_length, embedding_size]
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    # state_outputs: [batch, nstep, embed size]; final_state: 4 x [bs, embed size] (2 cell states + 2 hidden states)

    # (output_fw, output_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(...)
    # output_fw: [batch, input_sequence_length, num_units], values are the hidden states
    # output_bw: [batch, input_sequence_length, num_units], values are the hidden states
    # (cell_state_fw, hidden_state_fw) = state_fw
    # cell_state_fw: [batch, num_units]
    # hidden_state_fw: [batch, num_units]
    (output_fw, output_bw), (state_fw,
                             state_bw) = tf.nn.bidirectional_dynamic_rnn(
                                 cell_fw=cell_fw,
                                 cell_bw=cell_bw,
                                 inputs=inputs,
                                 sequence_length=sequence_length,
                                 dtype=tf.float32)
    (cell_state_fw, hidden_state_fw) = state_fw
    (cell_state_bw, hidden_state_bw) = state_bw
    # [batch, hidden_size * 4]
    final_state = tf.concat(
        [cell_state_fw, hidden_state_fw, cell_state_bw, hidden_state_bw],
        axis=1)
    # sequence_outputs:[batch, input_sequence_length, hidden_size* 2]
    sequence_outputs = tf.concat([output_fw, output_bw], axis=2)
    print("cell_state_fw:", cell_state_fw, " hidden_state_fw:",
          hidden_state_fw)
    print("final_state:", final_state)
    print("squence_outputs:", sequence_outputs)

    # tensor.get_shape() returns a static TensorShape, not a tensor
    sequence_output_shape = sequence_outputs.get_shape()  # [batch, input_sequence_length, hidden_size * 2]
    """
    sequence output作为attention的输入
    计算context_i,即论文中C_i^S
    """
    with tf.variable_scope('attention'):
        # state_outputs:[batch, input_sequence_length, hidden_size* 2]
        slot_inputs = sequence_outputs
        if not remove_slot_attn:  # slot attention enabled
            with tf.variable_scope('slot_attn'):
                """
                e_{i,k}=V^T*tanh(W_{he}*h_k+W_{ie}*h_i)
                alpha_{i,j} = softmax(e{i,j})
                c_i = sum_{j}(alpha_{i,j}*h_j)
                
                y_i=softmax(W_hy*(h_i+c_i))
               
                其中: 
                W_{he}*h_k:用的是卷积实现
                W_{ie}*h_i:用的是线性映射 _linear()
                """
                """
                hidden_features = W_{he}*h_k:用的是卷积实现
                """
                # attn_size = hidden_size * 2
                attn_size = sequence_output_shape[2].value
                # [batch, height=input_sequence_length, width=1, channel=hidden_size * 2]
                hidden_input_conv = tf.expand_dims(sequence_outputs, axis=2)
                # W_he: [filter_height=1, filter_width=1, in_channels=hidden*2, out_channels=hidden*2]; note the 1x1 kernel
                W_he = tf.get_variable("slot_AttnW",
                                       shape=[1, 1, attn_size, attn_size])
                # Intuition: a 1x1 convolution over the channel (hidden) dimension is
                # equivalent to applying W_{he} to h_k at every time step, i.e. the same
                # linear map with shared weights (reshape + matmul would work as well).
                # hidden_features:[batch, height=input_sequence_length, width=1, channel=hidden_size * 2]
                hidden_features = tf.nn.conv2d(input=hidden_input_conv,
                                               filter=W_he,
                                               strides=[1, 1, 1, 1],
                                               padding="SAME")
                # hidden_features:[batch, 1,input_sequence_length, hidden_size * 2]
                hidden_features = tf.transpose(hidden_features,
                                               perm=[0, 2, 1, 3])
                """
                # 下面是原作者的写法,比较冗余
                origin_shape = tf.shape(sequence_outputs) # 返回的是tensor
                # hidden_features:[batch,input_sequence_length,hidden_size * 2]
                hidden_features = tf.reshape(hidden_features, origin_shape)
                # hidden_features:[batch, 1,input_sequence_length, hidden_size * 2]
                hidden_features = tf.expand_dims(hidden_features, 1)
                """
                """
                # 下面的代码比较啰嗦
                # [batch, input_sequence_length, hidden_size* 2]
                slot_inputs_shape = tf.shape(slot_inputs) #返回tensor
                # slot_inputs:[batch * input_sequence_length, hidden_size * 2]
                slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])
                # [batch * input_sequence_length, hidden_size * 2]
                # W_{ie}*h_i+bias, 注意:这里并没有显式定义W_ie,因为在_linear函数中会自己定义W_ie
                y = core_rnn_cell._linear(slot_inputs, output_size=attn_size, bias=True)
                #y = tf.layers.dense(slot_inputs, attn_size, use_bias=True, activation=None) # 线性函数也可以这样写
                # [batch , input_sequence_length, hidden_size* 2]
                y = tf.reshape(y, slot_inputs_shape)
                # [batch , input_sequence_length, 1, hidden_size* 2]
                y = tf.expand_dims(y, 2)
                print("layer_y:", y)
                """
                """
                y = W_{ie}*h_i:用的是线性映射 _linear(), W_{ie}未显式声明,在Linear函数中
                """
                # sequence_output:[batch, input_sequence_length, hidden_size* 2]
                # slot_inputs:[batch * input_sequence_length, hidden_size * 2]
                slot_inputs = tf.reshape(sequence_outputs, [-1, attn_size])
                # y: [batch , input_sequence_length, hidden_size* 2]
                y = tf.layers.dense(inputs=sequence_outputs,
                                    units=attn_size,
                                    activation=None,
                                    use_bias=True)
                # [batch , input_sequence_length, 1, hidden_size* 2]
                y = tf.expand_dims(y, 2)
                print("layer_y:", y)
                """
                e_{i,k}=V^T*tanh(W_{he}*h_k+W_{ie}*h_i)
                注意:
                在seq2seq-attention中,e_{i,k}=g(s_{i-1},h_k), 
                即e_{i,k}是由encoder中的hidden与decoder中的hidden共同作用而来
                但此处的e_{i,k}比较特殊,h_k,h_i都由encoder的hidden隐向量得来
                因此, 这种做法有点类似于 transformer中的query-key-value-attention的query的计算方式
                """
                # [batch, nstep, nstep, hidden_size*2] = [batch, 1, nstep, hidden_size*2] + [batch, nstep, 1, hidden_size*2]
                # hidden_features: [batch, 1, input_sequence_length, hidden_size * 2]
                # y: [batch, input_sequence_length, 1, hidden_size * 2]
                # bahdanau_activate: [batch, input_sequence_length, input_sequence_length, hidden_size * 2]
                # dimensions of size 1 broadcast automatically
                bahdanau_activate = tf.tanh(hidden_features + y)
                # V: [attn_size = hidden_size*2]
                V = tf.get_variable("slot_AttnV", [attn_size])
                # v_bahdanau: [batch, input_sequence_length, input_sequence_length, hidden_size * 2]
                v_bahdanau = V * bahdanau_activate  # note: rank-1 times rank-4, element-wise (broadcast) multiplication, not matmul
                # logit_i_k: [batch, input_sequence_length, input_sequence_length]
                logit_i_k = tf.reduce_sum(v_bahdanau, axis=[3])
                # together with the previous step this is e_{i,k} = v^T * tanh(W1*h_k + W2*h_i);
                # the (n x 1)^T * (n x 1) product maps each vector to a scalar score
                """
                alpha_{i,j} = softmax(e{i,j})
                c_i = sum_{j}(alpha_{i,j}*h_j)
                """
                # score_i_k:[batch, input_sequence_length, input_sequence_length]
                score_i_k = tf.nn.softmax(logit_i_k, axis=-1)
                # score_i_k:[batch, input_sequence_length=i, input_sequence_length=k, 1]
                score_i_k = tf.expand_dims(score_i_k, axis=-1)
                # hidden=[batch, 1, input_sequence_length, hidden_size* 2]
                hidden = tf.expand_dims(sequence_outputs, axis=1)
                """
                原论文中的C_i^S = slot_context_hidden
                """
                # score_i_k:[batch, input_sequence_length, input_sequence_length, 1]
                # hidden:[batch, 1, input_sequence_length, hidden_size* 2]
                # slot_context_hidden: [batch, input_sequence_length, hidden_size * 2]
                slot_context_hidden = tf.reduce_sum(score_i_k * hidden,
                                                    axis=[2])
        else:
            """ 
            不需attention,直接将sequence output作为预测slot的输入
            """
            # attn_size = hidden size * 2
            attn_size = sequence_output_shape[2].value
            # [batch*input_sequence_length, hidden_size* 2]
            slot_inputs = tf.reshape(slot_inputs, [-1, attn_size])

        # ===============intent attention ============================
        """
        计算c_I
        注意:intent attention是针对最后的hidden state进行的
        """
        # intent_input:[batch, hidden_size * 4]
        intent_input = final_state
        with tf.variable_scope('intent_attn'):
            # attn_size: hidden_size*2
            attn_size = sequence_output_shape[2].value
            # hidden:[batch, input_sequence_length, 1, hidden_size*2]
            hidden = tf.expand_dims(sequence_outputs, 2)
            """
            注意:虽然名字相同, 但variable_scope不同,与slot-attn中的不是同一个变量!!!
            """
            """
            hidden_features = W_{he}*h_k:用的是卷积实现
            """
            # W_he: [filter_height=1, filter_width=1, in_channels=hidden*2, out_channels=hidden*2], 注意: 1*1的核
            W_he = tf.get_variable("intent_AttnW",
                                   shape=[
                                       1, 1, attn_size, attn_size
                                   ])  # 注意:此处与 slot_attention中用的是相同的attention
            # 物理意义:对hidden的各维之间进行卷积,等价于: W_{he}*h_k
            # [batch, input_sequence_length, 1, hidden_size*2]
            hidden_features = tf.nn.conv2d(input=hidden,
                                           filter=W_he,
                                           strides=[1, 1, 1, 1],
                                           padding="SAME")
            """
            y = W_{ie}*h_i:用的是线性映射 _linear() ,W_{ie}未显式声明,在Linear函数中
            """
            # intent_input:[batch, hidden_size*4]
            # y: [batch, attn_size=hidden_size*2]
            y = core_rnn_cell._linear(intent_input,
                                      output_size=attn_size,
                                      bias=True)
            print("intent-attn, attn_size:", attn_size, " y:", y)
            # [batch, 1, 1, hidden_size * 2]
            y = tf.reshape(y, shape=[-1, 1, 1, attn_size])
            """
            e_{i,k}=V^T*tanh(W_{he}*h_k+W_{ie}*h_i)
            """
            # V:[batch, input_sequence_length, 1, hidden_size*2]
            # hidden_features:[batch, input_sequence_length, 1, hidden_size*2]
            # y:[batch, 1, 1, hidden_size * 2]
            # bahdanau_activate:[batch, input_sequence_length, 1, hidden_size*2]
            V = tf.get_variable("intent_AttnV", shape=[attn_size])
            bahdanau_activate = V * tf.tanh(hidden_features + y)
            # logit_i_k:[batch, input_sequence_length]
            logit_i_k = tf.reduce_sum(bahdanau_activate, axis=[2, 3])
            """
            alpha_{i,j} = softmax(e{i,j})
            c_i = sum_{j}(alpha_{i,j}*h_j)
            """
            # [batch, input_sequence_length]
            score_i_k = tf.nn.softmax(logit_i_k)
            # [batch, input_sequence_length, 1]
            score_i_k = tf.expand_dims(score_i_k, axis=-1)
            # score_i_k:[batch, input_sequence_length, 1, 1]
            score_i_k = tf.expand_dims(score_i_k, axis=-1)
            # note: the intent context is a weighted average of the hidden states over the time steps
            # score_i_k:[batch, input_sequence_length, 1, 1]
            # hidden:[batch, input_sequence_length, 1, hidden_size*2]
            # intent_context_hidden:[batch, hidden_size*2]
            intent_context_hidden = tf.reduce_sum(score_i_k * hidden,
                                                  axis=[1, 2])

            if add_final_state_to_intent:
                """
                c_I = c_i + h_T, where h_T is the encoder final state at the last time step
                """
                # intent_input: [batch, hidden_size * 4]
                # intent_context_hidden: [batch, hidden_size * 2]
                # intent_output: [batch, hidden_size * 2 + hidden_size * 4]
                intent_output = tf.concat(
                    [intent_context_hidden, intent_input], 1)
            else:
                """
                c_I = c_i, i.e. the encoder final state is not appended
                """
                # c_I = intent_context_hidden
                # intent_context_hidden: [batch, hidden_size * 2]
                intent_output = intent_context_hidden
        """
        计算slot_gate
        slot_gate=v*tanh(c_i^S + W*c_I)
        """
        with tf.variable_scope('slot_gated'):
            # W*c_I
            # intent_gate:[batch, hidden_size * 2]
            intent_gate = core_rnn_cell._linear(intent_output,
                                                output_size=attn_size,
                                                bias=True)  # W*c_intent
            embed_size = intent_gate.get_shape()[1].value
            # [batch, 1, hidden_size * 2]
            intent_gate = tf.reshape(intent_gate, [-1, 1, embed_size])
            # V_gate:[hidden_size*2]
            V_gate = tf.get_variable("gateV", [attn_size])
            if not remove_slot_attn:  # slot attention enabled
                """
                With slot attention:
                slot_context_hidden = c_i^S, intent_gate = W*c_I

                Equation (6) in the paper:
                g = sum(v * tanh(c_i + W*c^I))
                """
                # slot_context_hidden: [batch, input_sequence_length, hidden_size * 2]
                # intent_gate:[batch, 1, hidden_size * 2]
                # slot_gate:[batch, input_sequence_length, hidden_size * 2]
                slot_gate = V_gate * tf.tanh(slot_context_hidden + intent_gate)
            else:
                """
                不需要slot attention,用原始的hidden输入
                论文中公式(8):
                g=sum(v*tanh(h_i + W*c^I)) 
                
                """
                # V_gate:[hidden_size*2]
                # sequence_outputs:[batch, input_sequence_length, hidden_size * 2]
                # intent_gate:[batch, 1, hidden_size * 2]
                # slot_gate:[batch, input_sequence_length, hidden_size * 2]
                slot_gate = V_gate * tf.tanh(sequence_outputs + intent_gate)
            # slot_gate:[batch, input_sequence_length, 1]
            slot_gate = tf.reduce_sum(slot_gate, axis=[2], keep_dims=True)
            """
            h_i+c_i^S*slot_gate
            """
            if not remove_slot_attn:  # slot attention enabled
                """
                Equation (7) in the paper:
                y_i^slot = softmax(W_hy(h_i + c_i^S*slot_gate))
                This computes the c_i^S*slot_gate part.
                """
                # slot_context_hidden: [batch, input_sequence_length, hidden_size * 2]
                # slot_gate:[batch, input_sequence_length, 1]
                # context_slot_gate:[batch, input_sequence_length, hidden_size* 2]
                context_slot_gate = slot_context_hidden * slot_gate
            else:
                """
                论文中公式(9):
                y_i(slot)=softmax(W_hy(hi+h_i*slot_gate))
                中的 c_i*slot_gate部分
                """
                # sequence_outputs:[batch, input_sequence_length, hidden_size* 2]
                # context_slot_gate:[batch, input_sequence_length, hidden_size* 2]
                context_slot_gate = sequence_outputs * slot_gate
            # context_slot_gate:[batch * input_sequence_length, attn_size=hidden_size*2]
            context_slot_gate = tf.reshape(context_slot_gate, [-1, attn_size])
            """
            hi+c_i*slot_gate
            or 
            hi+h_i*slot_gate
            """
            # context_slot_gate:[batch * input_sequence_length, attn_size=hidden_size*2]
            # slot_inputs:[batch * input_sequence_length, hidden_size * 2]
            # slot_output:[batch * input_sequence_length, hidden_size * 4]
            slot_output = tf.concat([context_slot_gate, slot_inputs], axis=1)
    """
    注意:上面 slot_output与paper中的公式稍有不同,此处是将h_i, c_i^S*slot_gate concat起来,而非相加
    
    原paper中公式: 
    y_i(slot) = softmax(W_hy(h_i+c_i^S*slot_gate)) (7)
    or 
    y_i(slot) = crf(W_hy(h_i+c_i^S*slot_gate))
    """
    with tf.variable_scope('slot_proj'):
        # slot_output:[batch * input_sequence_length, hidden_size * 4]
        # slot_logits:[batch * input_sequence_length, slot_size]
        # the weight matrix inside _linear is W^S_hy in the paper
        slot_logits = core_rnn_cell._linear(slot_output,
                                            output_size=slot_size,
                                            bias=True)
        if use_crf or use_batch_crossent:
            # sequence_outputs:[batch, input_sequence_length, hidden_size* 2]
            nstep = tf.shape(sequence_outputs)[1]
            # slot_logits:[batch, input_sequence_length, slot_size]
            slot_logits = tf.reshape(slot_logits, [-1, nstep, slot_size])
    """
    y(intent) = softmax(W_hy(c_I + h_T))
    """
    with tf.variable_scope('intent_proj'):
        # intent_output:[batch, hidden_size* 2 + hidden_size * 4]
        # intent_logits:[batch, intent_size]
        # the weight matrix inside _linear is W^I_hy in the paper
        intent_logits = core_rnn_cell._linear(intent_output,
                                              output_size=intent_size,
                                              bias=True)

    return [slot_logits, intent_logits]
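
A hedged prediction sketch for the model above: the placeholders and label sizes are illustrative only, and the CRF transition matrix is declared here just to keep the sketch self-contained (in practice it is the one learned via tf.contrib.crf.crf_log_likelihood during training).

# Hypothetical inference-side usage (assumed names and sizes).
input_data = tf.placeholder(tf.int32, [None, None], name='input_data')
sequence_length = tf.placeholder(tf.int32, [None], name='sequence_length')
input_size, slot_size, intent_size = 724, 122, 22  # illustrative sizes
use_crf = True

slot_logits, intent_logits = createModel(
    input_data, input_size, sequence_length, slot_size, intent_size,
    remove_slot_attn=False, add_final_state_to_intent=True,
    use_crf=use_crf, isTraining=False)

# intent: per-utterance argmax over the intent projection
intent_pred = tf.argmax(intent_logits, axis=1)  # [batch]

if use_crf:
    # Viterbi decoding over the [batch, nstep, slot_size] potentials.
    transition_params = tf.get_variable('crf_transitions',
                                        [slot_size, slot_size])
    slot_pred, _ = tf.contrib.crf.crf_decode(slot_logits, transition_params,
                                             sequence_length)  # [batch, nstep]
else:
    # with use_batch_crossent=True the logits are already [batch, nstep, slot_size]
    slot_pred = tf.argmax(slot_logits, axis=2)  # [batch, nstep]
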