Example #1
    def gated_self_attention(self, enc):
        """gated self-attention layer"""
        with tf.variable_scope("Gated_Self_Attention_Layer"):
            attn_dim = enc.get_shape().as_list()[-1]
            s, _, _ = bilinear_attention(queries=enc,
                                         units=attn_dim,
                                         memory=enc,
                                         num_heads=1,
                                         mask=self.para_mask,
                                         bias=False,
                                         return_weights=True)
            f = tf.nn.tanh(
                conv(tf.concat([enc, s], axis=-1),
                     output_size=attn_dim,
                     bias=False,
                     name="f"))
            g = tf.nn.sigmoid(
                conv(tf.concat([enc, s], axis=-1),
                     output_size=attn_dim,
                     bias=False,
                     name="g"))
            return g * f + (1 - g) * enc
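
The layer fuses the self-attention reading s back into the encoder states with a highway-style gate: a candidate state f = tanh(conv([enc; s])) and a gate g = sigmoid(conv([enc; s])) are computed from the concatenation, and the output is g * f + (1 - g) * enc. Below is a minimal NumPy sketch of just that fusion step; s stands in for the bilinear_attention output and Wf/Wg for the two conv() kernels, so those names are assumptions for illustration only.

import numpy as np

def gated_fusion(enc, s, Wf, Wg):
    # enc, s: [N, L, dim]; Wf, Wg: [2*dim, dim] stand-ins for the conv() kernels
    x = np.concatenate([enc, s], axis=-1)          # [N, L, 2*dim]
    f = np.tanh(x @ Wf)                            # candidate state
    g = 1.0 / (1.0 + np.exp(-(x @ Wg)))            # sigmoid gate
    return g * f + (1.0 - g) * enc                 # gated mix of new and old states

N, L, dim = 2, 5, 8
enc = np.random.randn(N, L, dim)
s = np.random.randn(N, L, dim)                     # pretend self-attention output
out = gated_fusion(enc, s, np.random.randn(2 * dim, dim), np.random.randn(2 * dim, dim))
print(out.shape)                                   # (2, 5, 8)

When g is close to 0 the layer passes the encoder state through unchanged, so the gate controls how much of the attended context is absorbed at each position.
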
Example #2
    def sample(self):
        """sampling function used during inference, reuse the parameters defined in decode()"""
        with tf.variable_scope("Decoder_Layer", reuse=True):
            memory = self.enc
            h = tf.nn.tanh(
                _linear(self.init_h,
                        output_size=self.d,
                        bias=False,
                        scope="h_initial"))
            c = tf.nn.tanh(
                _linear(self.init_c,
                        output_size=self.d,
                        bias=False,
                        scope="c_initial"))
            hh = tf.zeros((self.N, self.d))
            state = (c, h) if self.layer == 1 else [(c, h)
                                                    for _ in range(self.layer)]
            prev, attn_w = None, None
            symbols = []
            prev_probs = tf.zeros(self.N)

            # the ground-truth question, only the start token will be used
            oups = tf.split(self.que, [1] * (self.QL + 2), 1)
            for i, inp in enumerate(oups):
                einp = tf.reshape(
                    tf.nn.embedding_lookup(self.plus_word_mat, inp),
                    [self.N, self.dw])
                if prev is not None:
                    with tf.variable_scope("loop_function", reuse=True):
                        einp, prev_symbol, prev_probs = self._loop_function_sample(
                            prev, attn_w, prev_probs, i)
                        symbols.append(prev_symbol)

                cinp = tf.concat([einp, hh], 1)
                h, state = self.decoder_cell(cinp, state)
                # compute context vector
                attn, _, attn_w = bilinear_attention(
                    tf.expand_dims(h, 1),
                    units=self.d,
                    num_heads=1,
                    # attns=tf.expand_dims(tf.reduce_sum(coverage, 0), 1),
                    memory=memory,
                    scope="temporal_attention",
                    mask=self.para_mask,
                    bias=False,
                    return_weights=True)
                attn_dim = attn.get_shape().as_list()[-1]
                attn = tf.reshape(attn, [-1, attn_dim])
                attn_w = tf.reshape(attn_w, [-1, self.PL])
                # attention vector
                hh = tf.nn.tanh(
                    _linear(tf.concat([attn, h], 1),
                            output_size=self.d,
                            bias=False,
                            scope="hh"))

                with tf.variable_scope("AttnOutputProjection"):
                    # maxout
                    output = _linear(tf.concat([attn, h], 1),
                                     output_size=2 * self.dw,
                                     bias=False,
                                     scope="maxout")
                    output = tf.reshape(output, [-1, self.dw, 2])
                    output = tf.reduce_max(output, 2)

                prev = output

            einp, prev_symbol, prev_probs = self._loop_function_sample(
                prev, attn_w, prev_probs, i)
            symbols.append(prev_symbol)

            return symbols, tf.expand_dims(prev_probs, 1)
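
At each step sample() feeds the previous maxout output back through _loop_function_sample, which is expected to pick the next token and return its embedding together with the updated running log-probabilities. That helper is not shown here; the sketch below is one plausible greedy version, assuming the dw-dimensional output is scored against a (tied) word embedding matrix. Both the scoring scheme and the names are assumptions.

import numpy as np

def loop_function_sample(prev, word_mat, prev_probs):
    # prev: [N, dw] maxout output; word_mat: [V, dw] word embeddings (tying is an assumption)
    logits = prev @ word_mat.T                                # [N, V] vocabulary scores
    logits = logits - logits.max(axis=-1, keepdims=True)      # stabilise the softmax
    log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
    symbol = log_probs.argmax(axis=-1)                        # greedy choice of next token
    prev_probs = prev_probs + log_probs[np.arange(len(symbol)), symbol]
    return word_mat[symbol], symbol, prev_probs               # embedding, token id, running score

N, dw, V = 4, 10, 50
einp, sym, probs = loop_function_sample(np.random.randn(N, dw),
                                        np.random.randn(V, dw),
                                        np.zeros(N))
print(einp.shape, sym.shape, probs.shape)                     # (4, 10) (4,) (4,)

The (einp, prev_symbol, prev_probs) return mirrors the signature used inside the loop above, with the looked-up embedding fed back as the next decoder input.
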
Example #3
    def search(self, beam_size, prev_probs=None):
        """beam search function used during inference, reuse the parameters defined in decode()"""
        with tf.variable_scope("Decoder_Layer", reuse=True):
            memory = self.enc
            # specify the loop function, either for standard beam search or diverse beam search
            loop_function = self._loop_function_diverse_search if self.diverse_beam else self._loop_function_search
            # init the decoder's state
            h = tf.nn.tanh(
                _linear(self.init_h,
                        output_size=self.d,
                        bias=False,
                        scope="h_initial"))
            c = tf.nn.tanh(
                _linear(self.init_c,
                        output_size=self.d,
                        bias=False,
                        scope="c_initial"))
            hh = tf.zeros(
                (self.N, 1, self.d))  # the attention vector from previous step
            state = (c, h) if self.layer == 1 else [(c, h)
                                                    for _ in range(self.layer)]
            prev, attn_w = None, None  # the output vector and attention logits from previous step
            # the accumulated log probabilities of the beam
            prev_probs = prev_probs if prev_probs is not None else tf.zeros(
                (self.N, 1))
            finished = tf.cast(tf.zeros((self.N, 1)),
                               tf.bool)  # whether </S> is encountered
            symbols = []  # the output words at each step in the beam
            attn_ws = []  # the attention logits at each step in the beam
            # the decoder states at each step in the beam
            hs = [tf.reshape(h, [self.N, 1, self.d])]

            # the ground-truth question, only the start token will be used
            oups = tf.split(self.que, [1] * (self.QL + 2), 1)
            for i, inp in enumerate(oups):
                einp = tf.nn.embedding_lookup(self.plus_word_mat, inp)
                if prev is not None:
                    # from the second step
                    with tf.variable_scope("loop_function", reuse=True):
                        einp, prev_probs, index, prev_symbol, finished = loop_function(
                            beam_size, prev, attn_w, prev_probs, finished, i)
                        hh = tf.gather_nd(
                            hh, index)  # update prev attention vector
                        state = tuple(tf.gather_nd(s, index) for s in state) if self.layer == 1 else \
                            [tuple(tf.gather_nd(s, index) for s in sta) for sta in state]  # update prev state
                        for j, symbol in enumerate(symbols):
                            symbols[j] = tf.gather_nd(
                                symbol, index)  # update prev symbols
                        symbols.append(prev_symbol)
                        for j, hsi in enumerate(hs):
                            hs[j] = tf.gather_nd(hsi, index)
                # update cell
                state = tuple(tf.reshape(s, [-1, self.d]) for s in state) if self.layer == 1 else \
                    [tuple(tf.reshape(s, [-1, self.d]) for s in sta) for sta in state]
                cinp = tf.concat([einp, hh], -1)
                cinp_dim = cinp.get_shape().as_list()[-1]
                h, state = self.decoder_cell(tf.reshape(cinp, [-1, cinp_dim]),
                                             state)
                # compute context vector
                attn, _, attn_w = bilinear_attention(
                    tf.reshape(h, [self.N, -1, self.d]),
                    units=self.d,
                    num_heads=1,
                    memory=memory,
                    mask=self.para_mask,
                    scope="temporal_attention",
                    bias=False,
                    return_weights=True)
                attn_dim = attn.get_shape().as_list()[-1]
                attn = tf.reshape(attn, [-1, attn_dim])
                attn_w = tf.reshape(attn_w, [self.N, -1, self.PL])
                attn_ws.append(attn_w)

                # attention vector
                hh = tf.nn.tanh(
                    _linear(tf.concat([attn, h], -1),
                            output_size=self.d,
                            bias=False,
                            scope="hh"))
                hh = tf.reshape(hh, [self.N, -1, self.d])

                # reshape for next step's indexing convenience
                state = tuple(tf.reshape(s, [self.N, -1, self.d]) for s in state) if self.layer == 1 else \
                    [tuple(tf.reshape(s, [self.N, -1, self.d]) for s in sta) for sta in state]
                hs.append(tf.reshape(h, [self.N, -1, self.d]))

                with tf.variable_scope("AttnOutputProjection"):
                    # maxout
                    output = _linear(tf.concat([attn, h], -1),
                                     output_size=2 * self.dw,
                                     bias=False,
                                     scope="maxout")
                    output = tf.reshape(output, [self.N, -1, self.dw, 2])
                    output = tf.reduce_max(output, -1)

                prev = output

            # process the last symbol
            einp, prev_probs, index, prev_symbol, finished = loop_function(
                beam_size, prev, attn_w, prev_probs, finished, i)
            for j, symbol in enumerate(symbols):
                symbols[j] = tf.gather_nd(symbol, index)  # update prev symbols
            symbols.append(prev_symbol)

            return symbols, prev_probs
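
Each beam-search step expands the accumulated beam scores with the per-token log-probabilities, keeps the top beam_size continuations, and uses the parent index of each survivor to reorder the cached attention vector, cell state, and emitted symbols, which is what the tf.gather_nd calls with index do above. Below is a minimal NumPy sketch of that expansion with illustrative names; it omits length normalisation and finished-beam handling.

import numpy as np

def beam_step(prev_probs, step_log_probs, beam_size):
    # prev_probs: [N, B] accumulated beam scores; step_log_probs: [N, B, V] per-token scores
    N, B, V = step_log_probs.shape
    scores = prev_probs[:, :, None] + step_log_probs          # [N, B, V] candidate scores
    flat = scores.reshape(N, B * V)
    top = np.argsort(-flat, axis=-1)[:, :beam_size]           # best beam_size continuations
    new_probs = np.take_along_axis(flat, top, axis=-1)        # their accumulated scores
    parent = top // V                                         # which beam each one extends
    symbol = top % V                                          # the token it appends
    return new_probs, parent, symbol

new_probs, parent, symbol = beam_step(np.zeros((2, 3)),
                                      np.log(np.random.dirichlet(np.ones(20), size=(2, 3))),
                                      beam_size=3)
print(parent, symbol)   # parent plays the role of `index` above when re-gathering hh/state/symbols
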
Example #4
    def decode(self, que, reuse=None):
        """decoding function used during training, decoder is a 2-layer uni-lstm"""
        with tf.variable_scope("Decoder_Layer", reuse=reuse):
            memory = self.enc
            # init the decoder's state
            h = tf.nn.tanh(
                _linear(self.init_h,
                        output_size=self.d,
                        bias=False,
                        scope="h_initial"))
            c = tf.nn.tanh(
                _linear(self.init_c,
                        output_size=self.d,
                        bias=False,
                        scope="c_initial"))
            hh = tf.zeros(
                (self.N, self.d))  # the attention vector from previous step
            state = (c, h) if self.layer == 1 else [(c, h)
                                                    for _ in range(self.layer)]
            attn_ws = []  # save every step's attention logits
            outputs = []  # save every step's output vectors
            # the ground-truth question
            oups = tf.split(que, [1] * (self.QL + 2), 1)
            for i, inp in enumerate(oups):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                # word embedding + previous attention vector
                einp = tf.reshape(
                    tf.nn.embedding_lookup(self.plus_word_mat, inp),
                    [self.N, self.dw])
                cinp = tf.concat([einp, hh], 1)
                # update cell
                h, state = self.decoder_cell(cinp, state)

                # attention, obtain the context vector and attention logits
                attn, _, attn_w = bilinear_attention(
                    tf.expand_dims(h, 1),
                    units=self.d,
                    num_heads=1,
                    memory=memory,
                    scope="temporal_attention",
                    mask=self.para_mask,
                    bias=False,
                    return_weights=True)
                attn_dim = attn.get_shape().as_list()[-1]
                attn = tf.reshape(attn, [-1, attn_dim])
                attn_w = tf.reshape(attn_w, [-1, self.PL])
                attn_ws.append(attn_w)

                # attention vector
                hh = tf.nn.tanh(
                    _linear(tf.concat([attn, h], 1),
                            output_size=self.d,
                            bias=False,
                            scope="hh"))

                with tf.variable_scope("AttnOutputProjection"):
                    # maxout
                    output = _linear(tf.concat([attn, h], 1),
                                     output_size=2 * self.dw,
                                     bias=False,
                                     scope="maxout")
                    output = tf.reshape(output, [-1, self.dw, 2])
                    output = tf.reduce_max(output, 2)
                    outputs.append(output)

            return outputs, oups, attn_ws
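
The output projection used by all three decoding paths is a maxout unit: [attn; h] is projected to 2 * dw units, reshaped into dw pairs, and the element-wise maximum over each pair gives a dw-dimensional output vector that can be matched against the word embeddings. A minimal NumPy sketch, with W standing in for the _linear kernel (an assumption for illustration):

import numpy as np

def maxout_projection(attn, h, W, dw):
    # attn: [N, attn_dim], h: [N, d]; W: [attn_dim + d, 2*dw] stands in for the _linear kernel
    x = np.concatenate([attn, h], axis=1) @ W      # project to 2*dw units
    return x.reshape(-1, dw, 2).max(axis=2)        # max over each pair -> [N, dw]

N, attn_dim, d, dw = 3, 16, 16, 10
out = maxout_projection(np.random.randn(N, attn_dim), np.random.randn(N, d),
                        np.random.randn(attn_dim + d, 2 * dw), dw)
print(out.shape)                                   # (3, 10)
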