Example #1
def bilinear_attention(query, context, 
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):
    with tf.variable_scope(scope+"_Context_to_Query_Attention_Layer", reuse=reuse):
        context_ = tf.transpose(context, [0,2,1])
        hidden_dim = query.get_shape()[-1]

        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])

        S = tf.matmul(weighted_query, context_)  # batch x q_len x c_len

        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)

        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
        c2q = tf.matmul(S_, context)

        S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
        q2c = tf.matmul(S_T, query)

        return c2q, q2c
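Most of the snippets on this page rely on two names defined elsewhere in their source module: qanet_layers.mask_logits and a module-level initializer. For reference only, the sketch below shows what such helpers typically look like in QANet-style code; these are assumptions, not the original definitions (Example #10 spells out the same variance-scaling initializer explicitly).

import tensorflow as tf  # TF 1.x, matching the snippets on this page

VERY_NEGATIVE_NUMBER = -1e30

def mask_logits(inputs, mask):
    # Push padded positions to a very negative value so that a following
    # softmax (or reduce_max) effectively ignores them.
    mask = tf.cast(mask, tf.float32)
    return inputs + VERY_NEGATIVE_NUMBER * (1.0 - mask)

# Assumed stand-in for the module-level initializer used by the examples.
initializer = tf.contrib.layers.variance_scaling_initializer(
    factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32)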
Example #2
def dot_attention(query, context,
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):

    hidden_dim = query.get_shape()[-1]
    Wd = tf.get_variable("Wd", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Vd = tf.get_variable("Vd", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)

    # batch x len_query x len_context x hidden_dim
    dot_attention = query_ * context_
    dot_attention = tf.einsum("abcd,de->abce", dot_attention, Wd)
    dot_attention = tf.einsum("abce,ef->abcf", dot_attention, Vd)

    # batch x len_query x len_context
    S = tf.squeeze(dot_attention, -1)
    mask_q = tf.expand_dims(query_mask, 1) # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1) # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
    c2q = tf.matmul(S_, context) 

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c
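Because Vd has a single output column, the chain of einsums above collapses to a per-dimension weighted dot product with w = Wd @ Vd, so the batch x len_query x len_context x hidden_dim tensor never has to be materialized. A minimal sketch of that equivalence with toy tensors (names and sizes below are illustrative only):

import tensorflow as tf

batch, len_q, len_c, hidden = 2, 5, 7, 4              # toy sizes
query = tf.random_normal([batch, len_q, hidden])
context = tf.random_normal([batch, len_c, hidden])
Wd = tf.random_normal([hidden, hidden])
Vd = tf.random_normal([hidden, 1])

# S[b, i, j] = sum_d query[b, i, d] * context[b, j, d] * w[d], with w = Wd @ Vd
w = tf.squeeze(tf.matmul(Wd, Vd), -1)                       # hidden
S_equiv = tf.matmul(query * w, context, transpose_b=True)   # batch x len_q x len_c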
Example #3
def self_attention(query, context,
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):

    hidden_dim = query.get_shape()[-1]
    Wq_1 = tf.get_variable("Wq_1", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Vq = tf.get_variable("Vq", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    Wp_1 = tf.get_variable("Wp_1", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Wp_2 = tf.get_variable("Wp_2", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    # S = tf.matmul(tf.nn.tanh(tf.matmul(query, Wq_1)), Vq_1)
    S = tf.nn.tanh(tf.einsum("abc,cd->abd", query, Wq_1))
    S = tf.einsum("abd,de->abe", S, Vq)

    S = tf.squeeze(S, -1) # batch x query_len

    mask_q = query_mask # batch x query_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_q))
    S_ = tf.expand_dims(S_, axis=-1) # batch x len x 1
    query_attn = tf.reduce_sum(S_ * query, axis=1, keepdims=True) # batch x 1 x hidden_dim

    # additive attention over the context, conditioned on the pooled query
    S = tf.einsum("abc,cd->abd", context, Wp_1)
    S += tf.einsum("abc,cd->abd", query_attn, Wp_1)
    S = tf.nn.tanh(S)                      # batch x context_len x hidden_dim
    S = tf.einsum("abd,de->abe", S, Wp_2)  # batch x context_len x 1

    S = tf.squeeze(S, -1) # batch x context_len

    mask_c = context_mask # batch x context_len
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
    S_ = tf.expand_dims(S_, axis=-1) # batch x context_len x 1
    context_attn = tf.reduce_sum(S_ * context, axis=1, keepdims=True) # batch x 1 x hidden_dim
    context_attn = tf.squeeze(context_attn, axis=1)
    
    return context_attn
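Example #3 is a two-step additive attention: the query is first pooled into a single vector with a learned scoring vector, and that pooled vector then conditions an additive attention over the context. A compact, self-contained restatement with toy tensors (masking omitted for brevity; all names and sizes below are illustrative, not from the original module):

import tensorflow as tf

batch, q_len, c_len, hidden = 2, 4, 6, 8              # toy sizes
query = tf.random_normal([batch, q_len, hidden])
context = tf.random_normal([batch, c_len, hidden])
W = tf.random_normal([hidden, hidden])                # shared projection (illustrative)
v = tf.random_normal([hidden, 1])                     # scoring vector (illustrative)

# Step 1: pool the query into a single vector.
q_scores = tf.tensordot(tf.nn.tanh(tf.tensordot(query, W, [[2], [0]])), v, [[2], [0]])
q_alpha = tf.nn.softmax(tf.squeeze(q_scores, -1))                     # batch x q_len
q_vec = tf.reduce_sum(tf.expand_dims(q_alpha, -1) * query, 1, keepdims=True)

# Step 2: additive attention over the context, conditioned on the pooled query.
c_hidden = tf.nn.tanh(tf.tensordot(context, W, [[2], [0]])
                      + tf.tensordot(q_vec, W, [[2], [0]]))           # batch x c_len x hidden
c_alpha = tf.nn.softmax(tf.squeeze(tf.tensordot(c_hidden, v, [[2], [0]]), -1))
c_vec = tf.reduce_sum(tf.expand_dims(c_alpha, -1) * context, axis=1)  # batch x hidden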
Example #4
    def build_encoder(self, input_lengths, input_mask, *args, **kargs):
        reuse = kargs["reuse"]
        word_emb, word_drop_mask = self.build_emebdding(*args, **kargs)
        dropout_rate = tf.cond(self.is_training,
                               lambda: self.config.dropout_rate, lambda: 0.0)

        word_drop_mask = tf.cast(word_drop_mask, tf.float32)
        word_drop_mask = tf.squeeze(word_drop_mask, axis=-1)

        input_mask = tf.cast(input_mask, tf.float32)
        input_mask *= word_drop_mask

        word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)

        # masked max pooling over time
        H_enc_2 = tf.reduce_max(qanet_layers.mask_logits(
            word_emb, tf.expand_dims(input_mask, -1)),
                                axis=1)

        # masked mean pooling over time
        input_mask = tf.expand_dims(input_mask, -1)
        H_enc_1 = tf.reduce_sum(word_emb * tf.cast(input_mask, tf.float32), 1)
        H_enc_1 = tf.div(H_enc_1, tf.reduce_sum(input_mask, axis=1) + EPSILON)

        H_enc = tf.concat([H_enc_1, H_enc_2], 1)

        return H_enc
Example #5
def query_context_attention(query,
                            context,
                            max_query_len,
                            max_context_len,
                            query_mask,
                            context_mask,
                            dropout_ratio,
                            scope,
                            reuse=None):
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer",
                           reuse=reuse):
        context_ = tf.transpose(context, [0, 2, 1])
        hidden_dim = query.get_shape()[-1]

        attn_W = tf.get_variable("AttnW",
                                 dtype=tf.float32,
                                 shape=[hidden_dim, hidden_dim],
                                 initializer=initializer)

        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])

        S = tf.matmul(weighted_query, context_)  # batch x q_len x c_len

        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)

        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
        c2q = tf.matmul(S_, context)

        S_T = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        q2c = tf.matmul(S_T, query)

        query_attention_outputs = tf.concat(
            [query, c2q, query - c2q, query * c2q], axis=-1)
        query_attention_outputs *= tf.expand_dims(
            tf.cast(query_mask, tf.float32), -1)

        context_attention_outputs = tf.concat(
            [context, q2c, context - q2c, context * q2c], axis=-1)
        context_attention_outputs *= tf.expand_dims(
            tf.cast(context_mask, tf.float32), -1)

        return query_attention_outputs, context_attention_outputs
Example #6
def concat_attention(query, context, 
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wc_1 = tf.get_variable("Wc_1", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Wc_2 = tf.get_variable("Wc_2", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Vc = tf.get_variable("Vc", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    # batch x len x hidden_dim
    attention_1 = tf.einsum("abc,cd->abd", query, Wc_1)
    attention_2 = tf.einsum("abc,cd->abd", context, Wc_2)

    # concat attention
    # batch x len_query x 1 x hidden_dim
    attention_1 = tf.expand_dims(attention_1, 2)
    # batch x 1 x len_context x hidden_dim
    attention_2 = tf.expand_dims(attention_2, 1)

    # batch x len_query x len_context x hidden_dim
    attention = tf.nn.tanh(attention_1+attention_2)

    # batch x len_query x len_context x 1
    S = tf.einsum("abcd,de->abce", attention, Vc)
    S = tf.squeeze(S, -1) # batch x len_query x len_context

    mask_q = tf.expand_dims(query_mask, 1) # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1) # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
    c2q = tf.matmul(S_, context) 

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c
Example #7
def trilinear_attention(ques_emb, context_emb, ques_mask, context_mask,
                        dropout_keep_prob, config):
    attention_outputs = []
    C = tf.tile(tf.expand_dims(context_emb, 2), [1, 1, config.max_q_len, 1])
    Q = tf.tile(tf.expand_dims(ques_emb, 1), [1, config.max_p_len, 1, 1])
    S = qanet_layers.trilinear([C, Q, C * Q],
                               input_keep_prob=1.0 - dropout_keep_prob)
    mask_q = tf.expand_dims(ques_mask, 1)
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_q))
    mask_c = tf.expand_dims(context_mask, 2)
    S_T = tf.transpose(
        tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c), dim=1),
        (0, 2, 1))
    c2q = tf.matmul(S_, ques_emb)  #
    q2c = tf.matmul(tf.matmul(S_, S_T), context_emb)
    attention_outputs.extend([context_emb, c2q, context_emb * c2q])
    if config.q2c:
        attention_outputs.append(context_emb * q2c)

    return tf.concat(attention_outputs, axis=-1)
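The qanet_layers.trilinear call above scores each (context, question) position pair with a linear function of [c, q, c * q], as in BiDAF/QANet. A minimal sketch of such a similarity function under that assumption (the real helper also applies dropout via input_keep_prob, which is omitted here):

import tensorflow as tf

def trilinear_similarity(C, Q, scope="trilinear"):
    # C, Q: batch x c_len x q_len x d (context tiled over question positions and vice versa)
    with tf.variable_scope(scope):
        d = C.get_shape().as_list()[-1]
        w = tf.get_variable("w", shape=[3 * d, 1], dtype=tf.float32)
        feats = tf.concat([C, Q, C * Q], axis=-1)             # batch x c_len x q_len x 3d
        S = tf.tensordot(feats, w, axes=[[3], [0]])           # batch x c_len x q_len x 1
        return tf.squeeze(S, -1)                              # batch x c_len x q_len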
Example #8
def task_specific_attention(inputs,
                            output_size,
                            input_mask,
                            initializer=layers.xavier_initializer(),
                            activation_fn=tf.tanh,
                            scope=None,
                            reuse=None):
    """
    Performs task-specific attention reduction, using learned
    attention context vector (constant within task of interest).
    self-attentive sentence embedding

    Args:
        inputs: Tensor of shape [batch_size, units, input_size]
            `input_size` must be static (known)
            `units` axis will be attended over (reduced from output)
            `batch_size` will be preserved
        output_size: Size of output's inner (feature) dimension

    Returns:
        outputs: Tensor of shape [batch_size, output_dim].
    """
    assert len(
        inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None

    with tf.variable_scope(scope + '_attention', reuse=reuse) as scope:
        print("--------------using self attention----------------")
        attention_context_vector = tf.get_variable(
            name='attention_context_vector',
            shape=[output_size],
            initializer=initializer,
            dtype=tf.float32)
        input_projection = layers.fully_connected(
            inputs, output_size, activation_fn=activation_fn,
            scope=scope)  # batch x max_len x output_size

        vector_attn = tf.reduce_sum(tf.multiply(input_projection,
                                                attention_context_vector),
                                    axis=2)  # batch x max_len
        input_mask = tf.cast(input_mask, tf.float32)
        attention_weights = tf.nn.softmax(
            qanet_layers.mask_logits(vector_attn, mask=input_mask))
        attention_weights = tf.expand_dims(attention_weights, -1)
        # vector_attn_max = tf.reduce_max(qanet_layers.mask_logits(vector_attn, extend_mask), axis=1)

        # attention_weights = tf.exp(vector_attn-vector_attn_max) * tf.cast(extend_mask, tf.float32) # batch x max_len x 1
        # attention_weights = attention_weights / tf.reduce_sum(attention_weights, axis=1, keep_dims=True) # batch x max_len x 1

        weighted_projection = tf.multiply(input_projection, attention_weights)

        outputs = tf.reduce_sum(weighted_projection, axis=1)

        return outputs
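The reduction above is the usual self-attentive pooling: score each timestep against a learned context vector, softmax over timesteps with padded positions masked out, then take the weighted sum. A compact, self-contained sketch of the same computation (toy shapes and stand-alone names, not the original helper):

import tensorflow as tf

h = tf.random_normal([2, 6, 8])                       # batch x units x output_size (toy)
mask = tf.ones([2, 6])                                # 1 = keep, 0 = padding
u = tf.get_variable("toy_context_vector", shape=[8], dtype=tf.float32)

scores = tf.reduce_sum(h * u, axis=2)                 # batch x units
scores += -1e30 * (1.0 - mask)                        # mask out padded positions
alpha = tf.nn.softmax(scores)                         # attention weights, batch x units
pooled = tf.reduce_sum(tf.expand_dims(alpha, -1) * h, axis=1)   # batch x output_size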
Example #9
    def build_emebdding(self, *args, **kargs):

        reuse = kargs["reuse"]
        dropout_rate = tf.cond(self.is_training,
                               lambda: self.config.dropout_rate, lambda: 0.0)
        word_emb = tf.nn.embedding_lookup(self.emb_mat, self.sent_token)

        if self.config.with_word_drop:

            word_drop_rate = tf.cond(self.is_training,
                                     lambda: self.config.word_drop_rate,
                                     lambda: 0.0)
            word_emb, word_drop_mask = common_utils.word_dropout(
                word_emb, word_drop_rate)
        else:
            word_drop_mask = self.sent_token_mask
        entity_emb = tf.nn.embedding_lookup(self.emb_mat, self.entity_token)

        [_, _, entity_emb
         ] = layer_utils.my_lstm_layer(entity_emb,
                                       self.config.context_lstm_dim,
                                       input_lengths=self.entity_token_len,
                                       scope_name=self.config.scope,
                                       reuse=reuse,
                                       is_training=self.is_training,
                                       dropout_rate=dropout_rate,
                                       use_cudnn=self.config.use_cudnn)

        entity_mask = tf.expand_dims(self.entity_token_mask,
                                     axis=-1)  # batch x len x 1
        entity_emb = tf.reduce_max(qanet_layers.mask_logits(
            entity_emb, entity_mask),
                                   axis=1)

        entity_emb = tf.expand_dims(entity_emb, axis=1)
        seq_len = tf.reduce_max(self.sent_token_len)
        entity_emb = tf.tile(entity_emb, [1, seq_len, 1])

        mask = tf.expand_dims(self.sent_token_mask, -1)
        word_emb = tf.concat([word_emb, entity_emb], axis=-1)
        word_emb *= tf.cast(mask, tf.float32)

        print(word_emb.get_shape(), "=====word with entity========")
        if self.config.with_char:
            char_emb = self.build_char_embedding(self.sent_char,
                                                 self.sent_char_len,
                                                 self.char_mat,
                                                 is_training=self.is_training,
                                                 reuse=reuse)
            word_emb = tf.concat([word_emb, char_emb], axis=-1)

        return word_emb, word_drop_mask
Example #10
def bilinear_attention(ques_emb, context_emb, ques_mask, context_mask,
                       dropout_keep_prob, config):

    attention_outputs = []

    context_ = tf.transpose(context_emb, [0, 2, 1])
    hidden_dim = ques_emb.get_shape()[-1]

    attn_W = tf.get_variable(
        "AttnW",
        shape=[hidden_dim, hidden_dim],
        dtype=tf.float32,
        initializer=tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32))

    weighted_query = tf.tensordot(ques_emb, attn_W, axes=[[2], [0]])

    S = tf.matmul(weighted_query, context_)  # batch x q_len x c_len

    mask_q = tf.expand_dims(ques_mask, 1)  # batch x 1 x q_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x c_len

    S_max = tf.nn.softmax(
        tf.expand_dims(
            tf.reduce_max(qanet_layers.mask_logits(S, mask=mask_c), axis=1),
            1), -1)  # batch x 1 x c_len
    c2q = tf.matmul(S_max, context_emb)

    S_T = tf.nn.softmax(
        qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]),
                                 mask=mask_q))  # batch x c_len x q_len
    q2c = tf.matmul(S_T, ques_emb)  # batch x c_len x c_dim

    attention_outputs.extend([context_emb, q2c, context_emb * q2c])
    if config.q2c:
        attention_outputs.append(context_emb * c2q)
    return tf.concat(attention_outputs, axis=-1)
Example #11
def query_context_attention(query, context, max_query_len, max_context_len, 
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):
    with tf.variable_scope(scope+"_Context_to_Query_Attention_Layer", reuse=reuse):
        # context_ = tf.transpose(context, [0,2,1])
        hidden_dim = query.get_shape()[-1]

        query_ = tf.nn.l2_normalize(query, axis=-1)
        context_ = tf.nn.l2_normalize(context, axis=-1)

        # attn_W = tf.get_variable("AttnW", dtype=tf.float32,
        #                             shape=[hiddem_dim, hiddem_dim],
        #                             initializer=initializer)

        S = tf.matmul(query_, tf.transpose(context_, [0,2,1]))  # cosine similarity: batch x q_len x c_len

        # S = tf.matmul(weighted_query, context_)  # batch x q_len x c_len

        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)

        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
        c2q = tf.matmul(S_, context)

        S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
        q2c = tf.matmul(S_T, query)

        query_attention_outputs = c2q #tf.concat([query*c2q, c2q], axis=-1)
        # query_attention_outputs *= tf.expand_dims(tf.cast(query_mask, tf.float32), -1)

        context_attention_outputs = q2c #tf.concat([context*q2c, q2c], axis=-1)
        # context_attention_outputs *= tf.expand_dims(tf.cast(context_mask, tf.float32), -1)

        # query_attention_outputs = tf.nn.dropout(query_attention_outputs, 1 - dropout_ratio)
        # context_attention_outputs = tf.nn.dropout(context_attention_outputs, 1 - dropout_ratio)

        return query_attention_outputs, context_attention_outputs
Example #12
def memory_attention_v2(query,
                        memory,
                        query_mask,
                        scope,
                        memory_mask=None,
                        reuse=None,
                        attention_output="soft",
                        num_heads=8,
                        dropout_rate=0.0,
                        threshold=0.1,
                        apply_hard_attn=False):
    """
    query: batch x len x query_dim
    memory: batch x num_classes x mem_dim
    """
    with tf.variable_scope(scope + "_label_attention", reuse=reuse):

        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]

        # batch x num_classes x mem_dim
        # memory_ = tf.transpose(memory, [0,2,1])

        attn_W = tf.get_variable("AttnW",
                                 dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)

        # batch x len x mem_dim
        weighted_query = tf.einsum("abc,cd->abd", query, attn_W)
        S = tf.einsum("abd,ed->abe", weighted_query, memory)

        # batch x len x num_classes

        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)

        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print("==memory attention==", output.get_shape())

        return output
def memory_attention(query,
                     memory,
                     query_mask,
                     scope,
                     memory_mask=None,
                     reuse=None):
    """
    query: batch x len x query_dim
    memory: batch x num_classes x mem_dim
    """
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer",
                           reuse=reuse):

        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]

        # batch x num_classes x mem_dim
        memory_ = tf.transpose(memory, [0, 2, 1])

        attn_W = tf.get_variable("AttnW",
                                 dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)

        # batch x len x mem_dim
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])

        # batch x len x num_classes
        S = tf.matmul(weighted_query, memory_)

        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)

        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print(output.get_shape(), "=====")

        output = tf.reduce_sum(output, axis=1)

        return output
def memory_attention_v1(query,
                        memory,
                        query_mask,
                        scope,
                        memory_mask=None,
                        reuse=None,
                        attention_output="soft",
                        num_heads=8,
                        dropout_rate=0.0,
                        threshold=0.1,
                        apply_hard_attn=False):
    """
    query: batch x len x query_dim
    memory: batch x num_classes x mem_dim
    """
    with tf.variable_scope(scope + "_label_attention", reuse=reuse):

        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]

        # batch x num_classes x mem_dim
        memory_ = tf.transpose(memory, [0, 2, 1])

        attn_W = tf.get_variable("AttnW",
                                 dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)

        # batch x len x mem_dim
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])

        # batch x len x num_classes
        S = tf.matmul(weighted_query, memory_)

        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)

        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print(output.get_shape(), "==output shape===")
        if apply_hard_attn:
            presence_vec = output * output  # batch x num_class x dim
            presence_vec = tf.sqrt(tf.reduce_sum(presence_vec,
                                                 axis=-1))  # batch x num_class

            presence_vec = tf.nn.softmax(presence_vec, axis=-1)
            presence_mask = hard_attention_mask(presence_vec, threshold)

            output *= tf.expand_dims(presence_mask, -1)

            # presence_vec = tf.nn.softmax(presence_vec)
            # idx = tf.where(presence_vec > threshold)
            # batch_idxs = tf.range(0, tf.shape(output)[0])
            # batch_idxs = tf.expand_dims(batch_idxs, 1)

            # idxs = tf.concat([batch_idxs, idx], 1)
            # output = tf.gather_nd(output, idxs)

            print(output.get_shape(), "==hard attention output shape===")

        if attention_output == "soft" and not apply_hard_attn:
            class_dim = memory.get_shape()[1]
            class_attention = tf.get_variable("class_attn",
                                              dtype=tf.float32,
                                              shape=[query_dim],
                                              initializer=initializer)
            # batch x num_classes
            attn_output = tf.reduce_sum(output * class_attention, axis=-1)
            attn_output = tf.nn.softmax(attn_output)  # batch x num_classes
            attn_output = tf.expand_dims(attn_output,
                                         axis=-1)  # batch x num_classes x 1
            output = tf.reduce_sum(attn_output * output, axis=1)

        elif attention_output == "sum" and apply_hard_attn:
            output = tf.reduce_sum(output, axis=1)

        elif attention_output == "multi_head":
            # get memory mask from the hard-attention gate
            # (note: presence_mask is only defined when apply_hard_attn is True)
            ignore_padding = (1 - presence_mask)
            ignore_padding = attention_bias_ignore_padding(ignore_padding)
            encoder_self_attention_bias = ignore_padding

            output = multihead_attention_texar(
                output,
                memory=None,
                memory_attention_bias=encoder_self_attention_bias,
                num_heads=num_heads,
                num_units=None,
                dropout_rate=dropout_rate,
                scope="multihead_attention")
            output = tf.reduce_sum(output, axis=1)

        return output
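The hard-attention branch above depends on a hard_attention_mask helper that is not shown. A minimal sketch of the kind of gate it is assumed to be (an assumption, not the original implementation): keep only the classes whose presence probability exceeds the threshold.

import tensorflow as tf

def hard_attention_mask(presence_vec, threshold):
    # presence_vec: batch x num_classes, a softmax distribution over classes
    return tf.cast(presence_vec > threshold, tf.float32)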
Example #15
    def build_interactor(self, sent1_emb, sent2_emb, sent1_len, sent2_len,
                         sent1_mask, sent2_mask, *args, **kargs):

        num_lstm_layers = kargs["num_lstm_layers"]
        dropout_rate = tf.cond(self.is_training,
                               lambda: self.config.dropout_rate, lambda: 0.0)

        input_dim = sent1_emb.get_shape()[-1]
        with tf.variable_scope(self.config.scope + "_embed_hishway"):
            sent1_repres = match_utils.multi_highway_layer(
                sent1_emb, input_dim, self.config.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            sent2_repres = match_utils.multi_highway_layer(
                sent2_emb, input_dim, self.config.highway_layer_num)
        match_dim = self.emb_size
        for i in range(num_lstm_layers):
            with tf.variable_scope(self.config.scope +
                                   "_densely_co_attentive_{}".format(i),
                                   reuse=None):
                sent1_repres_, match_dim_ = self.build_encoder(sent1_repres,
                                                               sent1_len,
                                                               reuse=None)
                sent2_repres_, match_dim_ = self.build_encoder(sent2_repres,
                                                               sent2_len,
                                                               reuse=True)
                match_dim += match_dim_
                print("===before=====", i, sent1_repres_.get_shape(),
                      sent2_repres_.get_shape())
                if self.config.get("co_attention", None):
                    [query_attention, context_attention
                     ] = drcn_utils.query_context_attention(sent1_repres_,
                                                            sent2_repres_,
                                                            sent1_len,
                                                            sent2_len,
                                                            sent1_mask,
                                                            sent2_mask,
                                                            dropout_rate,
                                                            self.config.scope,
                                                            reuse=None)

                    sent1_repres = tf.concat(
                        [sent1_repres_, query_attention, sent1_repres],
                        axis=-1)
                    sent2_repres = tf.concat(
                        [sent2_repres_, context_attention, sent2_repres],
                        axis=-1)
                    match_dim += match_dim_
                else:
                    sent1_repres = tf.concat([sent1_repres_, sent1_repres],
                                             axis=-1)
                    sent2_repres = tf.concat([sent2_repres_, sent2_repres],
                                             axis=-1)

                print("====i====", sent1_repres.get_shape(),
                      sent2_repres.get_shape())
                if np.mod(i + 1, 2) == 0 and self.config.with_auto_encoding:
                    sent1_repres = self.auto_encoder(sent1_repres, reuse=None)
                    sent2_repres = self.auto_encoder(sent2_repres, reuse=True)

                if self.config.recurrent_layer_norm:
                    sent1_repres = tf.contrib.layers.layer_norm(
                        sent1_repres, reuse=None, scope="lstm_layer_norm")
                    sent2_repres = tf.contrib.layers.layer_norm(
                        sent2_repres, reuse=True, scope="lstm_layer_norm")

        mask_q = tf.expand_dims(sent1_mask, -1)
        mask_c = tf.expand_dims(sent2_mask, -1)

        v_1_max = tf.reduce_max(qanet_layers.mask_logits(sent1_repres, mask_q),
                                axis=1)
        v_2_max = tf.reduce_max(qanet_layers.mask_logits(sent2_repres, mask_c),
                                axis=1)

        v = tf.concat([
            v_1_max, v_2_max, v_1_max * v_2_max, v_1_max - v_2_max,
            tf.abs(v_1_max - v_2_max)
        ],
                      axis=-1)
        v = tf.nn.dropout(v, 1 - dropout_rate)
        match_dim = match_dim * 5

        return v_1_max, v_2_max, v, match_dim
Example #16
    def build_encoder(self, input_lengths, input_mask, *args, **kargs):

        reuse = kargs["reuse"]
        word_emb = self.build_emebdding(*args, **kargs)
        dropout_rate = tf.cond(self.is_training,
                               lambda: self.config.dropout_rate, lambda: 0.0)

        word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
        with tf.variable_scope(self.config.scope + "_input_highway",
                               reuse=reuse):
            input_dim = word_emb.get_shape()[-1]
            sent_repres = match_utils.multi_highway_layer(
                word_emb, input_dim, self.config.highway_layer_num)

            if self.config.rnn == "lstm":
                [sent_repres_fw, sent_repres_bw, sent_repres
                 ] = layer_utils.my_lstm_layer(sent_repres,
                                               self.config.context_lstm_dim,
                                               input_lengths=input_lengths,
                                               scope_name=self.config.scope,
                                               reuse=reuse,
                                               is_training=self.is_training,
                                               dropout_rate=dropout_rate,
                                               use_cudnn=self.config.use_cudnn)
                match_dim = self.config.context_lstm_dim * 6

            elif self.config.rnn == "slstm":

                word_emb_proj = tf.layers.dense(word_emb,
                                                self.config.slstm_hidden_size)

                initial_hidden_states = word_emb_proj
                initial_cell_states = tf.identity(initial_hidden_states)

                [new_hidden_states, new_cell_states, dummynode_hidden_states
                 ] = slstm_utils.slstm_cell(self.config,
                                            self.config.scope,
                                            self.config.slstm_hidden_size,
                                            input_lengths,
                                            initial_hidden_states,
                                            initial_cell_states,
                                            self.config.slstm_layer_num,
                                            dropout_rate,
                                            reuse=reuse)

                sent_repres = new_hidden_states
                match_dim = self.config.slstm_hidden_size * 3

            if self.config.multi_head:
                mask = tf.cast(input_mask, tf.float32)
                ignore_padding = (1 - mask)
                ignore_padding = label_network_utils.attention_bias_ignore_padding(
                    ignore_padding)
                encoder_self_attention_bias = ignore_padding

                sent_repres = label_network_utils.multihead_attention_texar(
                    sent_repres,
                    memory=None,
                    memory_attention_bias=encoder_self_attention_bias,
                    num_heads=8,
                    num_units=128,
                    dropout_rate=dropout_rate,
                    scope="multihead_attention")

            v_attn = self_attn.multi_dimensional_attention(
                sent_repres, input_mask,
                'multi_dim_attn_for_%s' % self.config.scope, 1 - dropout_rate,
                self.is_training, self.config.weight_decay, "relu")

            mask = tf.expand_dims(input_mask, -1)
            v_sum = tf.reduce_sum(sent_repres * tf.cast(mask, tf.float32), 1)
            v_ave = tf.div(
                v_sum,
                tf.expand_dims(
                    tf.cast(input_lengths, tf.float32) + EPSILON, -1))

            v_max = tf.reduce_max(qanet_layers.mask_logits(sent_repres, mask),
                                  axis=1)

            v_last = esim_utils.last_relevant_output(sent_repres,
                                                     input_lengths)

            out = tf.concat([v_ave, v_max, v_last, v_attn], axis=-1)

        return out, match_dim
Example #17
    def build_interactor(self, sent1_repres, sent2_repres, sent1_len,
                         sent2_len, sent1_mask, sent2_mask, *args, **kargs):
        reuse = kargs["reuse"]
        input_dim = sent1_repres.get_shape()[-1]
        dropout_rate = tf.cond(self.is_training,
                               lambda: self.config.dropout_rate, lambda: 0.0)

        with tf.variable_scope(self.config.scope + "_interaction_module",
                               reuse=reuse):

            if self.config.with_self_attention:
                v_1_attn = esim_utils.multihead_attention(
                    sent1_repres,
                    sent1_repres,
                    num_units=None,
                    num_heads=self.config.num_heads,
                    dropout_rate=dropout_rate,
                    is_training=True,
                    causality=False,
                    scope="multihead_attention",
                    reuse=None)

                v_2_attn = esim_utils.multihead_attention(
                    sent2_repres,
                    sent2_repres,
                    num_units=None,
                    num_heads=self.config.num_heads,
                    dropout_rate=dropout_rate,
                    is_training=True,
                    causality=False,
                    scope="multihead_attention",
                    reuse=True)

                sent1_repres = tf.concat([sent1_repres, v_1_attn], axis=-1)
                sent2_repres = tf.concat([sent2_repres, v_2_attn], axis=-1)

            [query_attention_outputs, context_attention_outputs
             ] = esim_utils.query_context_attention(sent1_repres,
                                                    sent2_repres,
                                                    sent1_len,
                                                    sent2_len,
                                                    sent1_mask,
                                                    sent2_mask,
                                                    dropout_rate,
                                                    self.config.scope,
                                                    reuse=reuse)

            if self.config.rnn == "lstm":
                [sent1_repres_fw, sent1_repres_bw, sent1_repres
                 ] = layer_utils.my_lstm_layer(query_attention_outputs,
                                               self.config.context_lstm_dim,
                                               input_lengths=sent1_len,
                                               scope_name=self.config.scope,
                                               reuse=None,
                                               is_training=self.is_training,
                                               dropout_rate=dropout_rate,
                                               use_cudnn=self.config.use_cudnn)

                [sent2_repres_fw, sent2_repres_bw, sent2_repres
                 ] = layer_utils.my_lstm_layer(context_attention_outputs,
                                               self.config.context_lstm_dim,
                                               input_lengths=sent2_len,
                                               scope_name=self.config.scope,
                                               reuse=True,
                                               is_training=self.is_training,
                                               dropout_rate=dropout_rate,
                                               use_cudnn=self.config.use_cudnn)
                match_dim = self.config.context_lstm_dim * 8

            elif self.config.rnn == "slstm":

                sent1_initial_hidden_states = tf.layers.dense(
                    query_attention_outputs, self.config.slstm_hidden_size)
                sent1_initial_cell_states = tf.identity(
                    sent1_initial_hidden_states)

                [
                    new_sent1_hidden_states, new_sent1_cell_states,
                    dummynode_sent1_hidden_states
                ] = slstm_utils.slstm_cell(self.config,
                                           self.config.scope,
                                           self.config.slstm_hidden_size,
                                           sent1_len,
                                           sent1_initial_hidden_states,
                                           sent1_initial_cell_states,
                                           self.config.slstm_layer_num,
                                           dropout_rate,
                                           reuse=None)

                sent1_repres = new_sent1_hidden_states

                sent2_initial_hidden_states = tf.layers.dense(
                    context_attention_outputs, self.config.slstm_hidden_size)
                sent2_initial_cell_states = tf.identity(
                    sent2_initial_hidden_states)

                [
                    new_sent2_hidden_states, new_sent2_cell_states,
                    dummynode_sent2_hidden_states
                ] = slstm_utils.slstm_cell(self.config,
                                           self.config.scope,
                                           self.config.slstm_hidden_size,
                                           sent2_len,
                                           sent2_initial_hidden_states,
                                           sent2_initial_cell_states,
                                           self.config.slstm_layer_num,
                                           dropout_rate,
                                           reuse=True)

                sent2_repres = new_sent2_hidden_states
                match_dim = self.config.slstm_hidden_size * 4

            v_1_sum = tf.reduce_sum(sent1_repres, 1)
            v_1_ave = tf.div(
                v_1_sum,
                tf.expand_dims(tf.cast(sent1_len, tf.float32) + EPSILON, -1))

            v_2_sum = tf.reduce_sum(sent2_repres, 1)
            v_2_ave = tf.div(
                v_2_sum,
                tf.expand_dims(tf.cast(sent2_len, tf.float32) + EPSILON, -1))

            # v_1_max = tf.reduce_max(sent1_repres, 1)
            # v_2_max = tf.reduce_max(sent2_repres, 1)

            mask_q = tf.expand_dims(sent1_mask, -1)
            mask_c = tf.expand_dims(sent2_mask, -1)

            v_1_max = tf.reduce_max(qanet_layers.mask_logits(
                sent1_repres, mask_q),
                                    axis=1)
            v_2_max = tf.reduce_max(qanet_layers.mask_logits(
                sent2_repres, mask_c),
                                    axis=1)

            out1 = tf.concat([v_1_ave, v_1_max], axis=-1)
            out2 = tf.concat([v_2_ave, v_2_max], axis=-1)

            out = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

            return out1, out2, out, match_dim