Example #1
def multi_dimensional_attention(rep_tensor,
                                rep_mask,
                                scope=None,
                                keep_prob=1.,
                                is_train=None,
                                wd=0.,
                                activation='elu',
                                tensor_dict=None,
                                name=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'multi_dimensional_attention'):
        map1 = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map1',
                              activation, False, wd, keep_prob, is_train)
        map2 = bn_dense_layer(map1, ivec, True, 0., 'bn_dense_map2', 'linear',
                              False, wd, keep_prob, is_train)
        map2_masked = exp_mask_for_high_rank(map2, rep_mask)

        soft = tf.nn.softmax(map2_masked, 1)  # bs,sl,vec
        attn_output = tf.reduce_sum(soft * rep_tensor, 1)  # bs, vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = soft

        return attn_output
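A minimal usage sketch (hypothetical shapes; bn_dense_layer and exp_mask_for_high_rank are assumed importable from the project's nn.py, TF 1.x):

import tensorflow as tf

# toy batch: 4 sequences, max length 20, 64-dim token vectors
rep = tf.random_normal([4, 20, 64])
mask = tf.sequence_mask([20, 15, 9, 20], maxlen=20)  # bool, [4, 20]
pooled = multi_dimensional_attention(rep, mask, scope='s2t_attn',
                                     keep_prob=0.9, is_train=tf.constant(True))
# pooled: [4, 64] -- one attention-pooled vector per sequence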
Example #2
def time_aware_attention(train_inputs, embed, mask, embedding_size, k):
    with tf.variable_scope('time_aware_attention'):
        attn_weights = tf.Variable(
            tf.truncated_normal([embedding_size, k],
                                stddev=1.0 / math.sqrt(k)))
        attn_biases = tf.Variable(tf.zeros([k]))

        # add the bias to the attention weights
        attn_embed = tf.nn.bias_add(attn_weights, attn_biases)

        # project each embedding E_i through the attention matrix
        attn_scalars = tf.tensordot(embed, attn_embed, axes=[[2], [0]])

        # absolute time distance between events
        train_delta = tf.abs(train_inputs[:, :, 1])

        # distance function is log(dist+1)
        dist_fun = tf.log(tf.to_float(train_delta) + 1.0)

        # reshape dist_fun to [bs, sl, 1] for broadcasting
        dist_fun = tf.reshape(
            dist_fun, [tf.shape(dist_fun)[0],
                       tf.shape(dist_fun)[1], 1])

        # the attention logits
        attn_logits = tf.multiply(attn_scalars, dist_fun)

        # sum and mask the attention logits
        attn_logits_sum = tf.reduce_sum(attn_logits, -1, keepdims=True)
        attn_logits_sum = exp_mask_for_high_rank(attn_logits_sum, mask)

        # get weights via softmax
        attn_softmax = tf.nn.softmax(attn_logits_sum, 1)

        # the weighted sum
        attn_embed_weighted = tf.multiply(attn_softmax, embed)
        attn_embed_weighted = mask_for_high_rank(attn_embed_weighted, mask)

        reduced_embed = tf.reduce_sum(attn_embed_weighted, 1)
        # two scaling factors: log of sequence length and attention concentration
        scalar1 = tf.log(tf.to_float(tf.shape(embed)[1]) + 1.0)
        scalar2 = tf.reduce_sum(tf.pow(attn_softmax, 2), 1)
        # apply both scaling factors to the reduced embedding
        reduced_embed = tf.multiply(reduced_embed, scalar1)
        reduced_embed = tf.multiply(reduced_embed, scalar2)

        return reduced_embed, attn_embed_weighted
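A hypothetical call, assuming each train_inputs position stacks (item_id, time_delta) and that the masking helpers come from the same project:

import math
import tensorflow as tf

train_inputs = tf.placeholder(tf.int32, [None, None, 2])  # (item_id, delta_t)
emb_table = tf.get_variable('item_emb', [10000, 128])
embed = tf.nn.embedding_lookup(emb_table, train_inputs[:, :, 0])  # bs,sl,128
mask = tf.not_equal(train_inputs[:, :, 0], 0)  # id 0 reserved for padding
reduced, weighted = time_aware_attention(train_inputs, embed, mask,
                                         embedding_size=128, k=1)
# reduced: [bs, 128]; weighted: [bs, sl, 128]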
Example #3
File: nn.py  Project: zkyzq/BiBloSA
def pooling_with_mask(rep_tensor, rep_mask, method='max', scope=None):
    # rep_tensor has one more rank than rep_mask
    with tf.name_scope(scope or '%s_pooling' % method):

        if method == 'max':
            rep_tensor_masked = exp_mask_for_high_rank(rep_tensor, rep_mask)
            output = tf.reduce_max(rep_tensor_masked, -2)
        elif method == 'mean':
            rep_tensor_masked = mask_for_high_rank(rep_tensor,
                                                   rep_mask)  # [...,sl,hn]
            rep_sum = tf.reduce_sum(rep_tensor_masked, -2)  #[..., hn]
            denominator = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1,
                                        True)  # [..., 1]
            denominator = tf.where(
                tf.equal(denominator, tf.zeros_like(denominator, tf.int32)),
                tf.ones_like(denominator, tf.int32), denominator)
            output = rep_sum / tf.cast(denominator, tf.float32)
        else:
            raise AttributeError('No pooling method named %s' % method)
        return output
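Both branches in a quick sketch (toy shapes; the masking helpers from the same nn.py are assumed in scope):

rep = tf.random_normal([4, 20, 64])
mask = tf.sequence_mask([20, 15, 9, 20], maxlen=20)
max_pooled = pooling_with_mask(rep, mask, method='max')    # [4, 64]
mean_pooled = pooling_with_mask(rep, mask, method='mean')  # [4, 64]
# padding never leaks in: max masks padded positions to a very negative
# value, mean divides by the true token count (clamped to 1 so all-padding
# rows do not divide by zero)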
Example #4
def self_attention_for_selected_head(
        head_selection, head_org_idx, sl_head, rep_head_mask,
        dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
        rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
):
    # data for self-attention
    rep_map_dp = dropout(rep_map, keep_prob, is_train)
    rep_dep_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, dep_selection)
    rep_head_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, head_selection)

    # mask generation
    dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_head, 1])
    head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, sl_dep])

    if direction is None:
        direct_mask = tf.not_equal(head_idxs, dep_idxs)  # [bs, slh, sld]
    elif direction == 'forward':
        direct_mask = tf.greater(head_idxs, dep_idxs)  # [bs, slh, sld]
    else:
        direct_mask = tf.less(head_idxs, dep_idxs)  # [bs, slh, sld]
    # [bs, slh, sld]
    rep_mask_tile = tf.logical_and(tf.expand_dims(rep_dep_mask, 1), tf.expand_dims(rep_head_mask, 2))
    attn_mask = tf.logical_and(direct_mask, rep_mask_tile)  # [bs, slh, sld]

    # tensor tile
    rep_map_tile = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_head, 1, 1])  # bs,slh,sld,vec
    with tf.variable_scope('attention'):  # bs,slh,sld,vec
        f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
        dependent = linear(rep_dep_tensor_dp, ivec, False, scope='linear_dependent')  # bs,sld,vec
        dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sld,vec
        head = linear(rep_head_tensor_dp, ivec, False, scope='linear_head')  # bs,slh,vec
        head_etd = tf.expand_dims(head, 2)  # bs,slh,1,vec

        logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,slh,sld,vec
        logits_masked = exp_mask_for_high_rank(logits, attn_mask)  # bs,slh,sld,vec
        attn_score = tf.nn.softmax(logits_masked, 2)  # bs,slh,sld,vec
        attn_score = mask_for_high_rank(attn_score, attn_mask)
        attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,slh,vec -> head_org_idx
    return attn_result
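The directional mask construction is the subtle part; a self-contained check with toy indices (hypothetical values, plain TF 1.x):

import tensorflow as tf

head_org_idx = tf.constant([[0, 2]])    # bs=1, sl_head=2
dep_org_idx = tf.constant([[1, 2, 3]])  # bs=1, sl_dep=3
dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, 2, 1])    # [1,2,3]
head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, 3])  # [1,2,3]
forward = tf.greater(head_idxs, dep_idxs)  # 'forward': attend to earlier deps
with tf.Session() as sess:
    print(sess.run(forward))  # head index 2 may attend only to dep index 1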
Example #5
def first_level_sa(rep_tensor,
                   rep_mask,
                   keep_prob=1.,
                   is_train=None,
                   wd=0.,
                   activation='relu'):
    # bs, sw, cl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2], tf.shape(rep_tensor)[3]
    ivec = rep_tensor.get_shape()[3]
    with tf.variable_scope('first_level_sa'):
        print('original: ', rep_tensor.get_shape())
        map1 = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map1',
                              activation, False, wd, keep_prob, is_train)
        print('map1: ', map1.get_shape())
        map2 = bn_dense_layer(map1, ivec, True, 0., 'bn_dense_map2', 'linear',
                              False, wd, keep_prob, is_train)
        print('map2: ', map2.get_shape())
        map2_masked = exp_mask_for_high_rank(map2, rep_mask)

        soft = tf.nn.softmax(map2_masked, 2)  # bs,sk,code_len,vec
        attn_output = tf.reduce_sum(soft * rep_tensor, 2)  # bs, sk, vec

        return attn_output
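A usage sketch for the rank-4 input this expects (e.g. tokens grouped into segments: bs, sw, code_len, vec); nn.py helpers assumed importable:

rep = tf.random_normal([4, 10, 7, 64])        # bs, sw, code_len, vec
mask = tf.cast(tf.ones([4, 10, 7]), tf.bool)  # all positions valid here
out = first_level_sa(rep, mask, keep_prob=0.9, is_train=tf.constant(True))
# softmax runs over axis 2 (code_len), so out: [4, 10, 64]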
Example #6
def directional_attention_with_dense(rep_tensor,
                                     rep_mask,
                                     direction=None,
                                     scope=None,
                                     keep_prob=1.,
                                     is_train=None,
                                     wd=0.,
                                     activation='elu',
                                     tensor_dict=None,
                                     name=None,
                                     hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' %
                           (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        elif direction == 'forward':
            direct_mask = tf.greater(sl_row, sl_col)
        else:
            direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1),
                                [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp,
                               ivec,
                               False,
                               scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
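The usual pairing runs this twice, once per direction, and concatenates (a sketch, helpers from the same nn.py assumed):

rep = tf.random_normal([4, 20, 64])
mask = tf.sequence_mask([20, 15, 9, 20], maxlen=20)
fw = directional_attention_with_dense(rep, mask, 'forward', 'dir_attn_fw',
                                      keep_prob=0.9, is_train=tf.constant(True))
bw = directional_attention_with_dense(rep, mask, 'backward', 'dir_attn_bw',
                                      keep_prob=0.9, is_train=tf.constant(True))
context = tf.concat([fw, bw], -1)  # [4, 20, 128]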
Example #7
def simple_block_attention(rep_tensor,
                           rep_mask,
                           block_len=5,
                           scope=None,
                           direction=None,
                           keep_prob=1.,
                           is_train=None,
                           wd=0.,
                           activation='elu',
                           hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(
                    tf.divide(tf.cast(sl, tf.float32),
                              tf.cast(block_len, tf.float32))), tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor,
                 tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat([
                rep_mask,
                tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)
            ], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp,
                [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp,
                                        [bs, block_num, block_len])  # bs,bn,bl

            # non-linear
            rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0.,
                                     'bn_dense_map', activation, False, wd,
                                     keep_prob, is_train)  # bs,bn,bl,vec
            rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2),
                                   [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
            # rep_map_dp = dropout(rep_map, keep_prob, is_train)
            bn = block_num
            bl = block_len

        with tf.variable_scope('self_attention'):
            # @2.self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0),
                [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2),
                                      [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3),
                                      [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile,
                                       rep_mask_tile,
                                       name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent_head = linear(rep_map, 2 * ivec, False, 0.,
                                    'linear_dependent_head', False, wd,
                                    keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score,
                                            attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                             3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(self_attn_result, ivec, True,
                                                0., 'bn_dense_map', 'linear',
                                                False, wd, keep_prob,
                                                is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked,
                                             2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32),
                                                   tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask,
                direction, 'disa', keep_prob, is_train, wd,
                activation)  # [bs,bn,vec]

            block_ct_res_tile = tf.tile(tf.expand_dims(
                block_ct_res, 2), [1, 1, bl, 1])  # [bs,bn,vec] -> [bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # inputs: 1. rep_map [bs,bn,bl,vec]; 2. self_attn_result [bs,bn,bl,vec]; 3. block_ct_res_tile [bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile],
                -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(rep_tensor_with_ct, 2 * ivec, True,
                                          0., 'linear_new_context_and_gate',
                                          False, wd, keep_prob,
                                          is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2,
                                         3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError('unknown activation: %s' % activation)
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (
                1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
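A usage sketch; direction is mandatory here, the sequence is zero-padded up to a multiple of block_len internally, and the padded tail is sliced off before returning (directional_attention_with_dense from the previous example must be in scope):

rep = tf.random_normal([4, 23, 64])  # sl=23 with block_len=5 -> 5 blocks
mask = tf.sequence_mask([23, 17, 9, 23], maxlen=23)
out = simple_block_attention(rep, mask, block_len=5, scope='block_fw',
                             direction='forward', keep_prob=0.9,
                             is_train=tf.constant(True))
# out: [4, 23, 64] -- intra-block self-attention fused with inter-block context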
Example #8
def gated_self_attention(rep_tensor,
                         rep_mask,
                         scope=None,
                         keep_prob=1.,
                         is_train=None,
                         wd=0.,
                         activation='elu',
                         hn=None,
                         position_mask_type=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'gated_self_attention_%s' %
                           (position_mask_type or 'None')):
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)

        # mask generation
        rep_mask_epd1 = tf.expand_dims(rep_mask, 1)  # bs,1,sl
        rep_mask_epd2 = tf.expand_dims(rep_mask, 2)  # bs,sl,1
        rep_mask_mat = tf.logical_and(rep_mask_epd1, rep_mask_epd2)  # bs,sl,sl

        if position_mask_type in ['forward', 'backward']:
            sl_indices = tf.range(sl, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if position_mask_type == 'forward':
                position_mask = tf.greater(sl_row, sl_col)
            else:
                position_mask = tf.greater(sl_col, sl_row)
            position_mask = tf.tile(tf.expand_dims(position_mask, 0),
                                    [bs, 1, 1])
            position_mask = tf.logical_and(rep_mask_mat, position_mask)

        else:
            position_mask = rep_mask_mat

        position_mask_ft = tf.cast(position_mask, tf.float32)

        # attention
        with tf.variable_scope('intra_sent_attn'):  # bs,sl,hn
            # rep_tensor_mean = pooling_with_mask(rep_tensor, rep_mask, 'mean')  # bs, hn
            rep_tensor_for_attn = rep_map

            pre_align_score = bn_dense_layer(  # bs,sl,hn
                rep_tensor_for_attn, ivec, True, 0., 'intra_sent_map1',
                activation, False, wd, keep_prob, is_train)
            align_score = bn_dense_layer(  # bs,sl,hn
                pre_align_score, ivec, True, 0., 'intra_sent_map2', 'linear',
                False, wd, keep_prob, is_train)
            align_score_w_mask = exp_mask_for_high_rank(align_score,
                                                        rep_mask)  # bs,sl,hn
            exp_align_score = tf.exp(align_score_w_mask)  # bs,sl,hn

            accum_z_deno = tf.matmul(position_mask_ft, exp_align_score)
            accum_z_deno = tf.where(
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno, tf.ones_like(accum_z_deno))

            rep_mul_score = rep_map * exp_align_score
            accum_rep_mul_score = tf.matmul(position_mask_ft, rep_mul_score)

            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope('context_fusion_gate'):
            fusion_gate = tf.nn.sigmoid(
                bn_dense_layer([rep_map, attn_res], ivec, True, 0.,
                               'linear_fusion_gate', activation, False, wd,
                               keep_prob, is_train))
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_res

        output = mask_for_high_rank(output, rep_mask)
        return output
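A usage sketch (toy shapes, nn.py helpers assumed). Note the matmul trick above: multiplying the 0/1 position mask into the exponentiated scores accumulates each row's masked softmax numerator and denominator without materializing a bs,sl,sl,vec logits tensor:

rep = tf.random_normal([4, 20, 64])
mask = tf.sequence_mask([20, 15, 9, 20], maxlen=20)
out = gated_self_attention(rep, mask, scope='gsa', keep_prob=0.9,
                           is_train=tf.constant(True), hn=64,
                           position_mask_type='forward')
# out: [4, 20, 64]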
Example #9
def visit_sa_with_dense(rep_tensor,
                        keep_prob=1.,
                        is_train=None,
                        wd=0.,
                        activation='relu',
                        hn=None,
                        is_scale=True,
                        is_plus_sa=True):

    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
            tf.bool)  # sw_len, sw_len -- True everywhere off the diagonal

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec

            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(
                dependent, 1)  # batch_size, code_len,code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(
                head, 2)  # batch_size, code_len,code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact),
                                ivec,
                                True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
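A usage sketch (no mask argument here, so every position attends to every other; scaled_tanh and the other nn.py helpers are assumed in scope):

rep = tf.random_normal([4, 12, 64])  # bs, visit_len, vec
out = visit_sa_with_dense(rep, keep_prob=0.9, is_train=tf.constant(True),
                          activation='relu')
# out: [4, 12, 64]; with is_plus_sa=False the function short-circuits
# and returns the dense projection rep_map without attention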
Example #10
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1./scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        elif direction == 'forward':
            direct_mask = tf.greater(sl_row, sl_col)
        else:
            direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head') # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                dis_mask = -tf.log(tf.cast(tf.abs(sl_col - sl_row) + 
                                           tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
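A usage sketch of the window behavior (toy shapes; dis_mask_for_high_rank and the other helpers come from the same project):

rep = tf.random_normal([4, 20, 64])
mask = tf.sequence_mask([20, 15, 9, 20], maxlen=20)
out = masked_positional_self_attention(3, rep, mask, direction=None,
                                       scope='pos_attn')
# with direction=None, each token attends only to neighbors with
# |i - j| < sigma = 3, diagonal excluded; out: [4, 20, 64]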