Exemple #1
0
def multi_head_w_global(  # Added and Modified by xxx xxx
        x, scope, n_state, n_head, train=None, scale=False, resid_dropout=0.9, attn_dropout=0.9,
        use_global=False, use_direction=False, b=None, global_afn='exp',
):
    """
    Multi-head self-attention with an optional feature-wise ("global") score.

    ``x`` is projected to queries, keys and values by a single `conv1d`, split
    into heads, and combined in one of two ways:

    * ``use_global=False``: masked softmax over the token-to-token scores,
      followed by a weighted sum of the values.
    * ``use_global=True``: the activated token-to-token scores (times the
      mask) are combined with ``exp`` of a second projection of ``x``; the
      numerator and the normaliser are both accumulated with matmuls and then
      divided.

    :param x: input tensor; ``shape_list(x)[-2]`` is taken as the sequence length
    :param scope: variable scope name
    :param n_state: width of the q/k/v and output projections, must be divisible by ``n_head``
    :param n_head: number of attention heads
    :param train: forwarded to `conv1d` and `dropout`
    :param scale: if true, scores are multiplied by rsqrt of the last dim of the per-head values
    :param resid_dropout: forwarded to `dropout` for the projected output
    :param attn_dropout: forwarded to `dropout` for the attention weights; in the
        global branch its square root is applied to each of the two factors
        (presumably a keep probability -- TODO confirm against `dropout`)
    :param use_global: select the global branch described above
    :param use_direction: if false, a lower-triangular mask is built here and
        ``b`` is ignored; if true, the caller-supplied ``b`` is used as the mask
    :param b: mask used when ``use_direction`` is true (presumably broadcastable
        to [bs,hd,sl,sl] -- TODO confirm with callers)
    :param global_afn: activation name passed to `activation_name_to_func`
    :return: the attention output after the final projection and dropout
    :raises ValueError: if ``use_direction`` is true but no ``b`` is given
    """
    assert n_state % n_head == 0
    if use_direction and b is None:
        # Without this check the failure only shows up later as an opaque
        # error when the scores are multiplied by ``None``.
        raise ValueError("multi_head_w_global: `b` must be provided when use_direction=True")

    with tf.variable_scope(scope):
        sl = shape_list(x)[-2]
        if not use_direction:
            b = tf.matrix_band_part(tf.ones([sl, sl]), -1, 0)  # Lower triangular part.
            b = tf.reshape(b, [1, 1, sl, sl])

        c = conv1d(x, 'c_attn_openai_trans', n_state * 3, 1, train=train)
        q, k, v = tf.split(c, 3, 2)
        q = split_heads(q, n_head)  # bs,hd,sl,d
        k = split_heads(k, n_head, k=True)  # bs,hd,d,sl
        v = split_heads(v, n_head)  # bs,hd,sl,d

        # 1. t2t: token-to-token scores
        w = tf.matmul(q, k)  # bs,hd,sl,sl
        if scale:
            n_state_hd = shape_list(v)[-1]
            w = w * tf.rsqrt(tf.cast(n_state_hd, tf.float32))

        if use_global:
            # Un-normalised pairwise weights; masked positions become zero.
            e_w = activation_name_to_func(global_afn)(w) * b

            # 2. s2t: per-feature scores from a separate projection of x
            w_g = split_heads(conv1d(x, "c_w_g", n_state, 1, train=train), n_head)  # bs,hd,sl,d
            e_w_g = tf.exp(w_g)  # bs,hd,sl,d

            # 3. mtsa: the normaliser is computed before dropout is applied
            accum_z_deno = tf.matmul(e_w, e_w_g)  # bs,hd,sl,dim
            accum_z_deno = tf.where(  # in case of NaN and Inf
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno)
            )
            e_w = dropout(e_w, math.sqrt(attn_dropout), train)
            e_w_g = dropout(e_w_g, math.sqrt(attn_dropout), train)
            rep_mul_score = v * e_w_g
            accum_rep_mul_score = tf.matmul(e_w, rep_mul_score)
            a = accum_rep_mul_score / accum_z_deno
        else:
            # Push masked scores to a large negative value before the softmax.
            w = w * b + -1e9 * (1 - b)
            w = tf.nn.softmax(w)
            w = w * b  # zero the masked positions again after the softmax
            w = dropout(w, attn_dropout, train)  # attention dropout
            a = tf.matmul(w, v)

        a = merge_heads(a)
        a = conv1d(a, 'c_proj_openai_trans', n_state, 1, train=train)
        a = dropout(a, resid_dropout, train)
        return a
Exemple #2
0
def mlp(x, scope, n_state, train=None, afn='gelu', resid_dropout=0.9):
    """
    Two-projection feed-forward block with an activation in between.

    :param x: input tensor; its last dimension is reused as the size argument of the second projection
    :param scope: variable scope name
    :param n_state: size argument of the first projection
    :param train: forwarded to `conv1d` and `dropout`
    :param afn: activation name resolved through `act_name2fn`
    :param resid_dropout: forwarded to `dropout` for the output
    :return: the second projection after dropout
    """
    with tf.variable_scope(scope):
        in_width = shape_list(x)[-1]
        activation = act_name2fn(afn)

        hidden = conv1d(x, 'c_fc_openai_trans', n_state, 1, train=train)
        hidden = activation(hidden)

        projected = conv1d(hidden, 'c_proj_openai_trans', in_width, 1, train=train)
        return dropout(projected, resid_dropout, train)
Exemple #3
0
def qqp_logits_sentence_encoding(s1_rep, s2_rep, afn, n_state, is_train, clf_dropout, highway=False):
    """
    Produce pair-classification logits from two sentence representations.

    The features are the concatenation (last axis) of ``|s1 - s2|`` and
    ``s1 * s2``. They pass through one activated projection, an optional
    highway-style gate, dropout, and a final projection with size argument 2.

    :param s1_rep: representation of the first sentence
    :param s2_rep: representation of the second sentence
    :param afn: activation name resolved through `act_name2fn`
    :param n_state: size argument of the hidden projections
    :param is_train: forwarded to `conv1d` and `dropout`
    :param clf_dropout: forwarded to `dropout` before the logits projection
    :param highway: if true, mix the hidden state with a gated transform of itself
    :return: output of the ``'c_logits'`` projection
    """
    # TODO: change this to my style (bn_dense_layer)
    abs_diff = tf.abs(s1_rep - s2_rep)
    product = s1_rep * s2_rep
    pair_features = tf.concat([abs_diff, product], -1)

    activation = act_name2fn(afn)
    hidden = activation(conv1d(pair_features, 'c_fc', n_state, 1, train=is_train))

    if highway:
        candidate = conv1d(hidden, 'c_trans', n_state, 1, train=is_train)
        gate_logits = conv1d(hidden, 'c_gate', n_state, 1, train=is_train)
        gate = tf.nn.sigmoid(gate_logits)
        hidden = gate * candidate + (1 - gate) * hidden

    dropped = dropout(hidden, clf_dropout, is_train)
    logits = conv1d(dropped, 'c_logits', 2, 1, train=is_train)
    return logits
Exemple #4
0
def multi_head(x, scope, n_state, n_head, train=None, scale=False, resid_dropout=0.9, attn_dropout=0.9):
    """
    Multi-head self-attention built on `_attn`.

    :param x: input tensor
    :param scope: variable scope name
    :param n_state: width of the q/k/v and output projections, must be divisible by ``n_head``
    :param n_head: number of attention heads
    :param train: forwarded to `conv1d`, `_attn` and `dropout`
    :param scale: forwarded to `_attn`
    :param resid_dropout: forwarded to `dropout` for the projected output
    :param attn_dropout: forwarded to `_attn`
    :return: the attention output after the final projection and dropout
    """
    assert n_state % n_head == 0
    with tf.variable_scope(scope):
        # One position-wise projection yields queries, keys and values together.
        qkv = conv1d(x, 'c_attn_openai_trans', n_state * 3, 1, train=train)
        query, key, value = tf.split(qkv, 3, 2)

        query = split_heads(query, n_head)
        key = split_heads(key, n_head, k=True)
        value = split_heads(value, n_head)

        context = _attn(query, key, value, train=train, scale=scale, attn_dropout=attn_dropout)
        merged = merge_heads(context)

        projected = conv1d(merged, 'c_proj_openai_trans', n_state, 1, train=train)
        return dropout(projected, resid_dropout, train)
Exemple #5
0
def _attn(q, k, v, train=None, scale=False, attn_dropout=0.9):
    """
    Masked softmax attention over already-split heads.

    :param q: queries
    :param k: keys, laid out so that ``tf.matmul(q, k)`` yields the score matrix
    :param v: values
    :param train: forwarded to `dropout`
    :param scale: if true, scores are multiplied by rsqrt of the last dim of ``v``
    :param attn_dropout: forwarded to `dropout` for the attention weights
    :return: ``tf.matmul`` of the dropped-out attention weights with ``v``
    """
    scores = tf.matmul(q, k)

    if scale:
        head_width = shape_list(v)[-1]
        scores = scores * tf.rsqrt(tf.cast(head_width, tf.float32))

    # The original author notes that this mask makes the attention uni-directional.
    masked_scores = mask_attn_weights(scores)
    probs = tf.nn.softmax(masked_scores)
    probs = dropout(probs, attn_dropout, train)  # attention dropout

    return tf.matmul(probs, v)
Exemple #6
0
def s2t_self_attn(
        tensor_input, tensor_mask, deep_act=None, method='multi_dim',
        wd=0., keep_prob=1., is_training=None,
        scope=None, **kwargs
):
    """
    Source-to-token self-attention: pool a sequence into one vector.

    Alignment scores are computed from ``tensor_input`` by one dense layer, or
    by two when ``deep_act`` is a string. They are masked, soft-maxed over
    axis -2, and used to weight ``tensor_input`` before summing over that axis.

    :param tensor_input: input sequence, [bs,sl,hn] per the inline shape notes
    :param tensor_mask: mask passed to `exp_mask_v3` and `mask_v3`
    :param deep_act: activation name for the two-layer variant; any non-string selects the single-layer variant
    :param method: ``'additive'``, ``'multi_dim'`` or ``'multi_dim_head'``
    :param wd: forwarded to the dense layers
    :param keep_prob: forwarded to the dense layers
    :param is_training: forwarded to the dense layers and to `dropout`
    :param scope: variable scope name; defaults to ``'s2t_self_attn_<method>'``
    :param kwargs: ``head_num`` (int, required for ``'multi_dim_head'``) and
        ``attn_keep_prob`` (float, optional dropout on the attention probabilities)
    :return: pooled tensor, [bs,hn]
    :raises AttributeError: if ``method`` is not one of the supported names
    """
    two_layer = isinstance(deep_act, str)  # two layers or a single layer for the alignment score
    scope_name = scope or 's2t_self_attn_{}'.format(method)

    with tf.variable_scope(scope_name):
        hn = get_shape_list(tensor_input)[-1]  # hidden state number

        if method == 'additive':
            first_width = hn if two_layer else 1
            align_scores = bn_dense_layer_v2(  # bs,sl,hn/1
                tensor_input, first_width, True, 0., 'align_score_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if two_layer:
                activated = act_name2fn(deep_act)(align_scores)
                align_scores = bn_dense_layer_v2(  # bs,sl,1
                    activated, 1, True, 0., 'align_score_2', 'linear', False,
                    wd, keep_prob, is_training
                )
        elif method == 'multi_dim':
            align_scores = bn_dense_layer_v2(  # bs,sl,hn
                tensor_input, hn, False, 0., 'align_score_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if two_layer:
                activated = act_name2fn(deep_act)(align_scores)
                align_scores = bn_dense_layer_v2(  # bs,sl,hn
                    activated, hn, True, 0., 'align_score_2', 'linear', False,
                    wd, keep_prob, is_training
                )
        elif method == 'multi_dim_head':
            get_shape_list(tensor_input, expected_rank=3)  # the input should be rank-3
            assert 'head_num' in kwargs and isinstance(kwargs['head_num'], int)
            head_num = kwargs['head_num']
            assert hn % head_num == 0
            head_dim = hn // head_num

            input_heads = split_head(tensor_input, head_num)  # [bs,hd,sl,hd_dim]

            scores_heads = bn_dense_layer_multi_head(  # [bs,hd,sl,hd_dim]
                input_heads, head_dim, True, 0., 'align_scores_heads_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if two_layer:
                activated_heads = act_name2fn(deep_act)(scores_heads)
                scores_heads = bn_dense_layer_multi_head(  # [bs,hd,sl,hd_dim]
                    activated_heads, head_dim,
                    True, 0., 'align_scores_heads_2', 'linear', False,
                    wd, keep_prob, is_training
                )
            align_scores = combine_head(scores_heads)  # [bs,sl,dim]
        else:
            raise AttributeError

        # attention procedure, align_scores is [bs,sl,1/dim]
        masked_scores = exp_mask_v3(align_scores, tensor_mask, multi_head=False, high_dim=True)  # bs,sl,hn
        attn_prob = tf.nn.softmax(masked_scores, axis=-2)  # bs,sl,hn

        # Dropout on the probabilities is applied only for an explicit float.
        attn_keep_prob = kwargs.get('attn_keep_prob')
        if isinstance(attn_keep_prob, float):
            attn_prob = dropout(attn_prob, attn_keep_prob, is_training)  # bs,sl,hn

        weighted = mask_v3(attn_prob*tensor_input, tensor_mask, high_dim=True)
        return tf.reduce_sum(weighted, axis=-2)  # [bs,sl,hn] -> [bs,hn]
Exemple #7
0
def cond_attn(
        pairwise_scores, featurewise_scores, value_features, from_mask, to_mask,
        attn_keep_prob=1., is_training=None,
        extra_pairwise_mask=None, name=None
):
    """
    Attention that combines pairwise and feature-wise scores.

    Both score tensors are exponentiated and masked. The normaliser is the
    matmul of the two; the numerator is the matmul of the pairwise term with
    the value features times the feature-wise term. The result is their ratio.

    :param pairwise_scores: [bs,[head],slf,slt]
    :param featurewise_scores:  [bs,[head],slt,hn]
    :param value_features:  [bs,[head],slt,hn]
    :param from_mask: mask of the "from" sequence, passed to `cross_attn_mask_generation`
    :param to_mask: mask of the "to" sequence, passed to `cross_attn_mask_generation` and `mask_v3`
    :param attn_keep_prob: its square root is forwarded to `dropout` for each of the two factors
    :param is_training: forwarded to `dropout`
    :param extra_pairwise_mask: optional mask and-ed with the cross mask; rank 3
        ([bs,slf,slt]), or rank 4 when the scores carry a head dimension
    :param name: name scope, defaults to ``'cond_attn'``
    :return: [bs,slf,hn], or [bs,slf,hd_num*hd_dim] for multi-head inputs
    """
    with tf.name_scope(name or 'cond_attn'):
        # sanity check
        pairwise_shape = get_shape_list(pairwise_scores)
        featurewise_shape = get_shape_list(featurewise_scores)
        value_shape = get_shape_list(value_features)

        pairwise_ndim = len(pairwise_shape)
        featurewise_ndim = len(featurewise_shape)
        value_ndim = len(value_shape)

        assert featurewise_shape[-1] == value_shape[-1]
        assert pairwise_ndim in [3, 4] and pairwise_ndim == featurewise_ndim and featurewise_ndim == value_ndim

        multi_head = pairwise_ndim == 4  # rank 4 means a head dimension is present

        cross_attn_mask = cross_attn_mask_generation(  # [bs,slf,slt]
            from_mask, to_mask, mutual=True
        )

        if multi_head:  # add the multi-head dim
            cross_attn_mask = tf.expand_dims(cross_attn_mask, 1)  # [bs,[1],slf,slt]

        if extra_pairwise_mask is not None:
            # the extra_pairwise_mask may or may not include the multi-head dim
            extra_pairwise_mask_shape = get_shape_list(extra_pairwise_mask)
            assert len(extra_pairwise_mask_shape) in [3, 4]

            assert multi_head or len(extra_pairwise_mask_shape) == 3  # if multi_head=False, shape must be 3-D

            if multi_head and len(extra_pairwise_mask_shape) == 3:
                # Give the caller's own mask a head dim so it broadcasts
                # against the rank-4 cross mask.
                extra_pairwise_mask = tf.expand_dims(extra_pairwise_mask, 1)  # [bs,[1],slf,slt]

            cross_attn_mask = tf.logical_and(cross_attn_mask, extra_pairwise_mask)  # [bs,[1],slf,slt]

        e_dot_logits = mask_v3(  # bs,head,sl1,sl2
            tf.exp(pairwise_scores), cross_attn_mask, multi_head=False, high_dim=False)  # head dim already added

        e_multi_logits = mask_v3(
            tf.exp(featurewise_scores), to_mask, multi_head=multi_head, high_dim=True
        )

        with tf.name_scope("hybrid_attn"):
            # Z: softmax normalization term in attention probabilities calculation
            accum_z_deno = tf.matmul(e_dot_logits, e_multi_logits)  # bs,[head],slf,dim
            accum_z_deno = tf.where(  # in case of NaN and Inf
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno)
            )
            # attention dropout, applied after the normaliser has been computed
            e_dot_logits = dropout(e_dot_logits, math.sqrt(attn_keep_prob), is_training)
            e_multi_logits = dropout(e_multi_logits, math.sqrt(attn_keep_prob), is_training)
            # sum of exp(logits) times the attention target sequence
            rep_mul_score = value_features * e_multi_logits
            accum_rep_mul_score = tf.matmul(e_dot_logits, rep_mul_score)
            # calculate the final attention results
            attn_res = accum_rep_mul_score / accum_z_deno

        if multi_head:
            attn_res = combine_head(attn_res)  # [bs,slf,hd_num*hd_dim]

    return attn_res  # [bs,slf,hn/hd_num*hd_dim]
Exemple #8
0
def multihead_attention_decoder(
    tensor_from,
    tensor_to,
    mask_to,
    mask_direction=None,  # [bs,slf,slt]
    act_name="relu",
    hn=768,
    head_num=12,
    wd=0.,
    is_training=None,
    keep_prob_dense=1.,
    keep_prob_attn=1.,
    tensor_to_prev=None,
    mask_prev_to=None,
    scope=None,
):
    """
    Multi-head attention from ``tensor_from`` onto ``tensor_to``.

    When ``tensor_to_prev`` is given it is concatenated in front of
    ``tensor_to`` along axis -2, and the masks are concatenated to match, so
    the attention covers both.

    :param tensor_from: query-side tensor, [bs,slf,hn] per the inline shape notes
    :param tensor_to: key/value-side tensor, [bs,sl,hn]
    :param mask_to: mask for ``tensor_to``, [bs,sl]
    :param mask_direction: optional [bs,slf,slt] mask and-ed with the "to" mask;
        only applied when ``tensor_to_prev`` is None
    :param act_name: activation name of the output dense layer
    :param hn: hidden size
    :param head_num: number of attention heads
    :param wd: forwarded to the dense layers and `compatibility_fn`
    :param is_training: forwarded to the dense layers, `compatibility_fn` and `dropout`
    :param keep_prob_dense: forwarded to the dense layers and `compatibility_fn`
    :param keep_prob_attn: forwarded to `dropout` for the attention probabilities
    :param tensor_to_prev: optional earlier "to" states, [bs,psl,hn]
    :param mask_prev_to: mask for ``tensor_to_prev``; all-true if omitted
    :param scope: variable scope name, defaults to ``"multihead_attention_decoder"``
    :return: output of the final dense layer
    """
    head_dim = hn // head_num
    with tf.variable_scope(scope or "multihead_attention_decoder"):
        if tensor_to_prev is None:
            tensor_to_all = tensor_to  # bs,sl,hn
            mask_to_all = mask_to  # bs,sl
        else:
            tensor_to_all = tf.concat([tensor_to_prev, tensor_to],
                                      -2)  # bs,psl+1,hn
            if mask_prev_to is None:
                # No mask supplied for the previous states: treat them all as valid.
                mask_prev_to = tf.cast(
                    tf.ones(get_shape_list(tensor_to_prev, 3)[:2], tf.int32),
                    tf.bool)  # bs,psl
            mask_to_all = tf.concat([mask_prev_to, mask_to], -1)  # bs,psl+1

        attn_scores = compatibility_fn(
            tensor_from,
            tensor_to_all,
            method="multi_head",
            head_num=head_num,
            hn=hn,
            wd=wd,
            is_training=is_training,
            keep_prob=keep_prob_dense,
        )  # [bs,hd_num,slf,slt]
        v_heads = bn_dense_layer_v2(  # bs,slt,hd_dim * hd_num
            tensor_to_all,
            head_dim,
            True,
            0.,
            'v_heads',
            'linear',
            False,
            wd,
            keep_prob_dense,
            is_training,
            dup_num=head_num)
        v_heads = split_head(v_heads, head_num)  # bs,hd_num,slt,hd_dim

        # mask the self-attention scores
        attn_scores_mask = tf.expand_dims(mask_to_all, 1)  # bs,1,tsl
        if mask_direction is not None and tensor_to_prev is None:
            attn_scores_mask = tf.logical_and(attn_scores_mask,
                                              mask_direction)  # bs,tsl,tsl
        attn_scores_masked = exp_mask_v3(
            attn_scores, attn_scores_mask,
            multi_head=True)  # [bs,hd_num,slf,slt]
        attn_prob = tf.nn.softmax(attn_scores_masked)
        attn_prob = dropout(attn_prob, keep_prob_attn,
                            is_training)  # [bs,hd_num,slf,slt]

        # Weighted sum of the values over the "to" axis. A batched matmul
        # gives the same result as broadcasting to [bs,hd_num,slf,slt,hd_dim]
        # and reducing, without materialising that 5-D intermediate.
        attn_res = tf.matmul(attn_prob, v_heads)  # bs,hd_num,slf,hd_dim
        out_prev = combine_head(attn_res)  # bs,fsl,hn

        # dense layer
        out = bn_dense_layer_v2(out_prev, hn, True, 0., "output_transformer",
                                act_name, False, wd, keep_prob_dense,
                                is_training)
        return out