Example #1
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs = self.bs

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl1,tel

        with tf.variable_scope('sent_encoding'):
            rep = sentence_encoding_models(
                emb, self.token_mask, cfg.context_fusion_method, 'relu',
                'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
                block_len=cfg.block_len)

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(linear([rep], hn, True, scope='pre_logits_linear',
                                           wd=cfg.wd, input_keep_prob=cfg.dropout,
                                           is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits], self.output_class, False, scope='get_output',
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train) # bs, 5
        _logger.done()
        return logits
Example #2
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
            self.tensor_dict['emb'] = emb

        rep = disan(
            emb, self.token_mask, 'DiSAN', cfg.dropout,
            self.is_train, cfg.wd, 'relu', tensor_dict=self.tensor_dict, name='')

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(linear([rep], hn, True, scope='pre_logits_linear',
                                          wd=cfg.wd, input_keep_prob=cfg.dropout,
                                          is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits], self.output_class, False, scope='get_output',
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train) # bs, 5
        _logger.done()
        return logits
Example #3
def fusion_gate(rep1, rep2, wd, keep_prob, is_train):
    ivec = rep1.get_shape().as_list()[1]
    with tf.variable_scope('output'):
        o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                 tf.constant_initializer(0.))
        # input gate
        fusion_g = tf.nn.sigmoid(
            linear(rep1, ivec, True, 0., 'linear_fusion_i', False, wd,
                   keep_prob, is_train) +
            linear(rep2, ivec, True, 0., 'linear_fusion_a', False, wd,
                   keep_prob, is_train) + o_bias)
        output = fusion_g * rep1 + (1 - fusion_g) * rep2
    return output
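The fusion gate above is an element-wise convex mix of its two inputs. Below is a standalone NumPy sketch of the same arithmetic; the random weight matrices merely stand in for the repository's `linear` projections and are not its actual parameters.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
rep1 = rng.randn(4, 8)                       # bs, ivec
rep2 = rng.randn(4, 8)                       # bs, ivec
W_i, W_a = rng.randn(8, 8), rng.randn(8, 8)  # stand-ins for the two linear maps
o_bias = np.zeros(8)

fusion_g = sigmoid(rep1 @ W_i + rep2 @ W_a + o_bias)  # bs, ivec, values in (0, 1)
output = fusion_g * rep1 + (1 - fusion_g) * rep2      # element-wise convex mix
assert output.shape == (4, 8)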
Example #4
    def do_shift(self, data_for_shift):
        hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
        with tf.variable_scope('sr_%s' % self.method_type):
            I = tf.nn.sigmoid(
                linear([data_for_shift], hn, True, 0., 'W_i_0', False, 0.,
                       dropout, is_train))
            O = tf.nn.sigmoid(
                linear([data_for_shift], hn, True, 0., 'W_o_0', False, 0.,
                       dropout, is_train))
            U = tf.nn.tanh(
                linear([data_for_shift], hn, True, 0., 'W_u_0', False, 0.,
                       dropout, is_train))
            C = I * U  # bs, hn
            H = O * tf.nn.tanh(C)  # bs, hn
            return tf.concat([H, C], -1)  # bs, hn*2
Example #5
def normal_attention(rep_tensor,
                     rep_mask,
                     scope=None,
                     keep_prob=1.,
                     is_train=None,
                     wd=0.,
                     activation='elu',
                     tensor_dict=None,
                     name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0.,
                                        'bn_dense_map', activation, False, wd,
                                        keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map],
                                       None,
                                       False,
                                       scope='self_attn_logits',
                                       mask=rep_mask,
                                       input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits,
                              rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_tensor_map + (1 -
                                                     fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec
        return output
Example #6
    def do_reduce(self, data_for_reduce, mask_for_reduce):
        hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
        with tf.variable_scope('sr_%s' % self.method_type):
            left_child_hid = data_for_reduce[:, 0, :hn]
            left_child_cell = data_for_reduce[:, 0, hn:]

            right_child_hid = data_for_reduce[:, 1, :hn]
            right_child_cell = data_for_reduce[:, 1, hn:]

            # LSTM update
            I = tf.nn.sigmoid(
                linear([left_child_hid], hn, False, 0., 'W_i_l', False, 0.,
                       dropout, is_train) +
                linear([right_child_hid], hn, True, 0., 'W_i_r', False, 0.,
                       dropout, is_train))

            F_l = tf.nn.sigmoid(
                linear([left_child_hid], hn, False, 0., 'W_f_l_l', False, 0.,
                       dropout, is_train) +
                linear([right_child_hid], hn, True, 0., 'W_f_l_r', False, 0.,
                       dropout, is_train))

            F_r = tf.nn.sigmoid(
                linear([left_child_hid], hn, False, 0., 'W_f_r_l', False, 0.,
                       dropout, is_train) +
                linear([right_child_hid], hn, True, 0., 'W_f_r_r', False, 0.,
                       dropout, is_train))

            O = tf.nn.sigmoid(
                linear([left_child_hid], hn, False, 0., 'W_o_l', False, 0.,
                       dropout, is_train) +
                linear([right_child_hid], hn, True, 0., 'W_o_r', False, 0.,
                       dropout, is_train))

            U = tf.nn.tanh(
                linear([left_child_hid], hn, False, 0., 'W_u_l', False, 0.,
                       dropout, is_train) +
                linear([right_child_hid], hn, True, 0., 'W_u_r', False, 0.,
                       dropout, is_train))

            C = I * U + F_l * left_child_cell + F_r * right_child_cell
            H = O * tf.nn.tanh(C)
            return tf.concat([H, C], -1)
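For reference, the reduce step above is a binary tree-LSTM cell update: C = I*U + F_l*c_left + F_r*c_right and H = O*tanh(C). A minimal NumPy sketch of just that update, with random gate values standing in for the `linear` projections:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
hn = 4
I, O, F_l, F_r = (sigmoid(rng.randn(hn)) for _ in range(4))
U = np.tanh(rng.randn(hn))
c_left, c_right = rng.randn(hn), rng.randn(hn)

C = I * U + F_l * c_left + F_r * c_right  # cell state combines both children
H = O * np.tanh(C)                        # hidden state
print(np.concatenate([H, C], -1).shape)   # (2 * hn,) -> (8,)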
Example #7
    def do_shift(self, data_for_shift):
        hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
        with tf.variable_scope('sr_%s' % self.method_type):
            print('var num in (2.1) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            I = tf.nn.sigmoid(
                linear([data_for_shift], hn, False, 0., 'W_i_0', False, 0.,
                       dropout, is_train) + self.bias_I)
            O = tf.nn.sigmoid(
                linear([data_for_shift], hn, False, 0., 'W_o_0', False, 0.,
                       dropout, is_train) + self.bias_O)
            U = tf.nn.tanh(
                linear([data_for_shift], hn, False, 0., 'W_u_0', False, 0.,
                       dropout, is_train) + self.bias_U)
            print('var num in (2.2) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            C = I * U  # bs, hn
            H = O * tf.nn.tanh(C)  # bs, hn
            return tf.concat([H, C], -1)  # bs, 2*hn
Example #8
    def do_shift(self, data_for_shift):
        with tf.variable_scope('sr_%s' % self.method_type):
            shifted_value = tf.nn.relu(
                linear([data_for_shift],
                       self.hn,
                       True,
                       0.,
                       'shift_linear',
                       False,
                       input_keep_prob=self.dropout,
                       is_train=self.is_train))
            return shifted_value
Example #9
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('sent_enc_attn'):
            s1_rep = traditional_attention(
                s1_emb, self.sent1_token_mask, 'traditional_attention',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s1_attn')
            tf.get_variable_scope().reuse_variables()
            s2_rep = traditional_attention(
                s2_emb, self.sent2_token_mask, 'traditional_attention',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s2_attn')

            self.tensor_dict['s1_rep'] = s1_rep
            self.tensor_dict['s2_rep'] = s2_rep

        with tf.variable_scope('output'):
            out_rep = tf.concat([s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
            pre_output = tf.nn.elu(linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                                          wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train))
            logits = linear([pre_output], self.output_class, True, 0., scope='logits', squeeze=False,
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train)
            self.tensor_dict['logits'] = logits
        return logits
Example #10
def self_attention_for_selected_head(
        head_selection, head_org_idx, sl_head, rep_head_mask,
        dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
        rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
):
    # data for self-attention
    rep_map_dp = dropout(rep_map, keep_prob, is_train)
    rep_dep_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, dep_selection)
    rep_head_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, head_selection)

    # mask generation
    dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_head, 1])
    head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, sl_dep])

    if direction is None:
        direct_mask = tf.not_equal(head_idxs, dep_idxs)  # [bs, slh, sld]
    else:
        if direction == 'forward':
            direct_mask = tf.greater(head_idxs, dep_idxs)  # [bs, slh, sld]
        else:
            direct_mask = tf.less(head_idxs, dep_idxs)  # [bs, slh, sld]
    # [bs, slh, slh]
    rep_mask_tile = tf.logical_and(tf.expand_dims(rep_dep_mask, 1), tf.expand_dims(rep_head_mask, 2))
    attn_mask = tf.logical_and(direct_mask, rep_mask_tile)  # [bs, slh, sld]

    # tensor tile
    rep_map_tile = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_head, 1, 1])  # bs,slh,sld,vec
    with tf.variable_scope('attention'):  # bs,sl,sl,vec
        f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
        dependent = linear(rep_dep_tensor_dp, ivec, False, scope='linear_dependent')  # bs,sld,vec
        dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sld,vec
        head = linear(rep_head_tensor_dp, ivec, False, scope='linear_head')  # bs,slh,vec
        head_etd = tf.expand_dims(head, 2)  # bs,slh,1,vec

        logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,slh,sld,vec
        logits_masked = exp_mask_for_high_rank(logits, attn_mask)  # bs,slh,sld,vec
        attn_score = tf.nn.softmax(logits_masked, 2)  # bs,slh,sld,vec
        attn_score = mask_for_high_rank(attn_score, attn_mask)
        attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,slh,vec -> head_org_idx
    return attn_result
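The direction mask above is built purely from the tokens' original indices: with 'forward', a head at position i may only attend to dependents at earlier positions j < i. A small NumPy sketch of that comparison (the index arrays here are made up for illustration):

import numpy as np

head_org_idx = np.array([[2, 4]])           # bs=1, sl_head=2
dep_org_idx = np.array([[0, 1, 2, 3, 4]])   # bs=1, sl_dep=5
head_idxs = head_org_idx[:, :, None]        # bs, slh, 1
dep_idxs = dep_org_idx[:, None, :]          # bs, 1, sld
forward_mask = head_idxs > dep_idxs         # bs, slh, sld: True where dep precedes head
print(forward_mask[0].astype(int))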
Example #11
    def do_reduce(self, data_for_reduce, mask_for_reduce):
        with tf.variable_scope('sr_%s' % self.method_type):
            data_for_reduce_re = tf.reshape(data_for_reduce, [-1, 2 * self.hn])
            reduced_value = tf.nn.relu(
                linear([data_for_reduce_re],
                       self.hn,
                       True,
                       0.,
                       'reduce_linear',
                       False,
                       input_keep_prob=self.dropout,
                       is_train=self.is_train))
            return reduced_value
Example #12
def gene_similarity_mat_and_mask(tensor_row, tensor_col,
                                 mask_for_tensor_row,
                                 mask_for_tensor_col,
                                 similarity_method='inner', hn=100, scope = None):
    with tf.variable_scope(scope or 'gene_similarity_mat_and_mask'):
        # --------parameters--------
        t_main = tensor_row  # [bs,sl,vec]
        t_sec = tensor_col  # [bs,ql,vec]
        mask_main = mask_for_tensor_row  # [bs,sl]
        mask_sec = mask_for_tensor_col  # [bs,ql]

        bs, sl, vec = tf.shape(t_main)[0], tf.shape(t_main)[1], tf.shape(t_main)[2]
        ql = tf.shape(t_sec)[1]
        # -------------------------------
        # --------similarity_mat--------
        mask_main_etd = tf.expand_dims(mask_main, 2)  # bs,sl,1
        mask_sec_etd = tf.expand_dims(mask_sec, 1)  # bs,1,ql
        mask_similarity_mat = tf.logical_and(mask_main_etd, mask_sec_etd)  # bs,sl,ql
        if similarity_method == 'inner':
            t_main_etd = tf.expand_dims(t_main, 2)  # bs,sl,1,vec
            t_sec_etd = tf.expand_dims(t_sec, 1)  # bs,1,ql,vec
            similarity_mat = tf.reduce_sum(t_main_etd*t_sec_etd, -1)  # bs,sl,ql
        elif similarity_method == 'tri_linear':
            t_main_tiled = tf.tile(tf.expand_dims(t_main, 2), [1, 1, ql, 1])  # bs,sl,ql,vec
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            similarity_mat = get_logits([t_main_tiled, t_sec_tiled], None, False,
                                        scope='tri_linear_tri_linear', func='tri_linear')
        elif similarity_method == 'map_linear':
            t_main_map = tf.nn.relu(linear([t_main], hn, True, scope='linear_map_main'))
            t_sec_map = tf.nn.relu(linear([t_sec], hn, True, scope='linear_map_sec'))
            t_main_map_etd = tf.expand_dims(t_main_map, 2)  # bs,sl,1,hn
            t_sec_map_etd = tf.expand_dims(t_sec_map, 1)  # bs,1,ql,hn
            similarity_mat = tf.reduce_sum(t_main_map_etd * t_sec_map_etd, -1)  # bs,sl,ql
        else:
            raise AttributeError('No similarity matrix calculation method \'%s\'' % similarity_method)

        return similarity_mat, mask_similarity_mat
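A standalone NumPy sketch of the 'inner' branch above: the similarity matrix is a batched dot product between every row token and every column token, and the mask is the logical AND of the two sequence masks (shapes here are made up):

import numpy as np

bs, sl, ql, vec = 2, 3, 4, 5
rng = np.random.RandomState(0)
t_main = rng.randn(bs, sl, vec)
t_sec = rng.randn(bs, ql, vec)
mask_main = np.array([[True, True, False], [True, True, True]])  # bs, sl
mask_sec = np.ones((bs, ql), dtype=bool)                          # bs, ql

similarity_mat = (t_main[:, :, None, :] * t_sec[:, None, :, :]).sum(-1)  # bs,sl,ql
mask_similarity_mat = mask_main[:, :, None] & mask_sec[:, None, :]       # bs,sl,ql
assert similarity_mat.shape == mask_similarity_mat.shape == (bs, sl, ql)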
Example #13
def self_align_attention(rep_tensor, mask, scope=None, simplify=True, hn=None):  # correct
    """
    attention strategy 4: self * self => attention self
    :param rep_tensor: rank is three [bs,sl,hn]
    :param mask: [bs,sl] tf.bool
    :param scope
    :param simplify:
    :return:  attended tensor [bs,sl,hn]
    """
    with tf.name_scope(scope or 'self_attention'):
        bs = tf.shape(rep_tensor)[0]
        sl = tf.shape(rep_tensor)[1]
        #vec = tf.shape(rep_tensor)[2]
        ivec = rep_tensor.get_shape().as_list()[-1]

        to_be_attended = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])
        if not simplify:
            assert hn is not None
            rep_tensor = tf.nn.relu(linear([rep_tensor], hn, True, 0., 'linear_transform'))
        # 1. self alignment
        mask_tiled_sec = tf.tile(tf.expand_dims(mask, 1), [1, sl, 1])  # bs,sl,sl
        mask_tiled_main = tf.tile(tf.expand_dims(mask, 2), [1, 1, sl])  # bs,sl,sl
        mask_tiled = tf.logical_and(mask_tiled_sec, mask_tiled_main)
        input_sec = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])  # bs,1-sl,sl,hn
        input_main = tf.tile(tf.expand_dims(rep_tensor, 2), [1, 1, sl, 1])  # bs,sl,1-sl,hn
        # self_alignment = tf.reduce_sum(input_sec * input_main, -1)  # bs,sl,sl
        self_alignment = (1.0 / ivec) * tf.reduce_sum(input_sec * input_main, -1)  # bs,sl,sl
        # 2. generate diag~/ mat
        # diag = tf.expand_dims(
        #     tf.cast(tf.logical_not(
        #         tf.cast(
        #             tf.diag(
        #                 tf.ones([sl], tf.int32)), tf.bool)
        #     ), tf.float32), 0)  # 1,sl,sl
        diag = tf.expand_dims(tf.logical_not(
                tf.cast(tf.diag(tf.ones([sl], tf.int32)), tf.bool)), 0)  # 1,sl,sl
        diag = tf.tile(diag, [bs, 1, 1])  # bs, sl, sl
        # self_alignment = self_alignment * diag  # bs,sl,sl
        # 3. attend data
        context = softsel(to_be_attended, self_alignment, tf.logical_and(mask_tiled, diag))  # [bs,sl,sl],  bs,sl,hn
        return context
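The diag tensor above removes self-alignment: it is True everywhere except on the diagonal, so a token never attends to itself. The same mask in two lines of NumPy:

import numpy as np

sl = 4
diag_mask = ~np.eye(sl, dtype=bool)   # sl, sl; False only where i == j
print(diag_mask.astype(int))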
Example #14
def self_choose_attention(rep_tensor, rep_mask, hn,  # correct
                          keep_prob=1., is_train=None, scope=None, simplify=False):
    """
    self soft choose attention with 
    :param rep_tensor: rank must be 3 [bs,sl,hn]
    :param rep_mask: [bs,sl]
    :param hn: 
    :param keep_prob: 
    :param is_train: 
    :param scope:
    :param simplify
    :return: 
    """
    with tf.variable_scope(scope or 'self_choose_attention'):
        if not simplify:
            rep_tensor_map = tf.nn.relu(linear([rep_tensor], hn, True, scope='linear_map',
                                        input_keep_prob=keep_prob, is_train=is_train))
        else:
            rep_tensor_map = tf.identity(rep_tensor)
        rep_tensor_logits = get_logits([rep_tensor_map], None, False, scope='self_attn_logits',
                                       mask=rep_mask, input_keep_prob=keep_prob, is_train=is_train)  # bs,sl
        attn_res = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec
        return attn_res
Example #15
def context_fusion_layers(rep_tensor,
                          rep_mask,
                          method,
                          activation_function,
                          scope=None,
                          wd=0.,
                          is_train=None,
                          keep_prob=1.,
                          **kwargs):
    method_name_list = [
        'lstm',
        'gru',
        'sru',
        'sru_normal',  # rnn
        'cnn',
        'multi_head',
        'multi_head_git',
        'disa',
        'mpsa',
        'block'
    ]
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    context_fusion_output = None
    with tf.variable_scope(scope or 'context_fusion_layers'):
        if method in ['lstm', 'gru', 'sru_normal']:
            context_fusion_output = contextual_bi_rnn(rep_tensor, rep_mask,
                                                      ivec, method, False, wd,
                                                      keep_prob, is_train,
                                                      'ct_bi_%s' % method)
        elif method == 'sru':
            context_fusion_output = bi_sru_recurrent_network(
                rep_tensor, rep_mask, is_train, keep_prob, wd, 'ct_bi_sru')
        elif method == 'cnn':
            context_fusion_output = cnn_for_context_fusion(
                rep_tensor, rep_mask, (3, 4, 5), 200, 'ct_cnn', is_train,
                keep_prob, wd)
        elif method == 'multi_head':
            context_fusion_output = multi_head_attention(
                rep_tensor, rep_mask, 8, 75, 'ct_multi_head', is_train,
                keep_prob, wd)
        elif method == 'multi_head_git':
            context_fusion_output = multi_head_attention_git(
                rep_tensor, rep_mask, 8, 600, 'ct_multi_head', is_train,
                keep_prob, wd)
        elif method == 'disa':
            with tf.variable_scope('ct_disa'):
                disa_fw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'forward', 'fw_disa', keep_prob,
                    is_train, wd, activation_function)
                disa_bw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'backward', 'bw_disa', keep_prob,
                    is_train, wd, activation_function)
                context_fusion_output = tf.concat([disa_fw, disa_bw], -1)
        elif method == 'block':
            if 'block_len' in kwargs.keys():
                block_len = kwargs['block_len']
            else:
                block_len = None
            if block_len is None:
                block_len = tf.cast(
                    tf.ceil(tf.pow(tf.cast(2 * sl, tf.float32), 1.0 / 3)),
                    tf.int32)
            context_fusion_output = bi_directional_simple_block_attention(
                rep_tensor, rep_mask, block_len, 'ct_block_attn', keep_prob,
                is_train, wd, activation_function)
        elif method == 'mpsa':
            with tf.variable_scope('ct_mpsa'):
                mpsa_fw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'forward', 'fw_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_bw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'backward', 'bw_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_2g = masked_positional_self_attention(
                    2, rep_tensor, rep_mask, None, '2g_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_3g = masked_positional_self_attention(
                    3, rep_tensor, rep_mask, None, '3g_mpsa', keep_prob,
                    is_train, wd, activation_function)
                sen_tensor = mask_for_high_rank(rep_tensor, rep_mask)
                sen_tensor_t = tf.expand_dims(sen_tensor, 2)
                fw_res = tf.expand_dims(mpsa_fw, 2)
                bw_res = tf.expand_dims(mpsa_bw, 2)
                g2_res = tf.expand_dims(mpsa_2g, 2)
                g3_res = tf.expand_dims(mpsa_3g, 2)
                tmp_res = tf.concat(
                    [sen_tensor_t, fw_res, bw_res, g2_res, g3_res],
                    2)  # bs,sl,5,ivec

                bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
                ivec = rep_tensor.get_shape()[2]
                num = tmp_res.get_shape()[2]
                bias = tf.get_variable('bias', [num * ivec], tf.float32,
                                       tf.constant_initializer(0.))
                softmax_gate = linear(sen_tensor, num * ivec, True, 0.,
                                      'linear_softmax', False, wd, keep_prob,
                                      is_train) + bias  # bs,sl,5*ivec
                fusion_gate = tf.nn.softmax(
                    tf.reshape(softmax_gate, [bs, sl, num, ivec]), 2)
                context_fusion_output = tf.reduce_sum(fusion_gate * tmp_res,
                                                      2)  # bs,sl,ivec
        else:
            raise RuntimeError

        return context_fusion_output
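When the 'block' method gets no block_len, the code above defaults it to ceil((2*sl)^(1/3)). A quick check of that formula in plain Python for a few sequence lengths:

import math

for sl in (10, 40, 160):
    block_len = math.ceil((2 * sl) ** (1.0 / 3))
    print(sl, block_len)   # 10 -> 3, 40 -> 5, 160 -> 7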
Example #16
def simple_block_attention(rep_tensor,
                           rep_mask,
                           block_len=5,
                           scope=None,
                           direction=None,
                           keep_prob=1.,
                           is_train=None,
                           wd=0.,
                           activation='elu',
                           hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(
                    tf.divide(tf.cast(sl, tf.float32),
                              tf.cast(block_len, tf.float32))), tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor,
                 tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat([
                rep_mask,
                tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)
            ], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp,
                [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp,
                                        [bs, block_num, block_len])  # bs,bn,bl

            # non-linear
            rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0.,
                                     'bn_dense_map', activation, False, wd,
                                     keep_prob, is_train)  # bs,bn,bl,vec
            rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2),
                                   [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
            # rep_map_dp = dropout(rep_map, keep_prob, is_train)
            bn = block_num
            bl = block_len

        with tf.variable_scope('self_attention'):
            # @2.self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0),
                [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2),
                                      [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3),
                                      [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile,
                                       rep_mask_tile,
                                       name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent_head = linear(rep_map, 2 * ivec, False, 0.,
                                    'linear_dependent_head', False, wd,
                                    keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score,
                                            attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                             3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(self_attn_result, ivec, True,
                                                0., 'bn_dense_map', 'linear',
                                                False, wd, keep_prob,
                                                is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked,
                                             2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32),
                                                   tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask,
                direction, 'disa', keep_prob, is_train, wd,
                activation)  # [bs,bn,vec]

            block_ct_res_tile = tf.tile(tf.expand_dims(
                block_ct_res, 2), [1, 1, bl, 1])  #[bs,bn,vec]->[bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input:1.rep_map[bs,bn,bl,vec]; 2.self_attn_result[bs,bn,bl,vec]; 3.rnn_res_tile[bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile],
                -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(rep_tensor_with_ct, 2 * ivec, True,
                                          0., 'linear_new_context_and_gate',
                                          False, wd, keep_prob,
                                          is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2,
                                         3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (
                1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
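A minimal NumPy sketch of the 'split_seq' step above: the sequence is zero-padded to a multiple of block_len and reshaped into bs, block_num, block_len, vec blocks (the sizes here are made up):

import numpy as np

bs, sl, vec, block_len = 2, 7, 4, 3
rep = np.random.randn(bs, sl, vec)
block_num = -(-sl // block_len)                # ceiling division -> 3
comp_len = block_num * block_len - sl          # 2 padding positions
rep_comp = np.concatenate([rep, np.zeros((bs, comp_len, vec))], axis=1)
rep_split = rep_comp.reshape(bs, block_num, block_len, vec)
assert rep_split.shape == (2, 3, 3, 4)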
Example #17
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(
                tds,
                tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('context_fusion'):
            s1_seq_rep = multi_mask_tensorized_self_attn(
                s1_emb,
                self.sent1_token_mask,
                hn=2 * hn,
                head_num=2,
                is_train=self.is_train,
                attn_keep_prob=1.,
                dense_keep_prob=cfg.dropout,
                wd=cfg.wd,
                use_direction=True,
                attn_self=False,
                use_fusion_gate=True,
                final_mask_ft=None,
                dot_activation_name='sigmoid',
                use_input_for_attn=False,
                add_layer_for_multi=True,
                activation_func_name='elu',
                apply_act_for_v=True,
                input_hn=None,
                output_hn=None,
                accelerate=False,
                merge_var=False,
                scope='multi_mask_tensorized_self_attn')

            tf.get_variable_scope().reuse_variables()

            s2_seq_rep = multi_mask_tensorized_self_attn(
                s2_emb,
                self.sent2_token_mask,
                hn=2 * hn,
                head_num=2,
                is_train=self.is_train,
                attn_keep_prob=1.,
                dense_keep_prob=cfg.dropout,
                wd=cfg.wd,
                use_direction=True,
                attn_self=False,
                use_fusion_gate=True,
                final_mask_ft=None,
                dot_activation_name='sigmoid',
                use_input_for_attn=False,
                add_layer_for_multi=True,
                activation_func_name='elu',
                apply_act_for_v=True,
                input_hn=None,
                output_hn=None,
                accelerate=False,
                merge_var=False,
                scope='multi_mask_tensorized_self_attn')

        with tf.variable_scope('compression'):
            s1_rep = multi_dimensional_attention(s1_seq_rep,
                                                 self.sent1_token_mask,
                                                 's2t_attn', cfg.dropout,
                                                 self.is_train, cfg.wd, 'elu')

            tf.get_variable_scope().reuse_variables()

            s2_rep = multi_dimensional_attention(s2_seq_rep,
                                                 self.sent2_token_mask,
                                                 's2t_attn', cfg.dropout,
                                                 self.is_train, cfg.wd, 'elu')

        with tf.variable_scope('output'):
            out_rep = tf.concat(
                [s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
            pre_output = tf.nn.elu(
                linear([out_rep],
                       hn,
                       True,
                       0.,
                       scope='pre_output',
                       squeeze=False,
                       wd=cfg.wd,
                       input_keep_prob=cfg.dropout,
                       is_train=self.is_train))
            pre_output1 = highway_net(pre_output, hn, True, 0., 'pre_output1',
                                      'elu', False, cfg.wd, cfg.dropout,
                                      self.is_train)
            logits = linear([pre_output1],
                            self.output_class,
                            True,
                            0.,
                            scope='logits',
                            squeeze=False,
                            wd=cfg.wd,
                            input_keep_prob=cfg.dropout,
                            is_train=self.is_train)
            self.tensor_dict['logits'] = logits
        return logits
Example #18
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(
                tds,
                tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat,
                                         self.token_seq)  # bs,sl,tel
            self.tensor_dict['emb'] = emb

        with tf.variable_scope('sent_encoding'):
            act_name = 'relu'
            seq_rep = multi_mask_tensorized_self_attn(
                emb,
                self.token_mask,
                hn=2 * hn,
                head_num=2,
                is_train=self.is_train,
                attn_keep_prob=1.,
                dense_keep_prob=cfg.dropout,
                wd=cfg.wd,
                use_direction=True,
                attn_self=False,
                use_fusion_gate=True,
                final_mask_ft=None,
                dot_activation_name='sigmoid',
                use_input_for_attn=False,
                add_layer_for_multi=True,
                activation_func_name=act_name,
                apply_act_for_v=True,
                input_hn=None,
                output_hn=None,
                accelerate=False,
                merge_var=False,
                scope='proposed_model')

            rep = multi_dim_souce2token_self_attn(seq_rep, self.token_mask,
                                                  's2t_self_attn', cfg.dropout,
                                                  self.is_train, cfg.wd,
                                                  act_name)

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(
                linear([rep],
                       hn,
                       True,
                       scope='pre_logits_linear',
                       wd=cfg.wd,
                       input_keep_prob=cfg.dropout,
                       is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits],
                            self.output_class,
                            False,
                            scope='get_output',
                            wd=cfg.wd,
                            input_keep_prob=cfg.dropout,
                            is_train=self.is_train)  # bs, 5
        _logger.done()
        return logits
Example #19
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1./scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head') # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                dis_mask = -tf.log(tf.cast(tf.abs(sl_col - sl_row) + 
                                           tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
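scaled_tanh, as defined above, keeps the attention logits in (-scale, scale) while staying roughly linear near zero. Shown standalone with NumPy:

import numpy as np

def scaled_tanh(x, scale=5.):
    return scale * np.tanh(x / scale)

print(scaled_tanh(np.array([-100., -1., 0., 1., 100.])))
# -> approximately [-5.0, -0.99, 0.0, 0.99, 5.0]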
Example #20
def directional_attention_with_selections(
        rep_tensor, rep_mask, dep_selection, head_selection, direction=None, hn=None, keep_unselected=True,
        scope=None, keep_prob=1., is_train=None, wd=0., activation='elu'):

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        # ensure the seletion is right
        dep_selection = tf.logical_and(rep_mask, dep_selection)
        head_selection = tf.logical_and(rep_mask, head_selection)
        rep_dep_tensor, rep_dep_mask, dep_org_idx = reduce_data_rep_max_len(rep_map, dep_selection)
        rep_head_tensor,rep_head_mask, head_org_idx = reduce_data_rep_max_len(rep_map, head_selection)
        sl_dep, sl_head = tf.shape(rep_dep_tensor)[1], tf.shape(rep_head_tensor)[1]

        if keep_unselected:
            unhead_selection = tf.logical_and(rep_mask, tf.logical_not(head_selection))
            rep_unhead_tensor, rep_unhead_mask, unhead_org_idx = reduce_data_rep_max_len(rep_map, unhead_selection)
            sl_unhead = tf.shape(rep_unhead_tensor)[1]

        attn_result = tf.cond(
            tf.equal(sl_head, 0),
            lambda: tf.zeros([bs, 0, hn], tf.float32),
            lambda: self_attention_for_selected_head(
                head_selection, head_org_idx, sl_head, rep_head_mask,
                dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
                rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
            )
        )

        if keep_unselected:
            input_idx = tf.tile(tf.expand_dims(tf.range(sl), 0), [bs, 1])
            pooling_result = tf.cond(
                tf.equal(sl_unhead, 0),
                lambda: tf.zeros([bs, 0, hn], tf.float32),
                lambda: mean_pooling_for_unselected_head(
                    unhead_org_idx, sl_unhead, rep_unhead_mask,
                    input_idx, sl, rep_mask, rep_map, None)  # todo: point !
            )

        with tf.variable_scope('output'):
            if keep_unselected:
                range_head = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_head])
                scatter_attn = tf.cond(
                    tf.equal(sl_head, 0),
                    lambda: tf.zeros([bs, sl+1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_head, head_org_idx], -1), attn_result, [bs, sl+1, hn])
                )

                range_unhead = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_unhead])
                scatter_pooling = tf.cond(
                    tf.equal(sl_unhead, 0),
                    lambda: tf.zeros([bs, sl+1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_unhead, unhead_org_idx], -1), pooling_result, [bs, sl+1, hn])
                )

                self_attn_input = rep_map
                context_features = tf.add(scatter_attn[:, :-1], scatter_pooling[:, :-1], 'context_features')
                output_mask = rep_mask
            else:
                self_attn_input = rep_head_tensor
                context_features = attn_result
                output_mask = rep_head_mask

            # context fusion gate
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            fusion_gate = tf.nn.sigmoid(
                linear(self_attn_input, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(context_features, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * self_attn_input + (1 - fusion_gate) * context_features

        return output, output_mask
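The scatter_nd calls above write the per-head attention results back to their original sequence positions; the extra slot at index sl (presumably reserved for padded positions) is dropped by the [:, :-1] slice. A tiny NumPy analogue of that restore step, with made-up indices and sizes:

import numpy as np

bs, sl, hn, sl_head = 1, 5, 2, 2
attn_result = np.arange(sl_head * hn, dtype=float).reshape(bs, sl_head, hn)
head_org_idx = np.array([[1, 3]])              # original positions of the selected heads
scatter_attn = np.zeros((bs, sl + 1, hn))
scatter_attn[0, head_org_idx[0]] = attn_result[0]
context_features = scatter_attn[:, :-1]        # bs, sl, hn
print(context_features[0])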
Example #21
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(
                tds,
                tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('sent_encoding'):
            act_func_str = 'elu' if cfg.context_fusion_method in [
                'block', 'disa'
            ] else 'relu'

            s1_rep = sentence_encoding_models(s1_emb,
                                              self.sent1_token_mask,
                                              cfg.context_fusion_method,
                                              act_func_str,
                                              'ct_based_sent2vec',
                                              cfg.wd,
                                              self.is_train,
                                              cfg.dropout,
                                              block_len=cfg.block_len)

            tf.get_variable_scope().reuse_variables()

            s2_rep = sentence_encoding_models(s2_emb,
                                              self.sent2_token_mask,
                                              cfg.context_fusion_method,
                                              act_func_str,
                                              'ct_based_sent2vec',
                                              cfg.wd,
                                              self.is_train,
                                              cfg.dropout,
                                              block_len=cfg.block_len)

            self.tensor_dict['s1_rep'] = s1_rep
            self.tensor_dict['s2_rep'] = s2_rep

        with tf.variable_scope('output'):
            act_func = tf.nn.elu if cfg.context_fusion_method in [
                'block', 'disa'
            ] else tf.nn.relu

            out_rep = tf.concat(
                [s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
            pre_output = act_func(
                linear([out_rep],
                       hn,
                       True,
                       0.,
                       scope='pre_output',
                       squeeze=False,
                       wd=cfg.wd,
                       input_keep_prob=cfg.dropout,
                       is_train=self.is_train))
            logits = linear([pre_output],
                            self.output_class,
                            True,
                            0.,
                            scope='logits',
                            squeeze=False,
                            wd=cfg.wd,
                            input_keep_prob=cfg.dropout,
                            is_train=self.is_train)
            self.tensor_dict['logits'] = logits
        return logits
Example #22
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(
                tds,
                tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat,
                                         self.token_seq)  # bs,sl,tel
            self.tensor_dict['emb'] = emb

        with tf.variable_scope('ct_attn'):
            rep_fw = directional_attention_with_dense(
                emb,
                self.token_mask,
                'forward',
                'dir_attn_fw',
                cfg.dropout,
                self.is_train,
                cfg.wd,
                'relu',
                tensor_dict=self.tensor_dict,
                name='fw_attn')
            rep_bw = directional_attention_with_dense(
                emb,
                self.token_mask,
                'backward',
                'dir_attn_bw',
                cfg.dropout,
                self.is_train,
                cfg.wd,
                'relu',
                tensor_dict=self.tensor_dict,
                name='bw_attn')

            seq_rep = tf.concat([rep_fw, rep_bw], -1)

        with tf.variable_scope('sent_enc_attn'):
            rep = multi_dimensional_attention(seq_rep,
                                              self.token_mask,
                                              'multi_dimensional_attention',
                                              cfg.dropout,
                                              self.is_train,
                                              cfg.wd,
                                              'relu',
                                              tensor_dict=self.tensor_dict,
                                              name='attn')

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(
                linear([rep],
                       hn,
                       True,
                       scope='pre_logits_linear',
                       wd=cfg.wd,
                       input_keep_prob=cfg.dropout,
                       is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits],
                            self.output_class,
                            False,
                            scope='get_output',
                            wd=cfg.wd,
                            input_keep_prob=cfg.dropout,
                            is_train=self.is_train)  # bs, 5
        _logger.done()
        return logits
Example #23
    def do_reduce(self, data_for_reduce, mask_for_reduce):
        hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
        mc = tf.shape(data_for_reduce)[1]
        with tf.variable_scope('sr_%s' % self.method_type):
            self_choose_attention(data_for_reduce, mask_for_reduce, hn,
                                  dropout, is_train, 'change_me')
            children_hid = data_for_reduce[:, :, :hn]
            children_cell = data_for_reduce[:, :, hn:]

            I = tf.nn.sigmoid(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_i')
                ], hn, True, 0., 'linear_i', False, 0., dropout, is_train))

            # children_hid: bs,mc,hn -> tiled to bs,mc,mc,hn (two ways), concat -> bs,mc,mc,2*hn
            children_hid_tile_1 = tf.tile(tf.expand_dims(children_hid, 1),
                                          [1, mc, 1, 1])  #
            children_hid_tile_2 = tf.tile(tf.expand_dims(children_hid, 2),
                                          [1, 1, mc, 1])  #
            children_hid_tile = tf.concat(
                [children_hid_tile_1, children_hid_tile_2],
                -1)  # bs,mc,mc,2* hn
            children_hid_tile_re = tf.reshape(
                children_hid_tile, [-1, mc, 2 * hn])  # bs*mc,mc,2* hn
            # # mask
            mask_tile_1 = tf.tile(tf.expand_dims(mask_for_reduce, 1),
                                  [1, mc, 1])
            mask_tile_2 = tf.tile(tf.expand_dims(mask_for_reduce, 2),
                                  [1, 1, mc])
            mask_tile = tf.logical_and(mask_tile_1, mask_tile_2)
            mask_tile_re = tf.reshape(mask_tile, [-1, mc])

            # bs*mc, 2* hn -linear-> bs*mc,hn -re-> bs,mc,hn
            F = tf.nn.sigmoid(
                tf.reshape(
                    linear([
                        self_choose_attention(children_hid_tile_re,
                                              mask_tile_re, 2 * hn, dropout,
                                              is_train, 'self_ch_f')
                    ], hn, True, 0., 'linear_f', False, 0., dropout, is_train),
                    [-1, mc, hn]))

            O = tf.nn.sigmoid(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_o')
                ], hn, True, 0., 'linear_o', False, 0., dropout, is_train))

            U = tf.nn.tanh(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_u')
                ], hn, True, 0., 'linear_u', False, 0., dropout, is_train))

            # children_cell * F--[bs,mc,hn]   mask_for_reduce [bs,mc]->[bs,mc,1]
            C = I * U + tf.reduce_sum(
                normal_mask(children_cell * F,
                            tf.expand_dims(mask_for_reduce, -1)), 1)
            H = O * tf.nn.tanh(C)

            return tf.concat([H, C], -1)
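
The reduce above is a Child-Sum-Tree-LSTM-style update: given an input gate I, per-child forget gates F, an output gate O and a candidate U, the new cell is C = I * U + sum_k F_k * C_k (summing only over real, unpadded children) and the new hidden state is H = O * tanh(C). A minimal NumPy sketch of just that cell arithmetic, assuming the gates have already been computed:

import numpy as np

def tree_lstm_reduce(I, F, O, U, children_cell, mask):
    """I, O, U: [bs, hn]; F, children_cell: [bs, mc, hn]; mask: [bs, mc] bool."""
    gated = F * children_cell * mask[..., None]   # zero out padded children
    C = I * U + gated.sum(axis=1)                 # new cell state, [bs, hn]
    H = O * np.tanh(C)                            # new hidden state, [bs, hn]
    return np.concatenate([H, C], axis=-1)        # [bs, 2*hn], matching the return above

bs, mc, hn = 2, 3, 4
g = np.random.default_rng(0)
I, O, U = (g.random((bs, hn)) for _ in range(3))
F, cells = g.random((bs, mc, hn)), g.random((bs, mc, hn))
mask = np.array([[True, True, False], [True, False, False]])
print(tree_lstm_reduce(I, F, O, U, cells, mask).shape)  # (2, 8)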
Ejemplo n.º 24
0
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)

        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(
                tds,
                tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(token_emb_mat,
                                            self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('hard_network'):
            # for sentence 1
            s1_emb_new = sequence_conditional_feature(s1_emb,
                                                      self.sent1_token_mask)
            s1_logpa_dep, s1_act_dep, s1_percentage_dep = generate_mask_with_rl(
                s1_emb_new, self.sent1_token_mask, False,
                'generate_mask_with_rl_dep', cfg.dropout, self.is_train,
                cfg.wd, 'relu', self.disable_rl, self.global_step, cfg.mode,
                cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
            s1_logpa_head, s1_act_head, s1_percentage_head = generate_mask_with_rl(
                s1_emb_new, self.sent1_token_mask, False,
                'generate_mask_with_rl_head', cfg.dropout, self.is_train,
                cfg.wd, 'relu', self.disable_rl, self.global_step, cfg.mode,
                cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
            s1_logpa = tf.concat([s1_logpa_dep, s1_logpa_head], -1)
            s1_act = tf.logical_and(tf.expand_dims(s1_act_dep, 1),
                                    tf.expand_dims(s1_act_head, 2))
            s1_percentage = s1_percentage_dep * s1_percentage_head

            tf.get_variable_scope().reuse_variables()
            # for sentence 2
            s2_emb_new = sequence_conditional_feature(s2_emb,
                                                      self.sent2_token_mask)
            s2_logpa_dep, s2_act_dep, s2_percentage_dep = generate_mask_with_rl(
                s2_emb_new, self.sent2_token_mask, False,
                'generate_mask_with_rl_dep', cfg.dropout, self.is_train,
                cfg.wd, 'relu', self.disable_rl, self.global_step, cfg.mode,
                cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
            s2_logpa_head, s2_act_head, s2_percentage_head = generate_mask_with_rl(
                s2_emb_new, self.sent2_token_mask, False,
                'generate_mask_with_rl_head', cfg.dropout, self.is_train,
                cfg.wd, 'relu', self.disable_rl, self.global_step, cfg.mode,
                cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
            s2_logpa = tf.concat([s2_logpa_dep, s2_logpa_head], -1)
            s2_act = tf.logical_and(tf.expand_dims(s2_act_dep, 1),
                                    tf.expand_dims(s2_act_head, 2))
            s2_percentage = s2_percentage_dep * s2_percentage_head

        keep_unselected = True
        with tf.variable_scope('ct_attn'):
            s1_fw, s1_token_mask_new = directional_attention_with_selections(
                s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head,
                'forward', hn, keep_unselected, 'dir_attn_fw', cfg.dropout,
                self.is_train, cfg.wd, 'relu')
            s1_bw, _ = directional_attention_with_selections(
                s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head,
                'backward', hn, keep_unselected, 'dir_attn_bw', cfg.dropout,
                self.is_train, cfg.wd, 'relu')

            s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

            tf.get_variable_scope().reuse_variables()

            s2_fw, s2_token_mask_new = directional_attention_with_selections(
                s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head,
                'forward', hn, keep_unselected, 'dir_attn_fw', cfg.dropout,
                self.is_train, cfg.wd, 'relu')
            s2_bw, _ = directional_attention_with_selections(
                s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head,
                'backward', hn, keep_unselected, 'dir_attn_bw', cfg.dropout,
                self.is_train, cfg.wd, 'relu')
            s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

        with tf.variable_scope('sentence_enc'):
            s1_rep = multi_dimensional_attention(s1_seq_rep,
                                                 s1_token_mask_new,
                                                 'multi_dimensional_attention',
                                                 cfg.dropout,
                                                 self.is_train,
                                                 cfg.wd,
                                                 'relu',
                                                 tensor_dict=self.tensor_dict,
                                                 name='s1_attn')
            tf.get_variable_scope().reuse_variables()
            s2_rep = multi_dimensional_attention(s2_seq_rep,
                                                 s2_token_mask_new,
                                                 'multi_dimensional_attention',
                                                 cfg.dropout,
                                                 self.is_train,
                                                 cfg.wd,
                                                 'relu',
                                                 tensor_dict=self.tensor_dict,
                                                 name='s2_attn')

        with tf.variable_scope('output'):
            out_rep = tf.concat([s1_rep * s2_rep, tf.abs(s1_rep - s2_rep)], -1)
            out_rep_map = bn_dense_layer(out_rep, hn, True, 0., 'out_rep_map',
                                         'relu', False, cfg.wd, cfg.dropout,
                                         self.is_train)
            if cfg.use_mse and cfg.mse_logits:
                logits = tf.nn.sigmoid(
                    linear(out_rep_map,
                           1,
                           True,
                           0.,
                           scope='logits',
                           squeeze=True,
                           wd=cfg.wd,
                           input_keep_prob=cfg.dropout,
                           is_train=self.is_train)) * 2. + 3.
            else:
                logits = linear([out_rep_map],
                                self.output_class,
                                True,
                                0.,
                                scope='logits',
                                squeeze=False,
                                wd=cfg.wd,
                                input_keep_prob=cfg.dropout,
                                is_train=self.is_train)
        return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), \
            (s1_percentage, s2_percentage)
Ejemplo n.º 25
0
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)

        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('hard_network'):
            # s1_act, s1_logpa, s2_act, s2_logpa, choose_percentage
            s1_act = self.sent1_token_mask
            s1_logpa = tf.cast(s1_act, tf.float32)

            s2_act = self.sent2_token_mask
            s2_logpa = tf.cast(s2_act, tf.float32)

            s1_percentage = tf.ones([bs], tf.float32)
            s2_percentage = tf.ones([bs], tf.float32)

        with tf.variable_scope('ct_attn'):
            s1_fw = directional_attention_with_dense(
                s1_emb, self.sent1_token_mask, 'forward', 'dir_attn_fw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s1_fw_attn')
            s1_bw = directional_attention_with_dense(
                s1_emb, self.sent1_token_mask, 'backward', 'dir_attn_bw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s1_bw_attn')

            s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

            tf.get_variable_scope().reuse_variables()

            s2_fw = directional_attention_with_dense(
                s2_emb, self.sent2_token_mask, 'forward', 'dir_attn_fw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s2_fw_attn')
            s2_bw = directional_attention_with_dense(
                s2_emb, self.sent2_token_mask, 'backward', 'dir_attn_bw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s2_bw_attn')
            s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

        with tf.variable_scope('sentence_enc'):
            s1_rep = multi_dimensional_attention(
                s1_seq_rep, self.sent1_token_mask, 'multi_dimensional_attention',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s1_attn')
            tf.get_variable_scope().reuse_variables()
            s2_rep = multi_dimensional_attention(
                s2_seq_rep, self.sent2_token_mask, 'multi_dimensional_attention',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name='s2_attn')

        with tf.variable_scope('output'):
            out_rep = tf.concat([s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
            out_rep_map = bn_dense_layer(
                out_rep, hn, True, 0., 'out_rep_map', 'elu', False, cfg.wd, cfg.dropout, self.is_train)
            pre_output1 = highway_network(
                out_rep_map, hn, True, 0., 'pre_output1', 'elu', False, cfg.wd, cfg.dropout, self.is_train)
            logits = linear([pre_output1], self.output_class, True, 0., scope='logits', squeeze=False,
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train)
        return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), (s1_percentage, s2_percentage)  # logits
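
The output block above combines the two sentence vectors with the usual matching features for pair classification: concat([s1, s2, s1 - s2, s1 * s2]). A tiny NumPy illustration of how the feature dimension grows (shapes only, no learned parameters):

import numpy as np

s1 = np.ones((2, 5))        # [bs, d] encoding of sentence 1
s2 = np.full((2, 5), 2.0)   # [bs, d] encoding of sentence 2
out_rep = np.concatenate([s1, s2, s1 - s2, s1 * s2], axis=-1)
print(out_rep.shape)        # (2, 20): four d-dimensional matching features side by side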
Ejemplo n.º 26
0
def normal_attention(tensor_base, tensor_to_attend,
                     mask_for_tensor_base,
                     mask_for_tensor_to_attend,
                     similarity_method='inner', hn=100,
                     use_pooling=False, pooling_method='max',
                     reverse=False, scope=None):
    """
    normal_attention for attention strategy 2 
    :param tensor_base: rank 3 [bs,sl,vec]
    :param tensor_to_attend: rank 3 [bs,ql,vec]
    :param mask_for_tensor_base: [bs,ql]
    :param mask_for_tensor_to_attend: [bs,sl]
    :param similarity_method: 'inner' 'tri_linear' 'map_linear'
    :param hn: some method need 
    :param use_pooling: True or False
    :param pooling_method: 'max' or 'mean'
    :param reverse: if use strategy 3
    :param scope: 
    :return: use_pooling==True: [bs,sl,hn] else [bs,hn]
    """
    with tf.variable_scope(scope or 'normal_attention'):
        # --------parameters--------
        t_main = tensor_base  # [bs,sl,vec]
        t_sec = tensor_to_attend  # [bs,ql,vec]
        mask_main = mask_for_tensor_base  # [bs,sl]
        mask_sec = mask_for_tensor_to_attend  # [bs,ql]

        bs, sl, vec = tf.shape(t_main)[0], tf.shape(t_main)[1], tf.shape(t_main)[2]
        ql = tf.shape(t_sec)[1]
        # -------------------------------
        # --------similarity_mat--------
        mask_main_etd = tf.expand_dims(mask_main, 2)  # bs,sl,1
        mask_sec_etd = tf.expand_dims(mask_sec, 1)  # bs,1,ql
        mask_similarity_mat = tf.logical_and(mask_main_etd, mask_sec_etd)  # bs,sl,ql
        if similarity_method == 'inner':
            t_main_etd = tf.expand_dims(t_main, 2)  # bs,sl,1,vec
            t_sec_etd = tf.expand_dims(t_sec, 1)  # bs,1,ql,vec
            similarity_mat = tf.reduce_sum(t_main_etd*t_sec_etd, -1)  # bs,sl,ql
        elif similarity_method == 'tri_linear':
            t_main_tiled = tf.tile(tf.expand_dims(t_main, 2), [1, 1, ql, 1])  # bs,sl,ql,vec
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            similarity_mat = get_logits([t_main_tiled, t_sec_tiled], None, False,
                                        scope='tri_linear_tri_linear', func='tri_linear')
        elif similarity_method == 'map_linear':
            t_main_map = tf.nn.relu(linear([t_main], hn, True, scope='linear_map_main'))
            t_sec_map = tf.nn.relu(linear([t_sec], hn, True, scope='linear_map_sec'))
            t_main_map_etd = tf.expand_dims(t_main_map, 2)  # bs,sl,1,hn
            t_sec_map_etd = tf.expand_dims(t_sec_map, 1)  # bs,1,ql,hn
            similarity_mat = tf.reduce_sum(t_main_map_etd * t_sec_map_etd, -1)  # bs,sl,ql
        else:
            raise AttributeError('No similarity matrix calculation method \'%s\'' % similarity_method)
        # -------------------------------
        if use_pooling:
            # pool mat along -2
            if pooling_method == 'max':
                pooling_out = tf.reduce_max(exp_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
            elif pooling_method == 'mean':
                sum_out = tf.reduce_sum(normal_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
                num = tf.reduce_sum(tf.cast(mask_similarity_mat, tf.int32), -2)  # bs,ql
                num = tf.where(tf.equal(num, tf.zeros_like(num, tf.int32)),
                               tf.ones_like(num, tf.int32), num)
                pooling_out = sum_out / tf.cast(num, tf.float32)  # bs,ql
            else:
                raise AttributeError('No pooling method \'%s\'' % pooling_method)
            return softsel(t_sec, pooling_out, mask_sec)  # bs,ql,vec -> bs,vec
        else:
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            # tiled target: [bs,sl,ql,vec]; logits: [bs,sl,ql]
            if not reverse:
                out = normal_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            else:
                out = reverse_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            return out  # bs,sl,vec
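
For the default 'inner' similarity without pooling, the function reduces to: build an inner-product similarity matrix between the two sequences, mask out padded pairs, and take a masked softmax over the attended sequence for every base position. A self-contained NumPy sketch of that path (masked_softmax here stands in for the repo's exp_mask and normal_softsel helpers):

import numpy as np

def masked_softmax(logits, mask):
    """Softmax over the last axis; positions where mask is False get ~zero weight."""
    logits = np.where(mask, logits, -1e30)
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def inner_attention(t_main, t_sec, mask_main, mask_sec):
    """t_main: [bs,sl,vec], t_sec: [bs,ql,vec] -> attended context, [bs,sl,vec]."""
    sim = np.einsum('bsv,bqv->bsq', t_main, t_sec)            # inner-product similarity, bs,sl,ql
    sim_mask = mask_main[:, :, None] & mask_sec[:, None, :]   # valid (base, attended) pairs
    attn = masked_softmax(sim, sim_mask)                      # attention weights over t_sec
    return attn @ t_sec                                       # bs,sl,ql @ bs,ql,vec -> bs,sl,vec

bs, sl, ql, vec = 2, 3, 4, 5
g = np.random.default_rng(0)
out = inner_attention(g.random((bs, sl, vec)), g.random((bs, ql, vec)),
                      np.ones((bs, sl), bool), np.ones((bs, ql), bool))
print(out.shape)  # (2, 3, 5)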
Ejemplo n.º 27
0
    def do_reduce(self, data_for_reduce, mask_for_reduce):
        hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
        mc = tf.shape(data_for_reduce)[1]
        with tf.variable_scope('sr_%s' % self.method_type):
            print('var num in (2.3) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
            # bs, mc, hn
            children_hid_un = data_for_reduce[:, :, :hn]
            children_cell = data_for_reduce[:, :, hn:]

            # bs, mc, 2*hn: child hidden states concatenated with their self-aligned context
            children_hid = tf.concat([
                children_hid_un,
                self_align_attention(children_hid_un, mask_for_reduce),
            ], -1)

            I = tf.nn.sigmoid(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_i', True)
                ], hn, False, 0., 'linear_i', False, 0., dropout, is_train) +
                self.bias_I)

            print('var num in (2.4) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            # bs,mc,2*hn -linear-> bs,mc,hn
            F = tf.nn.sigmoid(
                linear([children_hid], hn, True, 0., 'linear_f', False, 0.,
                       dropout, is_train))

            print('var num in (2.5) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            O = tf.nn.sigmoid(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_o', True)
                ], hn, False, 0., 'linear_o', False, 0., dropout, is_train) +
                self.bias_O)

            print('var num in (2.6) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            U = tf.nn.tanh(
                linear([
                    self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_u', True)
                ], hn, False, 0., 'linear_u', False, 0., dropout, is_train) +
                self.bias_U)

            print('var num in (2.7) :',
                  len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

            # children_cell * F--[bs,mc,hn]   mask_for_reduce [bs,mc]->[bs,mc,1]
            C = I * U + tf.reduce_sum(
                normal_mask(children_cell * F,
                            tf.expand_dims(mask_for_reduce, -1)), 1)
            H = O * tf.nn.tanh(C)

            return tf.concat([H, C], -1)
Ejemplo n.º 28
0
def directional_attention_with_dense(rep_tensor,
                                     rep_mask,
                                     direction=None,
                                     scope=None,
                                     keep_prob=1.,
                                     is_train=None,
                                     wd=0.,
                                     activation='elu',
                                     tensor_dict=None,
                                     name=None,
                                     hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or
                           'directional_attention_%s' % (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1),
                                [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp,
                               ivec,
                               False,
                               scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
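
The forward/backward behaviour comes entirely from the mask: with the meshgrid indices, position i may attend to position j only when j < i ('forward') or j > i ('backward'), and that triangular pattern is intersected with the padding mask. A small NumPy sketch of just the mask construction, useful for checking shapes:

import numpy as np

def directional_mask(rep_mask, direction='forward'):
    """rep_mask: [bs, sl] bool padding mask -> [bs, sl, sl] attention mask."""
    sl = rep_mask.shape[1]
    idx = np.arange(sl)
    row, col = idx[:, None], idx[None, :]              # row = attending position, col = attended position
    direct = (row > col) if direction == 'forward' else (col > row)
    return direct[None, :, :] & rep_mask[:, None, :]   # broadcast to bs,sl,sl

mask = np.array([[True, True, True, False]])           # one padded sentence of real length 3
print(directional_mask(mask, 'forward').astype(int)[0])
# [[0 0 0 0]
#  [1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]]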
Ejemplo n.º 29
0
def visit_sa_with_dense(rep_tensor,
                        keep_prob=1.,
                        is_train=None,
                        wd=0.,
                        activation='relu',
                        hn=None,
                        is_scale=True,
                        is_plus_sa=True):

    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
            tf.bool)  # sw_len, sw_len (all positions except the diagonal)

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec

            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(
                dependent, 1)  # batch_size, code_len,code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(
                head, 2)  # batch_size, code_len,code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                # scaled_tanh (scale * tanh(x / scale)) is assumed to be available at
                # module level, e.g. the helper defined inline in the previous example.
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact),
                                ivec,
                                True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
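
Both attention blocks end with the same fusion gate: a sigmoid gate mixes the position-wise features rep_map with the attended context, output = g * rep_map + (1 - g) * attn_result. A minimal NumPy sketch of that mixing step, assuming the two linear projections and the bias have already been folded into gate_logits (the repo's fusion gate computes them with two linear layers plus o_bias):

import numpy as np

def gated_fusion(rep_map, attn_result, gate_logits):
    """rep_map, attn_result, gate_logits: [bs, sl, ivec] -> gated mix of the same shape."""
    g = 1.0 / (1.0 + np.exp(-gate_logits))         # sigmoid gate in (0, 1)
    return g * rep_map + (1.0 - g) * attn_result   # per-dimension convex combination

x = np.ones((1, 2, 3))
print(gated_fusion(x, 2.0 * x, np.zeros_like(x)))  # gate = 0.5 everywhere -> all entries 1.5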