Example #1
def mean_pooling_for_unselected_head(
        unhead_org_idx, sl_unhead, rep_unhead_mask,
        dep_org_idx, sl_dep, rep_dep_mask,
        rep_dep_tensor, direction
):
    with tf.name_scope('pooling_for_un_head'):
        undep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_unhead, 1])  # [bs, sluh, sld]
        unhead_idxs = tf.tile(tf.expand_dims(unhead_org_idx, 2), [1, 1, sl_dep])  # [bs, sluh, sld]
        if direction is None:
            direct_mask_un = tf.not_equal(unhead_idxs, undep_idxs)  # [bs, sluh, sld]
        else:
            if direction == 'forward':
                direct_mask_un = tf.greater(unhead_idxs, undep_idxs)  # [bs, sluh, sld]
            else:
                direct_mask_un = tf.less(unhead_idxs, undep_idxs)  # [bs, sluh, sld]

        # [bs, sluh, sld]
        rep_mask_tile_un = tf.logical_and(tf.expand_dims(rep_dep_mask, 1), tf.expand_dims(rep_unhead_mask, 2))
        pooling_mask = tf.logical_and(direct_mask_un, rep_mask_tile_un)  # [bs, sluh, sld]

        # data for pooling
        pooling_data = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_unhead, 1, 1])  # bs,sluh,sld,hn
        # execute mean pooling based on pooling_mask[bs, sluh, sld] and pooling_data[bs,sluh,sld,hn]
        pooling_data = mask_for_high_rank(pooling_data, pooling_mask)  # [bs,sluh,sld,hn]
        pooling_data_sum = tf.reduce_sum(pooling_data, -2)  # [bs,sluh,hn]
        pooling_den = tf.reduce_sum(tf.cast(pooling_mask, tf.int32), -1, keep_dims=True)  # [bs,sluh,1]
        pooling_den = tf.where(tf.equal(pooling_den, 0), tf.ones_like(pooling_den), pooling_den)

        pooling_result = pooling_data_sum / tf.cast(pooling_den, tf.float32)
        return pooling_result
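Nearly every example on this page calls mask_for_high_rank and exp_mask_for_high_rank without showing them. The snippet below is a minimal sketch of these two helpers, written to be consistent with how the examples use them (TensorFlow 1.x assumed); the originals in the source codebase may differ in details such as the masking constant.

import tensorflow as tf

VERY_NEGATIVE_NUMBER = -1e30  # assumption: large negative value used for softmax masking


def mask_for_high_rank(val, val_mask, name=None):
    # Multiplicative mask: val is [..., sl, d], val_mask is [..., sl] (bool).
    val_mask = tf.expand_dims(val_mask, -1)
    return tf.multiply(val, tf.cast(val_mask, tf.float32),
                       name=name or 'mask_for_high_rank')


def exp_mask_for_high_rank(val, val_mask, name=None):
    # Additive mask for logits: pushes masked positions to a very negative value
    # so a following softmax assigns them (almost) zero weight.
    val_mask = tf.expand_dims(val_mask, -1)
    return tf.add(val, (1 - tf.cast(val_mask, tf.float32)) * VERY_NEGATIVE_NUMBER,
                  name=name or 'exp_mask_for_high_rank')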
Example #2
def cnn_for_sentence_encoding( # kim
        rep_tensor, rep_mask, filter_sizes=(3,4,5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    """

    :param rep_tensor:
    :param rep_mask:
    :param filter_sizes:
    :param num_filters:
    :param scope:
    :param is_train:
    :param keep_prob:
    :param wd:
    :return:
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl-fs+1, 1, fn
                # Maxpooling over the outputs
                # pooled = tf.nn.max_pool(
                #     h,
                #     ksize=[1, sl - filter_size + 1, 1, 1],
                #     strides=[1, 1, 1, 1],
                #     padding='VALID',
                #     name="pool")
                pooled = tf.reduce_max(h, 1, True)  # bs, 1, 1, fn
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        if wd > 0.:
            add_reg_without_bias()

        return h_pool_flat
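To make the shape flow of the conv + max-over-time block above concrete, here is a self-contained TF 1.x toy run of just those two ops (conv2d with a [filter_size, ivec, 1, num_filters] kernel followed by reduce_max over time); all sizes and the variable name W_toy are made up for illustration.

import numpy as np
import tensorflow as tf

bs, sl, ivec, filter_size, num_filters = 2, 7, 5, 3, 4

x = tf.constant(np.random.rand(bs, sl, ivec).astype(np.float32))
x_expanded = tf.expand_dims(x, 3)                       # [bs, sl, ivec, 1]

W = tf.get_variable('W_toy', [filter_size, ivec, 1, num_filters], tf.float32)
conv = tf.nn.conv2d(x_expanded, W, strides=[1, 1, 1, 1],
                    padding='VALID')                    # [bs, sl - filter_size + 1, 1, num_filters]
pooled = tf.reduce_max(tf.nn.relu(conv), 1, True)       # [bs, 1, 1, num_filters]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(pooled).shape)                       # (2, 1, 1, 4)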
Example #3
def cnn_for_context_fusion(rep_tensor,
                           rep_mask,
                           filter_sizes=(3, 4, 5),
                           num_filters=200,
                           scope=None,
                           is_train=None,
                           keep_prob=1.,
                           wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, ivec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                # padding along the sequence dimension
                if filter_size % 2 == 1:
                    padding_front = padding_back = int((filter_size - 1) / 2)
                else:
                    padding_front = (filter_size - 1) // 2
                    padding_back = padding_front + 1
                padding = [[0, 0], [padding_front, padding_back], [0, 0],
                           [0, 0]]
                rep_tensor_expand_dp_pad = tf.pad(rep_tensor_expand_dp,
                                                  padding)

                conv = tf.nn.conv2d(rep_tensor_expand_dp_pad,
                                    W,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b),
                               name="relu")  # bs, sl, 1, fn
                h_squeeze = tf.squeeze(h, [2])  # bs, sl, fn
                pooled_outputs.append(h_squeeze)

        # Combine all the pooled features
        result = tf.concat(pooled_outputs, 2)  # bs, sl, 3 * fn

        if wd > 0.:
            add_reg_without_bias()

        return result
Example #4
def hierarchical_cnn_res_gate(
        rep_tensor, rep_mask, n_gram=5, layer_num=5, hn=None, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    # padding
    if n_gram % 2 == 1:
        padding_front = padding_back = int((n_gram - 1) / 2)
    else:
        padding_front = (n_gram - 1) // 2
        padding_back = padding_front + 1
    padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]

    # lengths
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)  # bs, sl, hn

        iter_rep = rep_tensor
        layer_res_list = []

        for layer_idx in range(layer_num):
            with tf.variable_scope("conv_maxpool_%s" % layer_idx):

                iter_rep_etd = tf.expand_dims(iter_rep, 3)  # bs,sl,hn,1
                iter_rep_etd_dp = dropout(iter_rep_etd, keep_prob, is_train)
                # Convolution Layer
                feature_size = org_ivec if layer_idx == 0 else ivec
                filter_shape = [n_gram, feature_size, 1, 2 * ivec]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [2 * ivec], tf.float32)
                iter_rep_etd_pad = tf.pad(iter_rep_etd_dp, padding)
                conv = tf.nn.conv2d(
                    iter_rep_etd_pad,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                map_res = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs,sl,1,2hn
                map_res = tf.squeeze(map_res, [2])  # bs,sl,2*hn
                # gate
                map_res_a, map_res_b = tf.split(map_res, num_or_size_splits=2, axis=2)
                iter_rep = map_res_a * tf.nn.sigmoid(map_res_b)

                # res
                if len(layer_res_list) > 0:
                    iter_rep = iter_rep + layer_res_list[-1]
                layer_res_list.append(iter_rep)

        if wd > 0.:
            add_reg_without_bias()
        return iter_rep
Example #5
def time_aware_attention(train_inputs, embed, mask, embedding_size, k):
    with tf.variable_scope('time_aware_attention'):
        attn_weights = tf.Variable(
            tf.truncated_normal([embedding_size, k],
                                stddev=1.0 / math.sqrt(k)))
        attn_biases = tf.Variable(tf.zeros([k]))

        # add bias to the attention weights
        attn_embed = tf.nn.bias_add(attn_weights, attn_biases)

        # multiply with the embeddings E_i
        attn_scalars = tf.tensordot(embed, attn_embed, axes=[[2], [0]])

        # get abs of distance
        train_delta = tf.abs(train_inputs[:, :, 1])

        # distance function is log(dist+1)
        dist_fun = tf.log(tf.to_float(train_delta) + 1.0)

        # reshape the dist_fun
        dist_fun = tf.reshape(
            dist_fun, [tf.shape(dist_fun)[0],
                       tf.shape(dist_fun)[1], 1])

        # the attention logits
        attn_logits = tf.multiply(attn_scalars, dist_fun)

        # the summed attention logits
        attn_logits_sum = tf.reduce_sum(attn_logits, -1, keepdims=True)
        attn_logits_sum = exp_mask_for_high_rank(attn_logits_sum, mask)

        # get weights via softmax
        attn_softmax = tf.nn.softmax(attn_logits_sum, 1)

        # the weighted sum
        attn_embed_weighted = tf.multiply(attn_softmax, embed)
        attn_embed_weighted = mask_for_high_rank(attn_embed_weighted, mask)

        reduced_embed = tf.reduce_sum(attn_embed_weighted, 1)
        # obtain two scalars
        scalar1 = tf.log(tf.to_float(tf.shape(embed)[1]) + 1.0)
        scalar2 = tf.reduce_sum(tf.pow(attn_softmax, 2), 1)
        # scale the reduced embedding by the two scalars
        reduced_embed = tf.multiply(reduced_embed, scalar1)
        reduced_embed = tf.multiply(reduced_embed, scalar2)

        return reduced_embed, attn_embed_weighted
Example #6
def sentence_encoding_models(rep_tensor,
                             rep_mask,
                             method,
                             activation_function,
                             scope=None,
                             wd=0.,
                             is_train=None,
                             keep_prob=1.,
                             **kwargs):
    method_name_list = [
        'cnn_kim',
        'no_ct',
        'lstm',
        'gru',
        'sru',
        'sru_normal',  # rnn
        'cnn',
        'multi_head',
        'multi_head_git',
        'disa',
        'mpsa',
        'block'
    ]
    with tf.variable_scope(scope or 'sentence_encoding_models'):
        if method == 'cnn_kim':
            sent_coding = cnn_for_sentence_encoding(rep_tensor, rep_mask,
                                                    (3, 4, 5), 200,
                                                    'sent_encoding_cnn_kim',
                                                    is_train, keep_prob, wd)
        elif method == 'none':
            sent_coding = tf.reduce_sum(
                mask_for_high_rank(rep_tensor, rep_mask), 1)
        else:
            ct_rep = None
            if method == 'no_ct':
                ct_rep = tf.identity(rep_tensor)
            else:
                ct_rep = context_fusion_layers(rep_tensor, rep_mask, method,
                                               activation_function, None, wd,
                                               is_train, keep_prob, **kwargs)

            sent_coding = multi_dimensional_attention(
                ct_rep, rep_mask, 'multi_dim_attn_for_%s' % method, keep_prob,
                is_train, wd, activation_function)

        return sent_coding
Example #7
def normal_attention(rep_tensor,
                     rep_mask,
                     scope=None,
                     keep_prob=1.,
                     is_train=None,
                     wd=0.,
                     activation='elu',
                     tensor_dict=None,
                     name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0.,
                                        'bn_dense_map', activation, False, wd,
                                        keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map],
                                       None,
                                       False,
                                       scope='self_attn_logits',
                                       mask=rep_mask,
                                       input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits,
                              rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_tensor_map + (1 -
                                                     fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec
        return output
Example #8
def pooling_with_mask(rep_tensor, rep_mask, method='max', scope=None):
    # rep_tensor have one more rank than rep_mask
    with tf.name_scope(scope or '%s_pooling' % method):

        if method == 'max':
            rep_tensor_masked = exp_mask_for_high_rank(rep_tensor, rep_mask)
            output = tf.reduce_max(rep_tensor_masked, -2)
        elif method == 'mean':
            rep_tensor_masked = mask_for_high_rank(rep_tensor,
                                                   rep_mask)  # [...,sl,hn]
            rep_sum = tf.reduce_sum(rep_tensor_masked, -2)  #[..., hn]
            denominator = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1,
                                        True)  # [..., 1]
            denominator = tf.where(
                tf.equal(denominator, tf.zeros_like(denominator, tf.int32)),
                tf.ones_like(denominator, tf.int32), denominator)
            output = rep_sum / tf.cast(denominator, tf.float32)
        else:
            raise AttributeError('No pooling method named %s' % method)
        return output
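As a quick sanity check of pooling_with_mask (relying on the mask helpers sketched under Example #1 and TF 1.x), mean and max pooling should ignore the padded position entirely; the values below are arbitrary.

import tensorflow as tf

rep = tf.constant([[[1., 2.], [3., 4.], [100., 100.]]])  # [1, 3, 2]
mask = tf.constant([[True, True, False]])                # last position is padding

mean_pooled = pooling_with_mask(rep, mask, method='mean')
max_pooled = pooling_with_mask(rep, mask, method='max')

with tf.Session() as sess:
    print(sess.run(mean_pooled))  # [[2. 3.]] -- only the two valid rows are averaged
    print(sess.run(max_pooled))   # [[3. 4.]] -- the padded row is masked out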
Example #9
def self_attention_for_selected_head(
        head_selection, head_org_idx, sl_head, rep_head_mask,
        dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
        rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
):
    # data for self-attention
    rep_map_dp = dropout(rep_map, keep_prob, is_train)
    rep_dep_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, dep_selection)
    rep_head_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, head_selection)

    # mask generation
    dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_head, 1])
    head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, sl_dep])

    if direction is None:
        direct_mask = tf.not_equal(head_idxs, dep_idxs)  # [bs, slh, sld]
    else:
        if direction == 'forward':
            direct_mask = tf.greater(head_idxs, dep_idxs)  # [bs, slh, sld]
        else:
            direct_mask = tf.less(head_idxs, dep_idxs)  # [bs, slh, sld]
    # [bs, slh, sld]
    rep_mask_tile = tf.logical_and(tf.expand_dims(rep_dep_mask, 1), tf.expand_dims(rep_head_mask, 2))
    attn_mask = tf.logical_and(direct_mask, rep_mask_tile)  # [bs, slh, sld]

    # tensor tile
    rep_map_tile = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_head, 1, 1])  # bs,slh,sld,vec
    with tf.variable_scope('attention'):  # bs,sl,sl,vec
        f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
        dependent = linear(rep_dep_tensor_dp, ivec, False, scope='linear_dependent')  # bs,sld,vec
        dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sld,vec
        head = linear(rep_head_tensor_dp, ivec, False, scope='linear_head')  # bs,slh,vec
        head_etd = tf.expand_dims(head, 2)  # bs,slh,1,vec

        logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,slh,sld,vec
        logits_masked = exp_mask_for_high_rank(logits, attn_mask)  # bs,slh,sld,vec
        attn_score = tf.nn.softmax(logits_masked, 2)  # bs,slh,sld,vec
        attn_score = mask_for_high_rank(attn_score, attn_mask)
        attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,slh,vec -> head_org_idx
    return attn_result
Example #10
def directional_attention_with_dense(rep_tensor,
                                     rep_mask,
                                     direction=None,
                                     scope=None,
                                     keep_prob=1.,
                                     is_train=None,
                                     wd=0.,
                                     activation='elu',
                                     tensor_dict=None,
                                     name=None,
                                     hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction
                           or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1),
                                [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp,
                               ivec,
                               False,
                               scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
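The forward/backward/diag masks built at the top of directional_attention_with_dense are just strict lower/upper triangles plus an all-but-diagonal matrix. A tiny NumPy rendering of the same construction (sl = 4, purely illustrative):

import numpy as np

sl = 4
sl_col, sl_row = np.meshgrid(np.arange(sl), np.arange(sl))

forward = sl_row > sl_col          # position i attends to earlier positions j < i
backward = sl_col > sl_row         # position i attends to later positions j > i
diag = ~np.eye(sl, dtype=bool)     # attend to everything except itself (direction=None)

print(forward.astype(int))
# [[0 0 0 0]
#  [1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]]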
Example #11
    def build_network(self):
        # Look up embeddings for inputs.
        with tf.name_scope('code_embeddings'):
            init_code_embed = tf.random_uniform(
                [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            code_embeddings = tf.Variable(init_code_embed)
            context_embed = tf.nn.embedding_lookup(code_embeddings,
                                                   self.context_codes)

        if self.model_type == 'tesa':
            with tf.name_scope(self.model_type):
                # embedding_lookup output shape is shape(ids) + shape(embeddings)[1:]
                init_date_embed = tf.random_uniform(
                    [self.dates_size, self.embedding_size], -1.0, 1.0)
                date_embeddings = tf.Variable(init_date_embed)

                if self.is_date_encoding:
                    date_embed = tf.nn.embedding_lookup(
                        date_embeddings, self.context_dates)

                    # self_attention
                    cntxt_embed = temporal_date_sa_with_dense(
                        rep_tensor=context_embed,
                        rep_mask=self.context_mask,
                        date_tensor=date_embed,
                        is_train=True,
                        activation=self.activation,
                        is_scale=self.is_scale)
                else:
                    date_embed = tf.nn.embedding_lookup(
                        date_embeddings, self.train_masks)

                    # self_attention
                    cntxt_embed = temporal_delta_sa_with_dense(
                        rep_tensor=context_embed,
                        rep_mask=self.context_mask,
                        delta_tensor=date_embed,
                        is_train=True,
                        activation=self.activation,
                        is_scale=self.is_scale)

                # Attention pooling
                context_fusion = multi_dimensional_attention(cntxt_embed,
                                                             self.context_mask,
                                                             is_train=True)

        elif self.model_type == 'delta':
            with tf.name_scope(self.model_type):
                #self_attention
                init_date_embed = tf.random_uniform(
                    [self.dates_size, self.embedding_size], -1.0, 1.0)
                date_embeddings = tf.Variable(init_date_embed)
                date_embed = tf.nn.embedding_lookup(date_embeddings,
                                                    self.train_masks)
                cntxt_embed = delta_with_dense(rep_tensor=context_embed,
                                               rep_mask=self.context_mask,
                                               delta_tensor=date_embed,
                                               is_train=True,
                                               activation=self.activation,
                                               is_scale=self.is_scale)

                # attention pooling
                context_fusion = multi_dimensional_attention(cntxt_embed,
                                                             self.context_mask,
                                                             is_train=True)

        elif self.model_type == 'sa':
            with tf.name_scope(self.model_type):
                #self_attention
                cntxt_embed = self_attention_with_dense(
                    rep_tensor=context_embed,
                    rep_mask=self.context_mask,
                    is_train=True,
                    activation=self.activation,
                    is_scale=self.is_scale)

                # attention pooling
                context_fusion = multi_dimensional_attention(cntxt_embed,
                                                             self.context_mask,
                                                             is_train=True)

        elif self.model_type == 'normal':
            with tf.name_scope(self.model_type):
                #self_attention
                cntxt_embed = normal_attention(rep_tensor=context_embed,
                                               rep_mask=self.context_mask,
                                               is_train=True,
                                               activation=self.activation)

                # attention pooling
                context_fusion = multi_dimensional_attention(cntxt_embed,
                                                             self.context_mask,
                                                             is_train=True)

        elif self.model_type == 'cbow':
            with tf.name_scope(self.model_type):
                cntxt_embed = mask_for_high_rank(
                    context_embed, self.context_mask)  # bs,sl,vec
                context_fusion = tf.reduce_mean(cntxt_embed, 1)

        elif self.model_type == 'ta_attn':
            context_fusion = time_aware_attention(self.train_inputs,
                                                  context_embed,
                                                  self.context_mask,
                                                  self.embedding_size,
                                                  k=100)

        elif self.model_type == 'fusion':
            with tf.name_scope(self.model_type):
                # self-attention
                code2code = self_attention_with_dense(
                    rep_tensor=context_embed,
                    rep_mask=self.context_mask,
                    is_train=True,
                    activation=self.activation)
                # attention pooling
                source2code = multi_dimensional_attention(code2code,
                                                          self.context_mask,
                                                          is_train=True)
                # time-aware attention
                ta_attn_res = time_aware_attention(self.train_inputs,
                                                   context_embed,
                                                   self.context_mask,
                                                   self.embedding_size,
                                                   k=100)

                ivec = ta_attn_res.get_shape()[1]
                concat_context = tf.concat([source2code, ta_attn_res], 1)

                # context_fusion = fusion_gate(source2code,ta_attn_res,wd=0., keep_prob=1., is_train=True)
                context_fusion = bn_dense_layer(concat_context,
                                                ivec,
                                                True,
                                                0.,
                                                'bn_dense_map',
                                                self.activation,
                                                False,
                                                wd=0.,
                                                keep_prob=1.,
                                                is_train=True)
        return context_fusion, code_embeddings
Example #12
def multi_head_attention(rep_tensor,
                         rep_mask,
                         head_num=8,
                         hidden_units_num=64,
                         scope=None,
                         is_train=None,
                         keep_prob=1.,
                         wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'multi_head_attention'):

        with tf.variable_scope('positional_encoding'):
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1),
                               [1, ivec])  # sl, ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0),
                                   [sl, 1])  # sl, ivec
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(
                    tf.cast(seq_idxs, tf.float32) / tf.pow(
                        10000., 2.0 * tf.cast(feature_idxs, tf.float32) /
                        (1.0 * ivec))),
                tf.cos(
                    tf.cast(seq_idxs, tf.float32) / tf.pow(
                        10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) /
                        (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc,
                                                rep_mask)  # bs, sl, ivec

        with tf.variable_scope('multi_head_attention'):
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num],
                                tf.float32)
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3,head_num,bs,sl,ivec
            rep_tile_reshape = tf.reshape(
                rep_tile, [3, head_num, bs * sl, ivec])  # 3,head_num,bs*sl,ivec

            maps = tf.reshape(  # 3,head_num,bs*sl,hn ->  3,head_num,bs,sl,hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num,bs,sl,hn
            K_map = tf.squeeze(K_map, [0])  # head_num,bs,sl,hn
            V_map = tf.squeeze(V_map, [0])  # head_num,bs,sl,hn

            # head_num,bs,sl,sl
            # similarity_mat = tf.reduce_sum(Q_map_tile * K_map_tile, -1) / math.sqrt(1. * hidden_units_num)
            similarity_mat = tf.matmul(Q_map, tf.transpose(
                K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs,sl -> head_num,bs,sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0),
                                 [head_num, 1, 1])  # head_num,bs,sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask,
                                               2)  # head_num,bs,1,sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask,
                                               3)  # head_num,bs,sl,1
            multi_mask_tile = tf.logical_and(
                multi_mask_tile_1, multi_mask_tile_2)  # head_num,bs,sl,sl
            similarity_mat_masked = exp_mask(
                similarity_mat, multi_mask_tile)  # head_num,bs,sl,sl
            prob_dist = tf.nn.softmax(
                similarity_mat_masked)  # head_num,bs,sl,sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num,bs,sl,hn

            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])
            output = tf.reshape(attn_res_tran,
                                [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()

            return output
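The positional_encoding block above is the usual sinusoidal encoding: sin at even feature indices and cos at odd ones, with the odd index shifted by one inside the frequency term. A small NumPy version of the same formula for reference (sl and ivec are made-up sizes):

import numpy as np

sl, ivec = 6, 8
pos = np.arange(sl)[:, None].astype(np.float32)        # sl, 1
feat = np.arange(ivec)[None, :].astype(np.float32)     # 1, ivec

angle_even = pos / np.power(10000., 2.0 * feat / ivec)
angle_odd = pos / np.power(10000., 2.0 * (feat - 1) / ivec)
pos_enc = np.where(feat % 2 == 0, np.sin(angle_even), np.cos(angle_odd))  # sl, ivec
print(pos_enc.shape)  # (6, 8)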
Example #13
def context_fusion_layers(rep_tensor,
                          rep_mask,
                          method,
                          activation_function,
                          scope=None,
                          wd=0.,
                          is_train=None,
                          keep_prob=1.,
                          **kwargs):
    method_name_list = [
        'lstm',
        'gru',
        'sru',
        'sru_normal',  # rnn
        'cnn',
        'multi_head',
        'multi_head_git',
        'disa',
        'mpsa',
        'block'
    ]
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    context_fusion_output = None
    with tf.variable_scope(scope or 'context_fusion_layers'):
        if method in ['lstm', 'gru', 'sru_normal']:
            context_fusion_output = contextual_bi_rnn(rep_tensor, rep_mask,
                                                      ivec, method, False, wd,
                                                      keep_prob, is_train,
                                                      'ct_bi_%s' % method)
        elif method == 'sru':
            context_fusion_output = bi_sru_recurrent_network(
                rep_tensor, rep_mask, is_train, keep_prob, wd, 'ct_bi_sru')
        elif method == 'cnn':
            context_fusion_output = cnn_for_context_fusion(
                rep_tensor, rep_mask, (3, 4, 5), 200, 'ct_cnn', is_train,
                keep_prob, wd)
        elif method == 'multi_head':
            context_fusion_output = multi_head_attention(
                rep_tensor, rep_mask, 8, 75, 'ct_multi_head', is_train,
                keep_prob, wd)
        elif method == 'multi_head_git':
            context_fusion_output = multi_head_attention_git(
                rep_tensor, rep_mask, 8, 600, 'ct_multi_head', is_train,
                keep_prob, wd)
        elif method == 'disa':
            with tf.variable_scope('ct_disa'):
                disa_fw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'forward', 'fw_disa', keep_prob,
                    is_train, wd, activation_function)
                disa_bw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'backward', 'bw_disa', keep_prob,
                    is_train, wd, activation_function)
                context_fusion_output = tf.concat([disa_fw, disa_bw], -1)
        elif method == 'block':
            if 'block_len' in kwargs.keys():
                block_len = kwargs['block_len']
            else:
                block_len = None
            if block_len is None:
                block_len = tf.cast(
                    tf.ceil(tf.pow(tf.cast(2 * sl, tf.float32), 1.0 / 3)),
                    tf.int32)
            context_fusion_output = bi_directional_simple_block_attention(
                rep_tensor, rep_mask, block_len, 'ct_block_attn', keep_prob,
                is_train, wd, activation_function)
        elif method == 'mpsa':
            with tf.variable_scope('ct_mpsa'):
                mpsa_fw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'forward', 'fw_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_bw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'backward', 'bw_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_2g = masked_positional_self_attention(
                    2, rep_tensor, rep_mask, None, '2g_mpsa', keep_prob,
                    is_train, wd, activation_function)
                mpsa_3g = masked_positional_self_attention(
                    3, rep_tensor, rep_mask, None, '3g_mpsa', keep_prob,
                    is_train, wd, activation_function)
                sen_tensor = mask_for_high_rank(rep_tensor, rep_mask)
                sen_tensor_t = tf.expand_dims(sen_tensor, 2)
                fw_res = tf.expand_dims(mpsa_fw, 2)
                bw_res = tf.expand_dims(mpsa_bw, 2)
                g2_res = tf.expand_dims(mpsa_2g, 2)
                g3_res = tf.expand_dims(mpsa_3g, 2)
                tmp_res = tf.concat(
                    [sen_tensor_t, fw_res, bw_res, g2_res, g3_res],
                    2)  # bs,sl,5,ivec

                bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
                ivec = rep_tensor.get_shape()[2]
                num = tmp_res.get_shape()[2]
                bias = tf.get_variable('bias', [num * ivec], tf.float32,
                                       tf.constant_initializer(0.))
                softmax_gate = linear(sen_tensor, num * ivec, True, 0.,
                                      'linear_softmax', False, wd, keep_prob,
                                      is_train) + bias  # bs,sl,5*ivec
                fusion_gate = tf.nn.softmax(
                    tf.reshape(softmax_gate, [bs, sl, num, ivec]), 2)
                context_fusion_output = tf.reduce_sum(fusion_gate * tmp_res,
                                                      2)  # bs,sl,ivec
        else:
            raise RuntimeError('unknown context fusion method: %s' % method)

        return context_fusion_output
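When method == 'block' and no block_len is passed in kwargs, the fallback is block_len = ceil((2 * sl)^(1/3)). A quick check of that arithmetic for a few illustrative sequence lengths:

import math

for sl in (16, 32, 100):
    block_len = int(math.ceil((2.0 * sl) ** (1.0 / 3)))
    print(sl, block_len)   # 16 -> 4, 32 -> 4, 100 -> 6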
Example #14
def simple_block_attention(rep_tensor,
                           rep_mask,
                           block_len=5,
                           scope=None,
                           direction=None,
                           keep_prob=1.,
                           is_train=None,
                           wd=0.,
                           activation='elu',
                           hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(
                    tf.divide(tf.cast(sl, tf.float32),
                              tf.cast(block_len, tf.float32))), tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor,
                 tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat([
                rep_mask,
                tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)
            ], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp,
                [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp,
                                        [bs, block_num, block_len])  # bs,bn,bl

            # non-linear
            rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0.,
                                     'bn_dense_map', activation, False, wd,
                                     keep_prob, is_train)  # bs,bn,bl,vec
            rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2),
                                   [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
            # rep_map_dp = dropout(rep_map, keep_prob, is_train)
            bn = block_num
            bl = block_len

        with tf.variable_scope('self_attention'):
            # @2.self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0),
                [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2),
                                      [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3),
                                      [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile,
                                       rep_mask_tile,
                                       name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent_head = linear(rep_map, 2 * ivec, False, 0.,
                                    'linear_dependent_head', False, wd,
                                    keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score,
                                            attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                             3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(self_attn_result, ivec, True,
                                                0., 'bn_dense_map', 'linear',
                                                False, wd, keep_prob,
                                                is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked,
                                             2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32),
                                                   tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask,
                direction, 'disa', keep_prob, is_train, wd,
                activation)  # [bs,bn,vec]

            block_ct_res_tile = tf.tile(tf.expand_dims(
                block_ct_res, 2), [1, 1, bl, 1])  #[bs,bn,vec]->[bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input: 1.rep_map[bs,bn,bl,vec]; 2.self_attn_result[bs,bn,bl,vec]; 3.block_ct_res_tile[bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile],
                -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(rep_tensor_with_ct, 2 * ivec, True,
                                          0., 'linear_new_context_and_gate',
                                          False, wd, keep_prob,
                                          is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2,
                                         3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError('unknown activation: %s' % activation)
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (
                1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
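The split_seq step pads the sequence up to a multiple of block_len before reshaping it into blocks. The same arithmetic worked through once with arbitrary numbers:

import math

sl, block_len = 13, 5
block_num = int(math.ceil(sl / float(block_len)))   # 3 blocks
comp_len = block_num * block_len - sl               # pad 2 positions with zeros
print(block_num, comp_len)                          # 3 2
# the padded tensor [bs, 15, d] is then reshaped to [bs, 3, 5, d]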
Example #15
def gated_self_attention(rep_tensor,
                         rep_mask,
                         scope=None,
                         keep_prob=1.,
                         is_train=None,
                         wd=0.,
                         activation='elu',
                         hn=None,
                         position_mask_type=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'gated_self_attention_%s' %
                           (position_mask_type or 'None')):
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)

        # mask generation
        rep_mask_epd1 = tf.expand_dims(rep_mask, 1)  # bs,1,sl
        rep_mask_epd2 = tf.expand_dims(rep_mask, 2)  # bs,sl,1
        rep_mask_mat = tf.logical_and(rep_mask_epd1, rep_mask_epd2)  # bs,sl,sl

        if position_mask_type in ['forward', 'backward']:
            sl_indices = tf.range(sl, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if position_mask_type == 'forward':
                position_mask = tf.greater(sl_row, sl_col)
            else:
                position_mask = tf.greater(sl_col, sl_row)
            position_mask = tf.tile(tf.expand_dims(position_mask, 0),
                                    [bs, 1, 1])
            position_mask = tf.logical_and(rep_mask_mat, position_mask)

        else:
            position_mask = rep_mask_mat

        position_mask_ft = tf.cast(position_mask, tf.float32)

        # attention
        with tf.variable_scope('intra_sent_attn'):  # bs,sl,hn
            # rep_tensor_mean = pooling_with_mask(rep_tensor, rep_mask, 'mean')  # bs, hn
            rep_tensor_for_attn = rep_map

            pre_align_score = bn_dense_layer(  # bs,sl,hn
                rep_tensor_for_attn, ivec, True, 0., 'intra_sent_map1',
                activation, False, wd, keep_prob, is_train)
            align_score = bn_dense_layer(  # bs,sl,hn
                pre_align_score, ivec, True, 0., 'intra_sent_map2', 'linear',
                False, wd, keep_prob, is_train)
            align_score_w_mask = exp_mask_for_high_rank(align_score,
                                                        rep_mask)  # bs,sl,hn
            exp_align_score = tf.exp(align_score_w_mask)  # bs,sl,hn

            accum_z_deno = tf.matmul(position_mask_ft, exp_align_score)
            accum_z_deno = tf.where(
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno, tf.ones_like(accum_z_deno))

            rep_mul_score = rep_map * exp_align_score
            accum_rep_mul_score = tf.matmul(position_mask_ft, rep_mul_score)

            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope('context_fusion_gate'):
            fusion_gate = tf.nn.sigmoid(
                bn_dense_layer([rep_map, attn_res], ivec, True, 0.,
                               'linear_fusion_gate', activation, False, wd,
                               keep_prob, is_train))
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_res

        output = mask_for_high_rank(output, rep_mask)
        return output
Example #16
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1./scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head') # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                dis_mask = -tf.log(tf.cast(tf.abs(sl_col - sl_row) + 
                                           tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
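For direction=None, the mask in masked_positional_self_attention keeps position pairs with |i - j| < sigma and i != j. The same three conditions rendered in NumPy (sigma = 2 and sl = 5 chosen only for illustration):

import numpy as np

sigma, sl = 2, 5
sl_col, sl_row = np.meshgrid(np.arange(sl), np.arange(sl))

mask0 = sl_row + sigma > sl_col        # j - i < sigma
mask1 = sl_col + sigma > sl_row        # i - j < sigma
mask2 = ~np.eye(sl, dtype=bool)        # i != j
direct_mask = mask0 & mask1 & mask2

print(direct_mask.astype(int))
# [[0 1 0 0 0]
#  [1 0 1 0 0]
#  [0 1 0 1 0]
#  [0 0 1 0 1]
#  [0 0 0 1 0]]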
Example #17
def visit_sa_with_dense(rep_tensor,
                        keep_prob=1.,
                        is_train=None,
                        wd=0.,
                        activation='relu',
                        hn=None,
                        is_scale=True,
                        is_plus_sa=True):

    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
            tf.bool)  # sw_len, sw_len (broadcast over the batch dim)

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec

            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(
                dependent, 1)  # batch_size, code_len,code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(
                head, 2)  # batch_size, code_len,code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact),
                                ivec,
                                True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
Example #18
    def build_network(self):

        with tf.name_scope('code_embeddings'):
            if self.model_type == 'raw':
                # init_code_embed = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
                # code_embeddings = tf.Variable(init_code_embed)
                init_code_embed = tf.one_hot(self.inputs, self.vocabulary_size, on_value=1.0, off_value=0.0, axis=-1)
                inputs_embed = bn_dense_layer(init_code_embed, self.embedding_size, True, 0.,
                                        'bn_dense_map_linear', 'linear',
                                        False, wd=0., keep_prob=1.,
                                        is_train=True)
            elif self.model_type in ('tesa', 'delta', 'sa', 'normal', 'cbow', 'sg'):
                init_code_embed = tesan_trans(self.model_type)
                code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
                inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
            elif self.model_type == 'mce':
                init_code_embed = mce_trans()
                code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
                inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
            elif self.model_type == 'glove':
                init_code_embed = glove_trans()
                code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
                inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
            else:
                init_code_embed = med2vec_trans()
                code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
                inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)

        with tf.name_scope('visit_embedding'):
            # bs, max_visits, max_len_visit, embed_size
            inputs_masked = mask_for_high_rank(inputs_embed, self.inputs_mask)
            inputs_reduced = tf.reduce_mean(inputs_masked, 2)  # batch_size, max_visits, embed_size

        with tf.name_scope('visit_masking'):
            visit_mask = tf.reduce_sum(tf.cast(self.inputs_mask, tf.int32), -1)  # [bs,max_visits]
            visit_mask = tf.cast(visit_mask, tf.bool)
            tensor_len = tf.reduce_sum(tf.cast(visit_mask, tf.int32), -1)  # [bs]

        with tf.name_scope('RNN_computaion'):
            reuse = None if not tf.get_variable_scope().reuse else True
            if cfg.cell_type == 'gru':
                cell = tf.contrib.rnn.GRUCell(cfg.hn, reuse=reuse)
            elif cfg.cell_type == 'lstm':
                cell = tf.contrib.rnn.LSTMCell(cfg.hn, reuse=reuse)
            elif cfg.cell_type == 'basic_lstm':
                cell = tf.contrib.rnn.BasicLSTMCell(cfg.hn, reuse=reuse)
            elif cfg.cell_type == 'basic_rnn':
                cell = tf.contrib.rnn.BasicRNNCell(cfg.hn, reuse=reuse)
            else:
                raise ValueError('unknown cell type: %s' % cfg.cell_type)

            outputs, final_state = dynamic_rnn(cell, inputs_reduced, tensor_len, dtype=tf.float32)
        return outputs, final_state, tensor_len