Ejemplo n.º 1
0
def cnn_for_sentence_encoding( # kim
        rep_tensor, rep_mask, filter_sizes=(3,4,5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    """

    :param rep_tensor:
    :param rep_mask:
    :param filter_sizes:
    :param num_filters:
    :param scope:
    :param is_train:
    :param keep_prob:
    :param wd:
    :return:
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl-fs+1, 1, fn
                # Maxpooling over the outputs
                # pooled = tf.nn.max_pool(
                #     h,
                #     ksize=[1, sl - filter_size + 1, 1, 1],
                #     strides=[1, 1, 1, 1],
                #     padding='VALID',
                #     name="pool")
                pooled = tf.reduce_max(h, 1, True)  # bs, 1, 1, fn
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        if wd > 0.:
            add_reg_without_bias()

        return h_pool_flat
Ejemplo n.º 2
0
def hierarchical_cnn_res_gate(
        rep_tensor, rep_mask, n_gram=5, layer_num=5, hn=None, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    # padding
    if n_gram % 2 == 1:
        padding_front = padding_back = int((n_gram - 1) / 2)
    else:
        padding_front = (n_gram - 1) // 2
        padding_back = padding_front + 1
    padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]

    # lengths
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)  # bs, sl, hn

        iter_rep = rep_tensor
        layer_res_list = []

        for layer_idx in range(layer_num):
            with tf.variable_scope("conv_maxpool_%s" % layer_idx):

                iter_rep_etd = tf.expand_dims(iter_rep, 3)  # bs,sl,hn,1
                iter_rep_etd_dp = dropout(iter_rep_etd, keep_prob, is_train)
                # Convolution Layer
                feature_size = org_ivec if layer_idx == 0 else ivec
                filter_shape = [n_gram, feature_size, 1, 2 * ivec]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [2 * ivec], tf.float32)
                iter_rep_etd_pad = tf.pad(iter_rep_etd_dp, padding)
                conv = tf.nn.conv2d(
                    iter_rep_etd_pad,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                map_res = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs,sl,1,2hn
                map_res = tf.squeeze(map_res, [2])  # bs,sl,2*hn
                # gate
                map_res_a, map_res_b = tf.split(map_res, num_or_size_splits=2, axis=2)
                iter_rep = map_res_a * tf.nn.sigmoid(map_res_b)

                # res
                if len(layer_res_list) > 0:
                    iter_rep = iter_rep + layer_res_list[-1]
                layer_res_list.append(iter_rep)

        if wd > 0.:
            add_reg_without_bias()
        return iter_rep
Ejemplo n.º 3
0
def cnn_for_context_fusion(
        rep_tensor, rep_mask, filter_sizes=(3,4,5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl,
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                # # pading in the sequence
                if filter_size % 2 == 1:
                    padding_front = padding_back = int((filter_size - 1) / 2)
                else:
                    padding_front = (filter_size - 1) // 2
                    padding_back = padding_front + 1
                padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]
                rep_tensor_expand_dp_pad = tf.pad(rep_tensor_expand_dp, padding)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp_pad,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, fn
                h_squeeze = tf.squeeze(h, [2])  # bs, sl, fn
                pooled_outputs.append(h_squeeze)

        # Combine all the pooled features
        result = tf.concat(pooled_outputs, 2)  # bs, sl, 3 * fn

        if wd > 0.:
            add_reg_without_bias()

        return result
Ejemplo n.º 4
0
def visit_sa_with_dense(rep_tensor,
                        keep_prob=1.,
                        is_train=None,
                        wd=0.,
                        activation='relu',
                        hn=None,
                        is_scale=True,
                        is_plus_sa=True):

    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
            tf.bool)  # batch_size, code_len, code_len

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec

            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(
                dependent, 1)  # batch_size, code_len,code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(
                head, 2)  # batch_size, code_len,code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact),
                                ivec,
                                True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
Ejemplo n.º 5
0
def directional_attention_with_dense(rep_tensor,
                                     rep_mask,
                                     direction=None,
                                     scope=None,
                                     keep_prob=1.,
                                     is_train=None,
                                     wd=0.,
                                     activation='elu',
                                     tensor_dict=None,
                                     name=None,
                                     hn=None):

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'directional_attention_%s' % direction
                           or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1),
                                [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp,
                               ivec,
                               False,
                               scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output