Example #1
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           scope=None,
           squeeze=False,
           wd=0.0,
           input_keep_prob=1.0,
           is_train=None):
    if args is None or (isinstance(args, (tuple, list)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (tuple, list)):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]  # for dense layer [(-1, d)]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [
            tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob),
                    lambda: arg)  # for dense layer [(-1, d)]
            for arg in flat_args
        ]
    flat_out = _linear(flat_args,
                       output_size,
                       bias,
                       bias_start=bias_start,
                       scope=scope)  # dense
    out = reconstruct(flat_out, args[0], 1)  # restore the leading dims of args[0]
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])

    if wd:
        add_reg_without_bias()

    return out
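
The helpers flatten, reconstruct and _linear come from the surrounding module and are not shown here. As a rough, self-contained NumPy sketch (function and variable names below are illustrative, not from the module), the three calls together amount to collapsing all leading dimensions, applying one dense map, and restoring the original shape with a new last dimension of size output_size:

import numpy as np

def linear_sketch(x, W, b):
    # x: [..., d], W: [d, output_size], b: [output_size]
    lead_shape = x.shape[:-1]
    flat = x.reshape(-1, x.shape[-1])                 # ~ flatten(arg, 1): [prod(lead), d]
    flat_out = flat @ W + b                           # ~ _linear(...): the dense layer
    return flat_out.reshape(*lead_shape, W.shape[1])  # ~ reconstruct(flat_out, x, 1)

x = np.random.randn(2, 5, 4)                          # [bs, sl, d]
W = np.random.randn(4, 3)
b = np.zeros(3)
print(linear_sketch(x, W, b).shape)                   # (2, 5, 3)
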
Example #2
def contextual_bi_rnn(tensor_rep,
                      mask_rep,
                      hn,
                      cell_type,
                      only_final=False,
                      wd=0.,
                      keep_prob=1.,
                      is_train=None,
                      scope=None):
    """
    fusing contextual information using bi-direction rnn
    :param tensor_rep: [..., sl, vec]
    :param mask_rep: [..., sl]
    :param hn:
    :param cell_type: 'gru', 'lstm', basic_lstm' and 'basic_rnn'
    :param only_final: True or False
    :param wd:
    :param keep_prob:
    :param is_train:
    :param scope:
    :return:
    """
    with tf.variable_scope(scope or 'contextual_bi_rnn'):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cell_type == 'gru':
            cell_fw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
        elif cell_type == 'lstm':
            cell_fw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_lstm':
            cell_fw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_rnn':
            cell_fw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
        else:
            raise AttributeError('no cell type \'%s\'' % cell_type)
        cell_dp_fw = SwitchableDropoutWrapper(cell_fw, is_train, keep_prob)
        cell_dp_bw = SwitchableDropoutWrapper(cell_bw, is_train, keep_prob)

        tensor_len = tf.reduce_sum(tf.cast(mask_rep, tf.int32), -1)  # [bs]

        (outputs_fw,
         outputs_bw), _ = bidirectional_dynamic_rnn(cell_dp_fw,
                                                    cell_dp_bw,
                                                    tensor_rep,
                                                    tensor_len,
                                                    dtype=tf.float32)
        rnn_outputs = tf.concat([outputs_fw, outputs_bw], -1)  # [..., sl, 2hn]

        if wd > 0:
            add_reg_without_bias()
        if not only_final:
            return rnn_outputs  # [..., sl, 2hn]
        else:
            return get_last_state(rnn_outputs, mask_rep)  # [..., 2hn]
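
A short NumPy sketch of the masking logic around the bi-RNN, assuming get_last_state simply gathers the output at the last valid timestep of each sequence (the helper itself is not shown in this snippet):

import numpy as np

mask = np.array([[1, 1, 1, 0, 0],
                 [1, 1, 1, 1, 1]], dtype=bool)              # mask_rep: [bs, sl]
lengths = mask.astype(np.int32).sum(-1)                     # tensor_len: [bs] -> [3, 5]

hn = 4
outputs_fw = np.random.randn(2, 5, hn)                      # forward RNN outputs
outputs_bw = np.random.randn(2, 5, hn)                      # backward RNN outputs
rnn_outputs = np.concatenate([outputs_fw, outputs_bw], -1)  # [bs, sl, 2hn]

last_state = rnn_outputs[np.arange(2), lengths - 1]         # last valid step: [bs, 2hn]
print(rnn_outputs.shape, last_state.shape)                  # (2, 5, 8) (2, 8)
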
Example #3
def one_direction_rnn(tensor_rep,
                      mask_rep,
                      hn,
                      cell_type,
                      only_final=False,
                      wd=0.,
                      keep_prob=1.,
                      is_train=None,
                      is_forward=True,
                      scope=None):
    assert not is_forward  # TODO: forward direction not implemented yet
    with tf.variable_scope(scope or '%s_rnn' %
                           ('forward' if is_forward else 'backward')):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cell_type == 'gru':
            cell = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
        elif cell_type == 'lstm':
            cell = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_lstm':
            cell = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_rnn':
            cell = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
        else:
            raise AttributeError('no cell type \'%s\'' % cell_type)
        cell_dp = SwitchableDropoutWrapper(cell, is_train, keep_prob)

        tensor_len = tf.reduce_sum(tf.cast(mask_rep, tf.int32), -1)  # [bs]

        rnn_outputs, _ = dynamic_rnn(cell_dp,
                                     tensor_rep,
                                     tensor_len,
                                     dtype=tf.float32)

        if wd > 0:
            add_reg_without_bias()
        if not only_final:
            return rnn_outputs  # [..., sl, hn]
        else:
            return get_last_state(rnn_outputs, mask_rep)  # [..., hn]
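
SwitchableDropoutWrapper is defined elsewhere in the module; presumably it applies dropout to the cell inputs only while training (switched by the is_train tensor, much like the tf.cond in Example #1). A minimal NumPy sketch of that idea, using inverted dropout and a hypothetical function name:

import numpy as np

def switchable_dropout(x, keep_prob, is_train, rng=np.random):
    # Training: zero each unit with probability 1 - keep_prob and rescale by
    # 1 / keep_prob so the expected activation matches inference.
    # Inference: return x unchanged.
    if not is_train or keep_prob >= 1.0:
        return x
    mask = rng.binomial(1, keep_prob, size=x.shape)
    return x * mask / keep_prob

x = np.ones((2, 3))
print(switchable_dropout(x, 0.5, is_train=True))   # roughly half the entries zeroed, rest doubled
print(switchable_dropout(x, 0.5, is_train=False))  # unchanged
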
Example #4
def multi_head_attention(rep_tensor,
                         rep_mask,
                         head_num=8,
                         hidden_units_num=64,
                         scope=None,
                         is_train=None,
                         keep_prob=1.,
                         wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'multi_head_attention'):

        with tf.variable_scope('positional_encoding'):
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1),
                               [1, ivec])  # sl, ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0),
                                   [sl, 1])  # sl, ivec
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(
                    tf.cast(seq_idxs, tf.float32) / tf.pow(
                        10000., 2.0 * tf.cast(feature_idxs, tf.float32) /
                        (1.0 * ivec))),
                tf.cos(
                    tf.cast(seq_idxs, tf.float32) / tf.pow(
                        10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) /
                        (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc,
                                                rep_mask)  # bs, sl, ivec

        with tf.variable_scope('multi_head_attention'):
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num],
                                tf.float32)
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3,head_num,bs,sl,ivec
            rep_tile_reshape = tf.reshape(
                rep_tile, [3, head_num, bs * sl, ivec])  # 3,head_num,bs*sl,ivec

            maps = tf.reshape(  # 3,head_num,bs*sl,hn ->  3,head_num,bs,sl,hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num,bs,sl,hn
            K_map = tf.squeeze(K_map, [0])  # head_num,bs,sl,hn
            V_map = tf.squeeze(V_map, [0])  # head_num,bs,sl,hn

            # head_num,bs,sl,sl
            # similarity_mat = tf.reduce_sum(Q_map_tile * K_map_tile, -1) / math.sqrt(1. * hidden_units_num)
            similarity_mat = tf.matmul(Q_map, tf.transpose(
                K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs,sl -> head_num,bs,sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0),
                                 [head_num, 1, 1])  # head_num,bs,sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask,
                                               2)  # head_num,bs,1,sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask,
                                               3)  # head_num,bs,sl,1
            multi_mask_tile = tf.logical_and(
                multi_mask_tile_1, multi_mask_tile_2)  # head_num,bs,sl,sl
            similarity_mat_masked = exp_mask(
                similarity_mat, multi_mask_tile)  # head_num,bs,sl,sl
            prob_dist = tf.nn.softmax(
                similarity_mat_masked)  # head_num,bs,sl,sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num,bs,sl,hn

            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])
            output = tf.reshape(attn_res_tran,
                                [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()

            return output
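
A self-contained NumPy sketch of the masked, scaled dot-product attention computed above, assuming exp_mask pushes masked logits to a large negative value before the softmax (names and shapes below are illustrative):

import numpy as np

def masked_attention(Q, K, V, mask):
    # Q, K, V: [head_num, bs, sl, hn]; mask: [bs, sl], True for valid tokens.
    hn = Q.shape[-1]
    logits = Q @ K.transpose(0, 1, 3, 2) / np.sqrt(hn)        # [head_num, bs, sl, sl]
    pair_mask = mask[None, :, None, :] & mask[None, :, :, None]
    logits = np.where(pair_mask, logits, -1e30)               # exp_mask-style masking
    logits = logits - logits.max(-1, keepdims=True)           # numerical stability
    probs = np.exp(logits)
    probs = probs / probs.sum(-1, keepdims=True)              # softmax over the key axis
    return probs @ V                                          # [head_num, bs, sl, hn]

head_num, bs, sl, hn = 2, 1, 4, 8
Q, K, V = (np.random.randn(head_num, bs, sl, hn) for _ in range(3))
mask = np.array([[True, True, True, False]])                  # last position is padding
attn_res = masked_attention(Q, K, V, mask)                    # head_num,bs,sl,hn
output = attn_res.transpose(1, 2, 0, 3).reshape(bs, sl, head_num * hn)
print(output.shape)                                           # (1, 4, 16)
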
Example #5
def build_tree_structure(normal_data,
                         op_lists,
                         reduce_mats,
                         method='dy_tree_lstm.v1',
                         hn=None,
                         wd=0.,
                         is_train=None,
                         keep_prob=1.,
                         swap_memory=False,
                         scope=None):
    """
    get shift reduce stacked mat from data and tree info
    :param normal_data: rank is 3 with shape [bs,sl,vec]
    :param op_lists: rank is 2 with shape [bs,ol], 1 for shift, 2 for reduce and 3 for padding
    :param reduce_mats: rank is 3 with shape [bs,ol,mc], indicate the reduce indices in stack matrix, -1 for padding
    :param method: 'concat' 'mean' 'merge' 'lstm'
    :param hn: hn for some func
    :param wd: weight decay
    :param is_train: 
    :param keep_prob: 
    :param swap_memory: use physical memory
    :param scope: 
    :return: [bs,ol,hn]
    """
    # todo: add new generate method
    method_class_list = [
        GeneBiLSTM, GeneBTTreeLSTM, GeneBTMerge, GeneDyTreeLSTMv0,
        GeneDyTreeLSTMv1
    ]
    with tf.variable_scope(scope or 'build_tree_structure', reuse=None):
        # transpose to time-major layout
        op_lists = tf.transpose(op_lists, [1, 0])  # [ol,bs]
        reduce_mats = tf.transpose(reduce_mats, [1, 0, 2])  # [ol,bs,mc]

        # len parameters
        bs, sl, d = tf.shape(normal_data)[0], tf.shape(
            normal_data)[1], tf.shape(normal_data)[2]
        ol = tf.shape(op_lists)[0]
        mc = tf.shape(reduce_mats)[2]

        gene = None
        for gene_class in method_class_list:
            if gene_class.method_type == method:
                gene = gene_class(hn, keep_prob, is_train, wd)
                break
        assert gene is not None, 'no shift reduce method %s' % method

        hn = gene.update_tree_hn()

        # elems for scan
        elems_tensors = [op_lists, reduce_mats]

        # non-sequence
        batch_indices = tf.range(0, bs, dtype=tf.int32)  # bs
        batch_indices_mat = tf.tile(tf.expand_dims(batch_indices, 1),
                                    [1, mc])  # bs,mc
        data_extend = tf.concat(
            [normal_data,
             tf.zeros(shape=[bs, 1, d], dtype=tf.float32)],
            axis=1)  # append a zero step so gathers stay in range once all tokens are shifted
        # scan variable init
        t_init = tf.constant(0, tf.int32)  # current row index in the stack matrix
        data_pointer_init = tf.zeros(
            [bs],
            tf.int32)  # index of the next token to shift, per batch element
        stack_mat_init = tf.zeros([ol, bs, hn], tf.float32)
        scan_init = (t_init, data_pointer_init, stack_mat_init)

        def main_scan_body(iter_vars, elems_vars):
            # get tensors
            # # iter: 1.t 2. data_pointer 3. stack_mat
            t = iter_vars[0]
            data_pointer = iter_vars[1]
            stack_mat = iter_vars[2]  # ol,bs,d
            # # elems: 1.op_list 2.reduce mat
            op_list = elems_vars[0]  # bs
            reduce_mat = elems_vars[1]  # bs mc

            # for shift
            shift_data_coordinates = tf.stack([batch_indices, data_pointer],
                                              axis=1)  # bs,2
            data_for_shift = tf.gather_nd(
                data_extend,
                shift_data_coordinates)  # coord:[bs,2]  data: [bs,sl,d]->bs,d
            # # TODO: add processing for shifted data
            processed_shifted_data = gene.do_shift(data_for_shift)
            assert processed_shifted_data is not None
            # # mask the shifted data: rows whose op is not shift become zero
            masked_shifted_data = tf.where(
                tf.equal(op_list,
                         tf.ones_like(op_list,
                                      tf.int32)), processed_shifted_data,
                tf.zeros_like(processed_shifted_data))  # bs,d
            # # data_pointer update
            data_pointer = tf.where(
                tf.equal(op_list, tf.ones_like(op_list, tf.int32)),
                data_pointer + 1, data_pointer)

            # for reduce
            # # mask generation
            reduce_data_coordinates = tf.stack([reduce_mat, batch_indices_mat],
                                               axis=2)  # bs,mc,2
            data_for_reduce = tf.gather_nd(stack_mat,
                                           reduce_data_coordinates)  # bs,mc,d
            mask_for_reduce = tf.not_equal(
                reduce_mat,
                tf.ones_like(reduce_mat) *
                -1)  # (reduce_mats[t] != -1)  # [bs,mc]
            # TODO: add processing for reduced data
            processed_reduced_data = gene.do_reduce(data_for_reduce,
                                                    mask_for_reduce)

            masked_reduced_data = tf.where(
                tf.equal(op_list,
                         tf.ones_like(op_list, tf.int32) * 2),
                processed_reduced_data,
                tf.zeros_like(processed_reduced_data))  # bs,d
            sr_data = masked_shifted_data + masked_reduced_data  # bs,d

            # new update method for shift and reduce result
            sr_data = tf.scatter_nd(indices=[[t]],
                                    updates=[sr_data],
                                    shape=[ol, bs, hn])
            stack_mat = stack_mat + sr_data

            return t + 1, data_pointer, stack_mat

        output = tf.scan(main_scan_body,
                         elems_tensors,
                         scan_init,
                         parallel_iterations=1,
                         swap_memory=swap_memory)

        output_stack_mats = output[2]  # ol,ol,bs,hn
        output_stack_mat = tf.transpose(output_stack_mats[-1],
                                        [1, 0, 2])  # bs,ol,hn
        output_stack_mat = gene.fetch_output(output_stack_mat)
        if wd > 0:
            add_reg_without_bias()
        return output_stack_mat
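
A toy, single-sentence NumPy sketch of what main_scan_body accumulates, assuming a hypothetical generator whose do_shift is the identity and whose do_reduce averages its children (the real Gene* classes are more elaborate): each step either shifts the next token onto a fresh stack row or reduces previously written rows into one.

import numpy as np

def toy_shift_reduce(tokens, op_list, reduce_mat):
    # tokens:     [sl, d]   token vectors of one sentence
    # op_list:    [ol]      1 = shift, 2 = reduce, 3 = padding
    # reduce_mat: [ol, mc]  stack-row indices consumed by each reduce, -1 = padding
    ol, d = len(op_list), tokens.shape[1]
    stack = np.zeros((ol, d))        # stack_mat: one new row per step
    pointer = 0                      # data_pointer: next token to shift
    for t, (op, red) in enumerate(zip(op_list, reduce_mat)):
        if op == 1:                              # shift: do_shift = identity here
            stack[t] = tokens[pointer]
            pointer += 1
        elif op == 2:                            # reduce: do_reduce = mean of children
            children = [i for i in red if i >= 0]
            stack[t] = stack[children].mean(0)
        # op == 3: padding, leave the zero row in place
    return stack                     # [ol, d], one batch column of stack_mat

tokens = np.arange(6, dtype=float).reshape(3, 2)   # 3 tokens, d = 2
op_list = [1, 1, 2, 1, 2]                          # shift, shift, reduce, shift, reduce
reduce_mat = np.array([[-1, -1], [-1, -1], [0, 1], [-1, -1], [2, 3]])
print(toy_shift_reduce(tokens, op_list, reduce_mat))
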