from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr


__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]

Trainable = True
w_nolr = fluid.ParamAttr(trainable=Trainable)
train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 256,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}
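# A minimal sketch (not part of the original snippet) of how such a
# "learning_strategy" block is typically turned into an optimizer;
# `total_images` is an assumed dataset size used to convert epochs to steps.
def _build_piecewise_optimizer(params, total_images=1281167):
    strategy = params["learning_strategy"]
    step = int(math.ceil(float(total_images) / strategy["batch_size"]))
    boundaries = [step * e for e in strategy["epochs"]]
    return fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=boundaries, values=strategy["steps"]),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))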

class ResNet():
    def __init__(self, params):
        self.layers = params['layers']
Example n. 2
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
            **ignored):
    # 8 features
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
        size=[pred_dict_len, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr='vemb')

    mark_embedding = fluid.layers.embedding(
        input=mark,
        size=[mark_dict_len, mark_dim],
        dtype='float32',
        is_sparse=IS_SPARSE)

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
    ]

    hidden_0 = fluid.layers.sums(input=hidden_0_layers)

    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
        ])

        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
    ])

    return feature_out
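# A minimal usage sketch for db_lstm() above (assumptions: the module-level
# constants such as word_dict_len, pred_dict_len, mark_dict_len and
# label_dict_len are defined elsewhere, as in the original semantic role
# labeling demo; the CRF head below is illustrative, not part of the snippet).
def _build_srl_cost():
    feature_names = ['word', 'verb', 'ctx_n2', 'ctx_n1', 'ctx_0',
                     'ctx_p1', 'ctx_p2', 'mark']
    feats = [
        fluid.layers.data(name=n, shape=[1], dtype='int64', lod_level=1)
        for n in feature_names
    ]
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)
    feature_out = db_lstm(*feats)
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out, label=target,
        param_attr=fluid.ParamAttr(name='crfw'))
    return fluid.layers.mean(crf_cost)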
Example n. 3
    def forward(self, graph_wrapper, is_test=False):
        """
        Build the network.
        """
        node_features = self._mol_encoder(graph_wrapper, name=self.name)

        features_list = [node_features]
        for layer in range(self.layer_num):
            edge_features = self._bond_encoder(graph_wrapper,
                                               name='%s_layer%s' %
                                               (self.name, layer))
            if self.gnn_type == "gcn":
                feat = gcn_layer(graph_wrapper,
                                 features_list[layer],
                                 edge_features,
                                 act="relu",
                                 name="%s_layer%s_gcn" % (self.name, layer))
            elif self.gnn_type == "gat":
                feat = gat_layer(graph_wrapper,
                                 features_list[layer],
                                 edge_features,
                                 self.embed_dim,
                                 act="relu",
                                 name="%s_layer%s_gat" % (self.name, layer))
            else:
                feat = gin_layer(graph_wrapper,
                                 features_list[layer],
                                 edge_features,
                                 name="%s_layer%s_gin" % (self.name, layer))

            if self.norm_type == 'batch_norm':
                feat = layers.batch_norm(
                    feat,
                    param_attr=fluid.ParamAttr(
                        name="%s_layer%s_batch_norm_scale" %
                        (self.name, layer),
                        initializer=fluid.initializer.Constant(1.0)),
                    bias_attr=fluid.ParamAttr(
                        name="%s_layer%s_batch_norm_bias" % (self.name, layer),
                        initializer=fluid.initializer.Constant(0.0)),
                    moving_mean_name="%s_layer%s_batch_norm_moving_avearage" %
                    (self.name, layer),
                    moving_variance_name="%s_layer%s_batch_norm_moving_variance"
                    % (self.name, layer),
                    is_test=is_test)
            elif self.norm_type == 'layer_norm':
                feat = layers.layer_norm(
                    feat,
                    param_attr=fluid.ParamAttr(
                        name="%s_layer%s_layer_norm_scale" %
                        (self.name, layer),
                        initializer=fluid.initializer.Constant(1.0)),
                    bias_attr=fluid.ParamAttr(
                        name="%s_layer%s_layer_norm_bias" % (self.name, layer),
                        initializer=fluid.initializer.Constant(0.0)))
            else:
                raise ValueError('%s not supported.' % self.norm_type)

            if self.graph_norm:
                feat = pgl.layers.graph_norm(graph_wrapper, feat)

            if layer < self.layer_num - 1:
                feat = layers.relu(feat)
            feat = layers.dropout(feat,
                                  self.dropout_rate,
                                  dropout_implementation="upscale_in_train",
                                  is_test=is_test)

            # residual
            if self.residual:
                feat = feat + features_list[layer]

            features_list.append(feat)

        # Jumping-knowledge aggregation over the per-layer node features.
        if self.JK == "sum":
            node_repr = layers.reduce_sum(
                layers.stack(features_list, axis=0), dim=0)
        elif self.JK == "mean":
            node_repr = layers.reduce_mean(
                layers.stack(features_list, axis=0), dim=0)
        elif self.JK == "last":
            node_repr = features_list[-1]
        else:
            node_repr = features_list[-1]
        return node_repr
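    # A hedged follow-up sketch (not part of the snippet above): the node-level
    # representation returned by forward() is usually pooled into a graph-level
    # representation before any prediction head, e.g. with PGL's graph pooling.
    def _graph_repr_sketch(self, graph_wrapper, is_test=False):
        node_repr = self.forward(graph_wrapper, is_test=is_test)
        return pgl.layers.graph_pooling(graph_wrapper, node_repr, "average")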
Example n. 4
    def __init__(self,
                 num_labels,
                 n_layer,
                 hidden_size=768,
                 name="encoder",
                 search_layer=True,
                 use_fixed_gumbel=False,
                 gumbel_alphas=None):
        super(EncoderLayer, self).__init__()
        self._n_layer = n_layer
        self._hidden_size = hidden_size
        self._n_channel = 128
        self._steps = 3
        self._n_ops = len(ConvBN_PRIMITIVES)
        self.use_fixed_gumbel = use_fixed_gumbel

        self.stem0 = fluid.dygraph.Sequential(
            Conv2D(num_channels=1,
                   num_filters=self._n_channel,
                   filter_size=[3, self._hidden_size],
                   padding=[1, 0],
                   param_attr=fluid.ParamAttr(initializer=MSRA()),
                   bias_attr=False),
            BatchNorm(num_channels=self._n_channel,
                      param_attr=fluid.ParamAttr(
                          initializer=fluid.initializer.Constant(value=1)),
                      bias_attr=fluid.ParamAttr(
                          initializer=fluid.initializer.Constant(value=0))))

        self.stem1 = fluid.dygraph.Sequential(
            Conv2D(num_channels=1,
                   num_filters=self._n_channel,
                   filter_size=[3, self._hidden_size],
                   padding=[1, 0],
                   param_attr=fluid.ParamAttr(initializer=MSRA()),
                   bias_attr=False),
            BatchNorm(num_channels=self._n_channel,
                      param_attr=fluid.ParamAttr(
                          initializer=fluid.initializer.Constant(value=1)),
                      bias_attr=fluid.ParamAttr(
                          initializer=fluid.initializer.Constant(value=0))))

        cells = []
        for i in range(n_layer):
            cell = Cell(steps=self._steps,
                        n_channel=self._n_channel,
                        name="%s/layer_%d" % (name, i))
            cells.append(cell)

        self._cells = fluid.dygraph.LayerList(cells)

        k = sum(1 for i in range(self._steps) for n in range(2 + i))
        num_ops = self._n_ops
        self.alphas = fluid.layers.create_parameter(
            shape=[k, num_ops],
            dtype="float32",
            default_initializer=NormalInitializer(loc=0.0, scale=1e-3))

        self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
        self.bns = []
        self.outs = []
        for i in range(self._n_layer):
            bn = BatchNorm(num_channels=self._n_channel,
                           param_attr=fluid.ParamAttr(
                               initializer=fluid.initializer.Constant(value=1),
                               trainable=False),
                           bias_attr=fluid.ParamAttr(
                               initializer=fluid.initializer.Constant(value=0),
                               trainable=False))
            out = Linear(self._n_channel,
                         num_labels,
                         param_attr=ParamAttr(initializer=MSRA()),
                         bias_attr=ParamAttr(initializer=MSRA()))
            self.bns.append(bn)
            self.outs.append(out)
        self._bns = fluid.dygraph.LayerList(self.bns)
        self._outs = fluid.dygraph.LayerList(self.outs)

        self.use_fixed_gumbel = use_fixed_gumbel
        #self.gumbel_alphas = gumbel_softmax(self.alphas, 0).detach()

        mrpc_arch = [
            [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # std_conv7 0     # node 0
            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # dil_conv5 1
            [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # std_conv7 0     # node 1
            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # dil_conv5 1
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # zero 2
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # zero 0          # node2
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # std_conv3 1
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # zero 2
            [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]  # dil_conv3 3
        ]
        self.gumbel_alphas = to_variable(
            np.array(mrpc_arch).astype(np.float32))
        self.gumbel_alphas.stop_gradient = True
        print("gumbel_alphas: \n", self.gumbel_alphas.numpy())
Example n. 5
    def context(self, trainable=False, max_seq_len=128, num_slots=1):
        """
        Get the input, output and program of the pretrained emotion_detection_textcnn

        Args:
             trainable(bool): Whether to fine-tune the pretrained parameters of emotion_detection_textcnn or not.
             max_seq_len (int): It will limit the total sequence returned so that it has a maximum length.
             num_slots(int): The number of data inputs fed to the model, selected from the following options:

                 - 1(default): There is only one input to be fed to the model, e.g. the module is used for a text classification task.
                 - 2: There are two inputs to be fed to the model, e.g. the module is used for a text matching task (point-wise).
                 - 3: There are three inputs to be fed to the model, e.g. the module is used for a text matching task (pair-wise).

        Returns:
             inputs(dict): the input variables of emotion_detection_textcnn (words)
             outputs(dict): the output variables of the input words (word embeddings and label probabilities);
                 the sentence embedding and sequence length of the first input text.
             main_program(Program): the main_program of emotion_detection_textcnn with pretrained parameters
        """
        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            text_1 = fluid.layers.data(name="text", shape=[-1, max_seq_len, 1], dtype="int64", lod_level=0)
            seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64', lod_level=0)
            seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])

            # Add embedding layer.
            w_param_attrs = fluid.ParamAttr(
                name="embedding_0.w_0", initializer=fluid.initializer.TruncatedNormal(scale=0.02), trainable=trainable)
            dict_dim = 240466
            emb_1 = fluid.layers.embedding(
                input=text_1,
                size=[dict_dim, 128],
                is_sparse=True,
                padding_idx=dict_dim - 1,
                dtype='float32',
                param_attr=w_param_attrs)
            emb_1_name = emb_1.name
            data_list = [text_1]
            emb_name_list = [emb_1_name]

            # Add the textcnn layer.
            pred, fc = textcnn_net(emb_1, seq_len_used)
            pred_name = pred.name
            fc_name = fc.name

            if num_slots > 1:
                text_2 = fluid.data(name='text_2', shape=[-1, max_seq_len], dtype='int64', lod_level=0)
                emb_2 = fluid.embedding(
                    input=text_2,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_2_name = emb_2.name
                data_list.append(text_2)
                emb_name_list.append(emb_2_name)

            if num_slots > 2:
                text_3 = fluid.data(name='text_3', shape=[-1, max_seq_len], dtype='int64', lod_level=0)
                emb_3 = fluid.embedding(
                    input=text_3,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_3_name = emb_3.name
                data_list.append(text_3)
                emb_name_list.append(emb_3_name)

            variable_names = filter(lambda v: v not in ['text', 'text_2', 'text_3', "seq_len"],
                                    list(main_program.global_block().vars.keys()))
            prefix_name = "@HUB_{}@".format(self.name)
            add_vars_prefix(program=main_program, prefix=prefix_name, vars=variable_names)

            for param in main_program.global_block().iter_parameters():
                param.trainable = trainable

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)

            # Load the emotion_detection_textcnn pretrained model.
            def if_exist(var):
                return os.path.exists(os.path.join(self.pretrained_model_path, var.name))

            fluid.io.load_vars(exe, self.pretrained_model_path, predicate=if_exist)

            inputs = {'seq_len': seq_len}
            outputs = {
                "class_probs": main_program.global_block().vars[prefix_name + pred_name],
                "sentence_feature": main_program.global_block().vars[prefix_name + fc_name]
            }
            for index, data in enumerate(data_list):
                if index == 0:
                    inputs['text'] = data
                    outputs['emb'] = main_program.global_block().vars[prefix_name + emb_name_list[0]]
                else:
                    inputs['text_%s' % (index + 1)] = data
                    outputs['emb_%s' % (index + 1)] = main_program.global_block().vars[prefix_name +
                                                                                       emb_name_list[index]]
            return inputs, outputs, main_program
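# A minimal usage sketch for context() above (assumes PaddleHub and the
# emotion_detection_textcnn module are installed; everything below is
# illustrative, not part of the snippet):
import paddlehub as hub

module = hub.Module(name="emotion_detection_textcnn")
inputs, outputs, program = module.context(
    trainable=False, max_seq_len=128, num_slots=1)
# inputs['text'] and inputs['seq_len'] feed `program`;
# outputs['class_probs'] holds the emotion probabilities.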
Example n. 6
def point_network_decoder(p_vec, q_vec, hidden_size, args):
    tag = 'pn_decoder:'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

    random_attn = layers.create_parameter(
        shape=[1, hidden_size],
        dtype='float32',
        default_initializer=init_random)
    random_attn = layers.fc(
        input=random_attn,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b'))
    random_attn = layers.reshape(random_attn, shape=[-1])
    U = layers.fc(input=q_vec,
                  param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'),
                  bias_attr=False,
                  size=hidden_size,
                  act=None) + random_attn
    U = layers.tanh(U)

    logits = layers.fc(input=U,
                       param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                       bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'),
                       size=1,
                       act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(
        input=pooled_vec,
        param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'),
        size=hidden_size,
        act=None)

    def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args):
        tag = para_name + "custom_dynamic_rnn:"

        def static_rnn(step,
                       p_vec=p_vec,
                       init_state=None,
                       para_name='',
                       args=args):
            tag = para_name + "static_rnn:"
            ctx = layers.fc(
                input=p_vec,
                param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
                bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
                size=hidden_size,
                act=None)

            beta = []
            c_prev = init_state
            m_prev = init_state
            for i in range(step):
                m_prev0 = layers.fc(
                    input=m_prev,
                    size=hidden_size,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
                m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.tanh(Fk)
                logits = layers.fc(
                    input=Fk,
                    size=1,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')

                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=hidden_size,
                    para_name=tag,
                    args=args)
                m_prev = hidden_t
                c_prev = cell_t
                beta.append(scores)
            return beta

        return static_rnn(
            2, p_vec=p_vec, init_state=init_state, para_name=para_name)

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "fw:",
                                    args)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "bw:",
                                    args)

    start_prob = layers.elementwise_add(
        x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2
    end_prob = layers.elementwise_add(
        x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2

    return start_prob, end_prob
Example n. 7
def gaan(gw, feature, hidden_size_a, hidden_size_v, hidden_size_m,
         hidden_size_o, heads, name):
    """Implementation of GaAN"""
    def send_func(src_feat, dst_feat, edge_feat):
        # compute attention
        # E * (M * D1)
        feat_query, feat_key = dst_feat['feat_query'], src_feat['feat_key']
        # E * M * D1
        old = feat_query
        feat_query = fluid.layers.reshape(feat_query,
                                          [-1, heads, hidden_size_a])
        feat_key = fluid.layers.reshape(feat_key, [-1, heads, hidden_size_a])
        # E * M
        alpha = fluid.layers.reduce_sum(feat_key * feat_query, dim=-1)

        return {
            'dst_node_feat': dst_feat['node_feat'],
            'src_node_feat': src_feat['node_feat'],
            'feat_value': src_feat['feat_value'],
            'alpha': alpha,
            'feat_gate': src_feat['feat_gate']
        }

    def recv_func(message):
        # feature of src and dst node on each edge
        dst_feat = message['dst_node_feat']
        src_feat = message['src_node_feat']
        # feature of center node
        x = fluid.layers.sequence_pool(dst_feat, 'average')
        # feature of neighbors of center node
        z = fluid.layers.sequence_pool(src_feat, 'average')

        # compute gate
        feat_gate = message['feat_gate']
        g_max = fluid.layers.sequence_pool(feat_gate, 'max')
        g = fluid.layers.concat([x, g_max, z], axis=1)
        g = fluid.layers.fc(g, heads, bias_attr=False, act="sigmoid")

        # softmax
        alpha = message['alpha']
        alpha = paddle_helper.sequence_softmax(alpha)  # E * M

        feat_value = message['feat_value']  # E * (M * D2)
        old = feat_value
        feat_value = fluid.layers.reshape(
            feat_value, [-1, heads, hidden_size_v])  # E * M * D2
        feat_value = fluid.layers.elementwise_mul(feat_value, alpha, axis=0)
        feat_value = fluid.layers.reshape(
            feat_value, [-1, heads * hidden_size_v])  # E * (M * D2)
        feat_value = fluid.layers.lod_reset(feat_value, old)

        feat_value = fluid.layers.sequence_pool(feat_value,
                                                'sum')  # N * (M * D2)

        feat_value = fluid.layers.reshape(
            feat_value, [-1, heads, hidden_size_v])  # N * M * D2

        output = fluid.layers.elementwise_mul(feat_value, g, axis=0)
        output = fluid.layers.reshape(
            output, [-1, heads * hidden_size_v])  # N * (M * D2)

        output = fluid.layers.concat([x, output], axis=1)

        return output

    # N * (D1 * M)
    feat_key = fluid.layers.fc(feature,
                               hidden_size_a * heads,
                               bias_attr=False,
                               param_attr=fluid.ParamAttr(name=name +
                                                          '_project_key'))
    # N * (D2 * M)
    feat_value = fluid.layers.fc(feature,
                                 hidden_size_v * heads,
                                 bias_attr=False,
                                 param_attr=fluid.ParamAttr(name=name +
                                                            '_project_value'))
    # N * (D1 * M)
    feat_query = fluid.layers.fc(feature,
                                 hidden_size_a * heads,
                                 bias_attr=False,
                                 param_attr=fluid.ParamAttr(name=name +
                                                            '_project_query'))
    # N * Dm
    feat_gate = fluid.layers.fc(feature,
                                hidden_size_m,
                                bias_attr=False,
                                param_attr=fluid.ParamAttr(name=name +
                                                           '_project_gate'))

    # send
    message = gw.send(
        send_func,
        nfeat_list=[('node_feat', feature), ('feat_key', feat_key),
                    ('feat_value', feat_value), ('feat_query', feat_query),
                    ('feat_gate', feat_gate)],
        efeat_list=None,
    )

    # recv
    output = gw.recv(message, recv_func)
    output = fluid.layers.fc(output,
                             hidden_size_o,
                             bias_attr=False,
                             param_attr=fluid.ParamAttr(name=name +
                                                        '_project_output'))
    output = fluid.layers.leaky_relu(output, alpha=0.1)
    output = fluid.layers.dropout(output, dropout_prob=0.1)

    return output
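# A rough usage sketch for gaan() above (assumes `gw` is a PGL graph wrapper whose
# node feature "h" was registered when the wrapper was built; all sizes below are
# illustrative):
def _gaan_layer_sketch(gw):
    h = gw.node_feat["h"]
    return gaan(gw, h, hidden_size_a=32, hidden_size_v=32, hidden_size_m=64,
                hidden_size_o=128, heads=8, name="gaan_layer_0")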
Example n. 8
    def __init__(self,
                 hidden_size,
                 num_steps,
                 num_layers=1,
                 init_scale=0.1,
                 dropout=None):

        # This model takes several parameters:
        # 1. hidden_size: the embedding size, i.e. the dimension of the memory vectors.
        # 2. num_steps: the maximum length of time sequence the LSTM can handle.
        # 3. num_layers: the number of stacked LSTM layers. Given an input of shape
        #    [batch_size, seq_len, embedding_size], an LSTM layer produces an output of
        #    the same shape, which can be fed into another LSTM layer; stacking layers
        #    this way helps the model learn more complex sentences or even whole passages.
        # 4. init_scale: the initialization range for the network parameters. LSTMs use
        #    many tanh and sigmoid activations, which are sensitive to numerical precision,
        #    so a small initialization range is usually used to keep training stable.

        super(SimpleLSTMRNN, self).__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._init_scale = init_scale
        self._dropout = dropout
        self._input = None
        self._num_steps = num_steps
        self.cell_array = []
        self.hidden_array = []

        # weight_1_arr stores the gate weight matrices (W) of each LSTM layer
        self.weight_1_arr = []
        self.weight_2_arr = []
        # bias_arr stores the gate bias vectors (b) of each LSTM layer
        self.bias_arr = []
        self.mask_array = []

        # Create the parameters of each LSTM layer with create_parameter.
        # From the LSTM equations we need 8 weight matrices of shape
        # [_hidden_size, _hidden_size] and 4 bias vectors of shape [_hidden_size]
        # per layer, so instead we declare one parameter of size
        # [self._hidden_size * 2, self._hidden_size * 4] and one of size
        # [self._hidden_size * 4]. This lets a single matmul compute all 8 matrix
        # multiplications at once, which speeds up computation.
        for i in range(self._num_layers):
            weight_1 = self.create_parameter(
                attr=fluid.ParamAttr(
                    initializer=fluid.initializer.UniformInitializer(
                        low=-self._init_scale, high=self._init_scale)),
                shape=[self._hidden_size * 2, self._hidden_size * 4],
                dtype="float32",
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-self._init_scale, high=self._init_scale))

            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))

            bias_1 = self.create_parameter(
                attr=fluid.ParamAttr(
                    initializer=fluid.initializer.UniformInitializer(
                        low=-self._init_scale, high=self._init_scale)),
                shape=[self._hidden_size * 4],
                dtype="float32",
                default_initializer=fluid.initializer.Constant(0.0))

            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
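# A numpy sketch (not part of the snippet above) of how the packed parameters are
# typically consumed in a single LSTM step: one matmul produces all four gates,
# which are then split. The gate order (i, c~, f, o) is one common layout and must
# match whatever the accompanying forward pass assumes.
import numpy as np

def _lstm_step_sketch(x_t, h_prev, c_prev, weight_1, bias_1):
    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    gates = np.concatenate([x_t, h_prev], axis=1).dot(weight_1) + bias_1
    i, c_hat, f, o = np.split(gates, 4, axis=1)
    c_t = c_prev * sigmoid(f) + sigmoid(i) * np.tanh(c_hat)
    h_t = np.tanh(c_t) * sigmoid(o)
    return h_t, c_t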
Example n. 9
    def net(self, inputs, is_infer=False):

        # ------------------------- network input --------------------------

        hist_item_seq = inputs[0]  # history item sequence
        hist_cat_seq = inputs[1]  # history category sequence
        target_item = inputs[2]  # one dim target item
        target_cat = inputs[3]  # one dim target category
        label = inputs[4]  # label
        mask = inputs[5]  # mask
        target_item_seq = inputs[6]  # target item expand to sequence
        target_cat_seq = inputs[7]  # target category expand to sequence
        neg_hist_item_seq = inputs[8]  # neg item sampling for aux loss
        neg_hist_cat_seq = inputs[9]  # neg cat sampling for aux loss

        item_emb_attr = fluid.ParamAttr(name="item_emb")
        cur_program = fluid.Program()
        cur_block = cur_program.current_block()
        item_emb_copy = cur_block.create_var(
            name="item_emb",
            shape=[self.item_count, self.item_emb_size],
            dtype='float32')
        #item_emb_copy = fluid.layers.Print(item_emb_copy, message="Testing:")
        ##item_emb_attr = fluid.layers.Print(item_emb_attr, summarize=2)
        cat_emb_attr = fluid.ParamAttr(name="cat_emb")

        # ------------------------- Embedding Layer --------------------------

        hist_item_emb = fluid.embedding(
            input=hist_item_seq,
            size=[self.item_count, self.item_emb_size],
            param_attr=item_emb_attr,
            is_sparse=self.is_sparse)
        item_emb_copy = fluid.layers.Print(item_emb_copy,
                                           message="Testing:",
                                           summarize=20,
                                           print_phase='backward')
        neg_hist_cat_emb = fluid.embedding(
            input=neg_hist_cat_seq,
            size=[self.cat_count, self.cat_emb_size],
            param_attr=cat_emb_attr,
            is_sparse=self.is_sparse)

        neg_hist_item_emb = fluid.embedding(
            input=neg_hist_item_seq,
            size=[self.item_count, self.item_emb_size],
            param_attr=item_emb_attr,
            is_sparse=self.is_sparse)

        hist_cat_emb = fluid.embedding(
            input=hist_cat_seq,
            size=[self.cat_count, self.cat_emb_size],
            param_attr=cat_emb_attr,
            is_sparse=self.is_sparse)

        target_item_emb = fluid.embedding(
            input=target_item,
            size=[self.item_count, self.item_emb_size],
            param_attr=item_emb_attr,
            is_sparse=self.is_sparse)

        target_cat_emb = fluid.embedding(
            input=target_cat,
            size=[self.cat_count, self.cat_emb_size],
            param_attr=cat_emb_attr,
            is_sparse=self.is_sparse)

        target_item_seq_emb = fluid.embedding(
            input=target_item_seq,
            size=[self.item_count, self.item_emb_size],
            param_attr=item_emb_attr,
            is_sparse=self.is_sparse)

        target_cat_seq_emb = fluid.embedding(
            input=target_cat_seq,
            size=[self.cat_count, self.cat_emb_size],
            param_attr=cat_emb_attr,
            is_sparse=self.is_sparse)

        item_b = fluid.embedding(
            input=target_item,
            size=[self.item_count, 1],
            param_attr=fluid.initializer.Constant(value=0.0))

        # ------------------------- Interest Extractor Layer --------------------------

        hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb],
                                              axis=2)
        neg_hist_seq_concat = fluid.layers.concat(
            [neg_hist_item_emb, neg_hist_cat_emb], axis=2)
        target_seq_concat = fluid.layers.concat(
            [target_item_seq_emb, target_cat_seq_emb], axis=2)
        target_concat = fluid.layers.concat([target_item_emb, target_cat_emb],
                                            axis=1)

        reshape_hist_item_emb = fluid.layers.reduce_sum(hist_seq_concat, dim=1)
        neg_reshape_hist_item_emb = fluid.layers.reduce_sum(
            neg_hist_seq_concat, dim=1)
        gru_input_hist_item_emb = fluid.layers.concat([reshape_hist_item_emb] *
                                                      3,
                                                      axis=1)

        gru_h1 = fluid.layers.dynamic_gru(gru_input_hist_item_emb,
                                          size=self.item_emb_size * 2)
        gru_h1_input = fluid.layers.concat([gru_h1] * 3, axis=1)
        gru_h2 = fluid.layers.dynamic_gru(gru_h1_input,
                                          size=self.item_emb_size * 2)

        # ------------------------- Auxiliary loss  --------------------------

        pad_value = fluid.layers.zeros(shape=[1], dtype='float32')
        start_value = fluid.layers.zeros(shape=[1], dtype='int32')
        gru_out_pad, lengths = fluid.layers.sequence_pad(gru_h2, pad_value)
        pos_seq_pad, _ = fluid.layers.sequence_pad(reshape_hist_item_emb,
                                                   pad_value)
        neg_seq_pad, _ = fluid.layers.sequence_pad(neg_reshape_hist_item_emb,
                                                   pad_value)
        seq_shape = fluid.layers.shape(pos_seq_pad)
        if (seq_shape[1] == 1):
            aux_loss = 0
        else:
            test_pos = fluid.layers.reduce_sum(fluid.layers.reduce_sum(
                fluid.layers.log(
                    fluid.layers.sigmoid(
                        fluid.layers.reduce_sum(
                            gru_out_pad[:, start_value:seq_shape[1] - 1, :] *
                            pos_seq_pad[:, start_value + 1:seq_shape[1], :],
                            dim=2,
                            keep_dim=True))),
                dim=2),
                                               dim=1,
                                               keep_dim=True)
            test_neg = fluid.layers.reduce_sum(fluid.layers.reduce_sum(
                fluid.layers.log(
                    fluid.layers.sigmoid(
                        fluid.layers.reduce_sum(
                            gru_out_pad[:, start_value:seq_shape[1] - 1, :] *
                            neg_seq_pad[:, start_value + 1:seq_shape[1], :],
                            dim=2,
                            keep_dim=True))),
                dim=2),
                                               dim=1,
                                               keep_dim=True)
            aux_loss = fluid.layers.mean(test_neg + test_pos)

        # ------------------------- Interest Evolving Layer (GRU with attentional input (AIGRU)) --------------------------

        weighted_vector = self.din_attention(gru_out_pad, target_seq_concat,
                                             mask)
        weighted_vector = fluid.layers.transpose(weighted_vector, [1, 0, 2])
        concat_weighted_vector = fluid.layers.concat([weighted_vector] * 3,
                                                     axis=2)

        attention_rnn = fluid.layers.StaticRNN(name="attention_evolution")

        with attention_rnn.step():
            word = attention_rnn.step_input(concat_weighted_vector)
            prev = attention_rnn.memory(shape=[-1, self.item_emb_size * 2],
                                        batch_ref=word)
            hidden, _, _ = fluid.layers.gru_unit(input=word,
                                                 hidden=prev,
                                                 size=self.item_emb_size * 6)
            attention_rnn.update_memory(prev, hidden)
            attention_rnn.output(hidden)

        attention_rnn_res = attention_rnn()
        attention_rnn_res_T = fluid.layers.transpose(attention_rnn_res,
                                                     [1, 0, 2])[:, -1, :]

        out = fluid.layers.sequence_pool(input=hist_item_emb, pool_type='sum')
        out_fc = fluid.layers.fc(name="out_fc",
                                 input=out,
                                 size=self.item_emb_size + self.cat_emb_size,
                                 num_flatten_dims=1)
        embedding_concat = fluid.layers.concat(
            [attention_rnn_res_T, target_concat], axis=1)

        fc1 = fluid.layers.fc(name="fc1",
                              input=embedding_concat,
                              size=80,
                              act=self.act)
        fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
        fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
        logit = fc3 + item_b

        loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit,
                                                              label=label)

        avg_loss = fluid.layers.mean(loss) + aux_loss
        self._cost = avg_loss

        self.predict = fluid.layers.sigmoid(logit)
        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)

        label_int = fluid.layers.cast(label, 'int64')
        auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
                                                     label=label_int,
                                                     slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

        if is_infer:
            self._infer_results["AUC"] = auc_var
Example n. 10
    def test_pipeline(self):
        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
        emb_x = layers.embedding(input=x,
                                 param_attr=fluid.ParamAttr(name="embx"),
                                 size=[10, 2],
                                 is_sparse=False)
        emb_y = layers.embedding(input=y,
                                 param_attr=fluid.ParamAttr(name="emby",
                                                            learning_rate=0.9),
                                 size=[10, 2],
                                 is_sparse=False)

        concat = layers.concat([emb_x, emb_y], axis=1)

        fc = layers.fc(input=concat,
                       name="fc",
                       size=1,
                       num_flatten_dims=1,
                       bias_attr=False)
        loss = layers.reduce_mean(fc)

        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
        optimizer = fluid.optimizer.PipelineOptimizer(
            optimizer,
            cut_list=[[emb_x, emb_y], [loss]],
            place_list=[
                fluid.CPUPlace(),
                fluid.CUDAPlace(0),
                fluid.CPUPlace()
            ],
            concurrency_list=[1, 1, 1],
            queue_size=1,
            sync_steps=10000000,
        )
        optimizer.minimize(loss)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        #prepare data
        batch_size = 100

        def binary_print(slot, fout):
            num = np.int16(len(slot) + 1)
            num.tofile(fout)
            a = np.int64(batch_size)
            a.tofile(fout)
            slot.tofile(fout)

        #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
        #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
        batch1 = np.ones(
            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
        batch2 = np.ones(
            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
        data = [batch1, batch2]
        filelist = []
        for i in range(2):
            filelist.append("test_pipeline_input_" + str(i))
        for f in filelist:
            with open(f, "wb") as fout:
                for batch_data in data:
                    for ins in batch_data:
                        for slot in ins:
                            binary_print(slot, fout)

        dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
        dataset.set_use_var([x, y])
        dataset.set_batch_size(batch_size)
        dataset.set_filelist(filelist)

        for epoch in range(1):
            exe.train_from_dataset(fluid.default_main_program(),
                                   dataset,
                                   thread=1,
                                   debug=False,
                                   fetch_list=[],
                                   fetch_info=[],
                                   print_period=1)

        for f in filelist:
            os.remove(f)
Example n. 11
    def __init__(self,
                 hidden_size,
                 vocab_size,
                 class_num=2,
                 num_layers=1,
                 num_steps=128,
                 init_scale=0.1,
                 dropout=None):

        # The parameters of this model are:
        # 1. hidden_size: the embedding size, i.e. the dimension of the hidden and cell vectors.
        # 2. vocab_size: the size of the vocabulary the model can handle.
        # 3. class_num: the number of sentiment classes; it can be binary or multi-class.
        # 4. num_steps: the maximum sentence length this sentiment model can handle.
        # 5. init_scale: the initialization range for the network parameters. LSTMs use
        #    many tanh and sigmoid activations, which are sensitive to numerical precision,
        #    so a small initialization range is usually used to keep training stable.

        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.class_num = class_num
        self.init_scale = init_scale
        self.num_layers = num_layers
        self.num_steps = num_steps
        self.dropout = dropout

        # Declare an LSTM model that abstracts a sentence into a single vector
        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
                                             num_steps,
                                             num_layers=num_layers,
                                             init_scale=init_scale,
                                             dropout=dropout)

        # Declare an embedding layer that converts each word in the sentence into a vector
        self.embedding = Embedding(
            size=[vocab_size, hidden_size],
            dtype='float32',
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                name='embedding_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale)))

        # After obtaining a vector representation of the sentence, we classify the
        # sentence based on that vector. Typically the sentence vector is multiplied
        # by a weight matrix W of size [self.hidden_size, self.class_num] and a bias
        # b of size [self.class_num] is added, mapping the sentence vector to the
        # classification result.

        # Declare the parameter used to map the sentence vector to a concrete
        # sentiment class; its size is usually [self.hidden_size, self.class_num].

        self.softmax_weight = self.create_parameter(
            attr=fluid.ParamAttr(),
            shape=[self.hidden_size, self.class_num],
            dtype="float32",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))
        # Similarly, declare the bias parameter b used in the final classification;
        # its size is usually [self.class_num].

        self.softmax_bias = self.create_parameter(
            attr=fluid.ParamAttr(),
            shape=[self.class_num],
            dtype="float32",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))
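    # A minimal sketch (not part of the original snippet) of the classification step
    # described in the comments above; `sentence_vec` is assumed to be the
    # [batch_size, hidden_size] representation produced by the LSTM.
    def _classify_sketch(self, sentence_vec):
        projection = fluid.layers.matmul(sentence_vec, self.softmax_weight)
        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
        return fluid.layers.softmax(projection)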
Example n. 12
    def single_section(self, random_dump):
        program = fluid.Program()
        with fluid.program_guard(program):
            x = fluid.layers.data(name='x',
                                  shape=[1],
                                  dtype='int64',
                                  lod_level=0)
            y = fluid.layers.data(name='y',
                                  shape=[1],
                                  dtype='int64',
                                  lod_level=0)
            emb_x = layers.embedding(input=x,
                                     param_attr=fluid.ParamAttr(name="embx"),
                                     size=[10, 2],
                                     is_sparse=False)
            emb_y = layers.embedding(input=y,
                                     param_attr=fluid.ParamAttr(
                                         name="emby", learning_rate=0.9),
                                     size=[10, 2],
                                     is_sparse=False)

            concat = layers.concat([emb_x, emb_y], axis=1)

            fc = layers.fc(input=concat,
                           name="fc",
                           size=1,
                           num_flatten_dims=1,
                           bias_attr=False)
            loss = layers.reduce_mean(fc)

            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
            optimizer = fluid.optimizer.PipelineOptimizer(
                optimizer,
                cut_list=[],
                #place_list=[fluid.CPUPlace()],
                place_list=[fluid.CUDAPlace(0)],
                concurrency_list=[1],
                queue_size=1,
                sync_steps=-1)
            optimizer.minimize(loss)

            program._pipeline_opt["dump_fields"] = [
                "fc.tmp_0", "fc.tmp_0@GRAD"
            ]
            program._pipeline_opt["dump_fields_path"] = "./dump_log/"
            program._pipeline_opt["dump_param"] = ["embx"]
            program._pipeline_opt["enable_random_dump"] = random_dump
            program._pipeline_opt["dump_interval"] = 10
            program._pipeline_opt["random_with_lineid"] = False
            #print(program._pipeline_opt)
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            #prepare data
            batch_size = 100

            def binary_print(slot, fout):
                num = np.int16(len(slot) + 1)
                num.tofile(fout)
                a = np.int64(batch_size)
                a.tofile(fout)
                slot.tofile(fout)

            #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
            #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
            batch1 = np.ones(
                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
            batch2 = np.ones(
                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
            data = [batch1, batch2]
            filelist = []
            for i in range(2):
                filelist.append("test_pipeline_input_" + str(i))
            for f in filelist:
                with open(f, "wb") as fout:
                    for batch_data in data:
                        for ins in batch_data:
                            for slot in ins:
                                binary_print(slot, fout)

            dataset = fluid.DatasetFactory().create_dataset(
                "FileInstantDataset")
            dataset.set_use_var([x, y])
            dataset.set_batch_size(batch_size)
            dataset.set_filelist(filelist)

            for epoch in range(1):
                exe.train_from_dataset(fluid.default_main_program(),
                                       dataset,
                                       thread=1,
                                       debug=True,
                                       fetch_list=[],
                                       fetch_info=[],
                                       print_period=1)

            for f in filelist:
                os.remove(f)
            if os.path.isdir("dump_log"):
                shutil.rmtree("dump_log")
Example n. 13
def create_vcr_model(pyreader_name,
                     ernie_config,
                     task_group,
                     is_prediction=False):
    """
        create model arc for vcr tasks
    """
    shapes = [
        [-1, args.max_seq_len, 1],  #src_id 
        [-1, args.max_seq_len, 1],  #pos_id
        [-1, args.max_seq_len, 1],  #sent_id
        [-1, args.max_seq_len, 1],  #task_id
        [-1, args.max_seq_len, 1],  #input_mask
        [-1, args.max_img_len, args.feature_size],  #image_embedding
        [-1, args.max_img_len, 5],  #image_loc
        [-1, args.max_img_len, 1],  #image_mask
        [-1, 1],  #labels
        [-1, 1],  #qids
        [],  #task_index
        [-1, 1],  #binary_labels
    ]
    dtypes = [
        'int64', 'int64', 'int64', 'int64', 'float32', 'float32', 'float32',
        'float32', 'int64', 'int64', 'int64', 'float32'
    ]
    lod_levels = [0] * len(dtypes)

    for _ in task_group:
        shapes.append([])
        dtypes.append('float')
        lod_levels.append(0)

    pyreader = fluid.layers.py_reader(capacity=30,
                                      shapes=shapes,
                                      dtypes=dtypes,
                                      lod_levels=lod_levels,
                                      name=pyreader_name,
                                      use_double_buffer=False)

    inputs = fluid.layers.read_file(pyreader)
    src_ids, pos_ids, sent_ids, task_ids, input_mask, image_embeddings, \
         image_loc, image_mask, labels, q_ids, task_index, binary_labels = inputs[: 12]

    ernie_vil = ErnieVilModel(src_ids=src_ids,
                              position_ids=pos_ids,
                              sentence_ids=sent_ids,
                              task_ids=task_ids,
                              input_mask=input_mask,
                              image_embeddings=image_embeddings,
                              image_loc=image_loc,
                              input_image_mask=image_mask,
                              config=ernie_config)

    h_cls, h_img = ernie_vil.get_pooled_output()
    task_conf = task_group[0]
    fusion_method = task_conf["fusion_method"]
    fusion_fea = ernie_vil.get_match_score(text=h_cls, image=h_img,         \
                                           dropout_rate=task_conf["dropout_rate"],
                                           mode=fusion_method)

    if is_prediction:
        num_choice = int(task_conf['num_choice'])
        task_name = task_conf.get('task_prefix', 'vcr')
        score = fluid.layers.fc(
            fusion_fea,
            1,
            param_attr=fluid.ParamAttr(
                name=task_name + "_fc.w_0",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=task_name + "_fc.b_0")
        score = fluid.layers.reshape(score, shape=[-1, num_choice])
        _loss, _softmax = fluid.layers.softmax_with_cross_entropy(
            logits=score, label=labels, return_softmax=True)
        _acc = fluid.layers.accuracy(input=_softmax, label=labels)
        pred = fluid.layers.argmax(score, axis=1)
        mean_loss = fluid.layers.mean(_loss)
        task_vars = [mean_loss, _acc, pred, q_ids, labels, score]  #_softmax
        for var in task_vars:
            var.persistable = True
        return pyreader, task_vars
    else:
        start_ind = 12
        mean_loss = fluid.layers.zeros(shape=[1], dtype='float32')
        mean_acc = fluid.layers.zeros(shape=[1], dtype='float32')
        for task_conf in task_group:
            task_weight = inputs[start_ind]
            start_ind += 1
            num_choice = int(task_conf['num_choice'])
            task_name = task_conf.get('task_prefix', 'vcr')
            score = fluid.layers.fc(
                fusion_fea,
                1,
                param_attr=fluid.ParamAttr(
                    name=task_name + "_fc.w_0",
                    initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
                bias_attr=task_name + "_fc.b_0")

            _loss = fluid.layers.sigmoid_cross_entropy_with_logits(
                score, binary_labels, name="cross_entropy_loss")
            tmp_score = fluid.layers.reshape(score, shape=[-1, num_choice])
            _softmax = fluid.layers.softmax(tmp_score)
            _acc = fluid.layers.accuracy(input=_softmax, label=labels)
            _mean_loss = fluid.layers.mean(_loss)
            mean_loss += _mean_loss * task_weight
            mean_acc += _acc * task_weight
        # Added score & labels for roc_auc
        task_vars = [
            fluid.layers.reduce_mean(mean_loss), mean_acc, score, binary_labels
        ]
        for var in task_vars:
            var.persistable = True

        return pyreader, task_vars
Example n. 14
def create_model(args, bert_config, num_labels, is_prediction=False):
    input_fields = {
        'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'],
        'shapes': [[None, None], [None, None], [None, None], [None, None, 1],
                   [None, 1]],
        'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'],
        'lod_levels': [0, 0, 0, 0, 0],
    }

    inputs = [
        fluid.data(
            name=input_fields['names'][i],
            shape=input_fields['shapes'][i],
            dtype=input_fields['dtypes'][i],
            lod_level=input_fields['lod_levels'][i])
        for i in range(len(input_fields['names']))
    ]
    (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs

    data_loader = fluid.io.DataLoader.from_generator(
        feed_list=inputs, capacity=50, iterable=False)

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    cls_feats = bert.get_pooled_output()
    cls_feats = fluid.layers.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")
    logits = fluid.layers.fc(
        input=cls_feats,
        num_flatten_dims=2,
        size=num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    if is_prediction:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
        ]
        return data_loader, probs, feed_targets_name

    logits = fluid.layers.reshape(logits, [-1, num_labels], inplace=True)
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    return data_loader, loss, probs, accuracy, num_seqs
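# A minimal training-setup sketch for create_model() above (illustrative;
# `train_generator` is an assumed batch generator and `args` is assumed to carry
# a learning_rate field, neither of which is defined in this snippet):
def _train_setup_sketch(args, bert_config, train_generator):
    data_loader, loss, probs, accuracy, num_seqs = create_model(
        args, bert_config, num_labels=2)
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimizer.minimize(loss)
    data_loader.set_batch_generator(train_generator, places=fluid.CPUPlace())
    return data_loader, loss, accuracy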
Example n. 15
def create_model(args, pyreader_name, ernie_config, is_prediction=False):
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1],
                [-1, 1]],
        dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
        lod_levels=[0, 0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)

    (src_ids, sent_ids, pos_ids, input_mask, labels,
     qids) = fluid.layers.read_file(pyreader)

    ernie = ErnieModel(src_ids=src_ids,
                       position_ids=pos_ids,
                       sentence_ids=sent_ids,
                       input_mask=input_mask,
                       config=ernie_config,
                       use_fp16=args.use_fp16)

    cls_feats = ernie.get_pooled_output()
    cls_feats = fluid.layers.dropout(x=cls_feats,
                                     dropout_prob=0.1,
                                     dropout_implementation="upscale_in_train")
    logits = fluid.layers.fc(
        input=cls_feats,
        size=args.num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(name="cls_out_b",
                                  initializer=fluid.initializer.Constant(0.)))

    if is_prediction:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
        ]
        return pyreader, probs, feed_targets_name

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    if args.use_fp16 and args.loss_scaling > 1.0:
        loss *= args.loss_scaling

    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    graph_vars = {
        "loss": loss,
        "probs": probs,
        "accuracy": accuracy,
        "labels": labels,
        "num_seqs": num_seqs,
        "qids": qids
    }

    for k, v in graph_vars.items():
        v.persistable = True

    return pyreader, graph_vars
Example n. 16
def create_model(bert_config, is_training=False):
    if is_training:
        input_fields = {
            'names': [
                'src_ids', 'pos_ids', 'sent_ids', 'input_mask',
                'start_positions', 'end_positions'
            ],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1], [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0, 0, 0],
        }
    else:
        input_fields = {
            'names':
            ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'],
            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                       [-1, 1]],
            'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'],
            'lod_levels': [0, 0, 0, 0, 0],
        }

    inputs = [
        fluid.layers.data(name=input_fields['names'][i],
                          shape=input_fields['shapes'][i],
                          dtype=input_fields['dtypes'][i],
                          lod_level=input_fields['lod_levels'][i])
        for i in range(len(input_fields['names']))
    ]

    pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False)

    if is_training:
        (src_ids, pos_ids, sent_ids, input_mask, start_positions,
         end_positions) = inputs
    else:
        (src_ids, pos_ids, sent_ids, input_mask, unique_id) = inputs

    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=bert_config,
                     use_fp16=args.use_fp16)

    enc_out = bert.get_sequence_output()

    logits = fluid.layers.fc(
        input=enc_out,
        size=2,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_squad_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(name="cls_squad_out_b",
                                  initializer=fluid.initializer.Constant(0.)))

    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    batch_ones = fluid.layers.fill_constant_batch_size_like(input=start_logits,
                                                            dtype='int64',
                                                            shape=[1],
                                                            value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if is_training:

        def compute_loss(logits, positions):
            loss = fluid.layers.softmax_with_cross_entropy(logits=logits,
                                                           label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0
        if args.use_fp16 and args.loss_scaling > 1.0:
            total_loss = total_loss * args.loss_scaling

        return pyreader, total_loss, num_seqs
    else:
        return pyreader, unique_id, start_logits, end_logits, num_seqs
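The transpose/unstack step above turns the per-token 2-way fc output into separate start and end logits. The small numpy sketch below, with made-up shapes, mirrors that reshuffle outside the graph.

# Hedged numpy illustration of the start/end logit split used above:
# logits has shape [batch, seq_len, 2]; transposing to [2, batch, seq_len]
# and taking the two slices gives one [batch, seq_len] matrix per position type.
import numpy as np

batch, seq_len = 2, 5                                   # toy sizes, not from the example
logits = np.random.randn(batch, seq_len, 2).astype('float32')

transposed = np.transpose(logits, (2, 0, 1))            # [2, batch, seq_len]
start_logits, end_logits = transposed[0], transposed[1]

assert start_logits.shape == (batch, seq_len)
assert np.allclose(start_logits, logits[:, :, 0])       # channel 0 holds the start scores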
Esempio n. 17
def linear(inputs, para_name, args):
    # note: `size` and `layers` are assumed to be defined in the enclosing scope
    return layers.fc(input=inputs,
                     size=size,
                     param_attr=fluid.ParamAttr(name=para_name + '_w'),
                     bias_attr=fluid.ParamAttr(name=para_name + '_b'))
Esempio n. 18
    def __init__(self, args, pretrained_embed=None):
        super(Model, self).__init__()
        self.args = args
        # the embedding layer
        self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed))

        if args.pretrained_embed_shape is not None:
            if pretrained_embed is not None:
                pre_param_attrs = fluid.ParamAttr(
                    name="pretrained_emb",
                    initializer=initializer.NumpyArrayInitializer(
                        pretrained_embed),
                    trainable=True)
                self.pretrained = dygraph.Embedding(
                    size=args.pretrained_embed_shape,
                    param_attr=pre_param_attrs)
                self.word_embed.weight = layers.create_parameter(
                    shape=(self.args.n_words, self.args.n_embed),
                    dtype='float32',
                    default_initializer=initializer.Constant(value=0.0))
            else:
                self.pretrained = dygraph.Embedding(
                    size=args.pretrained_embed_shape)
        # Initialize feat feature, feat can be char or pos
        if args.feat == 'char':
            self.feat_embed = CharLSTM(n_chars=args.n_feats,
                                       n_embed=args.n_char_embed,
                                       n_out=args.n_feat_embed,
                                       pad_index=args.feat_pad_index)
        else:
            self.feat_embed = dygraph.Embedding(size=(args.n_feats,
                                                      args.n_feat_embed))
        self.embed_dropout = IndependentDropout(p=args.embed_dropout)

        # lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed + args.n_feat_embed,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

        # mlp layer
        self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_rel,
                             dropout=args.mlp_dropout)
        self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2,
                             n_out=args.n_mlp_rel,
                             dropout=args.mlp_dropout)

        # biaffine layers
        self.arc_attn = Biaffine(n_in=args.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                                 n_out=args.n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.pad_index = args.pad_index
        self.unk_index = args.unk_index
Esempio n. 19
def gin(gw,
        feature,
        hidden_size,
        activation,
        name,
        init_eps=0.0,
        train_eps=False):
    """Implementation of Graph Isomorphism Network (GIN) layer.

    This is an implementation of the paper How Powerful are Graph Neural Networks?
    (https://arxiv.org/pdf/1810.00826.pdf).

    In their implementation, all MLPs have 2 layers. Batch normalization is applied
    on every hidden layer.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        name: GIN layer names.

        hidden_size: The hidden size for gin.

        activation: The activation for the output.

        init_eps: float, optional
            Initial :math:`\epsilon` value, default is 0.

        train_eps: bool, optional
            if True, :math:`\epsilon` will be a learnable parameter.

    Return:
        A tensor with shape (num_nodes, hidden_size).
    """
    def send_src_copy(src_feat, dst_feat, edge_feat):
        return src_feat["h"]

    epsilon = fluid.layers.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=fluid.ParamAttr(name="%s_eps" % name),
        default_initializer=fluid.initializer.ConstantInitializer(
            value=init_eps))

    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)

    output = fluid.layers.fc(output,
                             size=hidden_size,
                             act=None,
                             param_attr=fluid.ParamAttr(name="%s_w_0" % name),
                             bias_attr=fluid.ParamAttr(name="%s_b_0" % name))

    output = fluid.layers.layer_norm(
        output,
        begin_norm_axis=1,
        param_attr=fluid.ParamAttr(
            name="norm_scale_%s" % (name),
            initializer=fluid.initializer.Constant(1.0)),
        bias_attr=fluid.ParamAttr(name="norm_bias_%s" % (name),
                                  initializer=fluid.initializer.Constant(0.0)),
    )

    if activation is not None:
        output = getattr(fluid.layers, activation)(output)

    output = fluid.layers.fc(output,
                             size=hidden_size,
                             act=activation,
                             param_attr=fluid.ParamAttr(name="%s_w_1" % name),
                             bias_attr=fluid.ParamAttr(name="%s_b_1" % name))

    return output
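As the docstring notes, a GIN layer computes MLP((1 + eps) * h_v + sum of neighbour features). The numpy sketch below, on a made-up 3-node graph, reproduces just the aggregation that gw.send/gw.recv perform before the two fc layers.

# Hedged numpy sketch of the GIN aggregation (1 + eps) * h + sum_of_neighbors
# on a toy 3-node graph; the fc / layer_norm stack of the layer is omitted.
import numpy as np

features = np.array([[1., 0.],
                     [0., 1.],
                     [1., 1.]], dtype='float32')        # (num_nodes, feature_size)
edges = [(0, 1), (1, 0), (1, 2), (2, 1)]                # (src, dst) pairs
init_eps = 0.0

aggregated = np.zeros_like(features)
for src, dst in edges:                  # "send" copies the source feature,
    aggregated[dst] += features[src]    # "recv" sums the messages per destination

output = aggregated + (1.0 + init_eps) * features       # matches recv(...) + feature * (eps + 1)
print(output)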
Esempio n. 20
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    computing the softmax activation, masking the selected positions so that
    they are not considered in the attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: queries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(x=x,
                                  shape=[0, 0, n_head, hidden_size // n_head],
                                  inplace=True)

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product, use_cudnn=True)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
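A compact numpy rendering of the scaled dot-product core above, with toy single-head shapes; it mirrors the scale by d_key**-0.5, the optional additive bias, the softmax, and the weighted sum, without the fc projections or dropout.

# Hedged numpy sketch of scaled dot-product attention as used above
# (single head, no projections, no dropout); shapes are illustrative only.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, q_len, k_len, d_key = 1, 3, 4, 8
q = np.random.randn(batch, q_len, d_key).astype('float32')
k = np.random.randn(batch, k_len, d_key).astype('float32')
v = np.random.randn(batch, k_len, d_key).astype('float32')
attn_bias = np.zeros((batch, q_len, k_len), dtype='float32')  # large negative values would mask positions

product = (q * d_key ** -0.5) @ np.transpose(k, (0, 2, 1)) + attn_bias
weights = softmax(product)                  # (batch, q_len, k_len)
out = weights @ v                           # (batch, q_len, d_key)
assert out.shape == (batch, q_len, d_key)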
Esempio n. 21
def gat(gw,
        feature,
        hidden_size,
        activation,
        name,
        num_heads=8,
        feat_drop=0.6,
        attn_drop=0.6,
        is_test=False):
    """Implementation of graph attention networks (GAT)

    This is an implementation of the paper GRAPH ATTENTION NETWORKS
    (https://arxiv.org/abs/1710.10903).

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        hidden_size: The hidden size for gat.

        activation: The activation for the output.

        name: Gat layer names.

        num_heads: The head number in gat.

        feat_drop: Dropout rate for feature.

        attn_drop: Dropout rate for attention.

        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads)
    """
    def send_attention(src_feat, dst_feat, edge_feat):
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = fluid.layers.leaky_relu(output,
                                         alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"]}

    def reduce_attention(msg):
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = fluid.layers.reshape(h, [-1, num_heads, hidden_size])
        alpha = fluid.layers.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = fluid.layers.dropout(
                alpha,
                dropout_prob=attn_drop,
                is_test=is_test,
                dropout_implementation="upscale_in_train")
        h = h * alpha
        h = fluid.layers.reshape(h, [-1, num_heads * hidden_size])
        h = fluid.layers.lod_reset(h, old_h)
        return fluid.layers.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = fluid.layers.dropout(
            feature,
            dropout_prob=feat_drop,
            is_test=is_test,
            dropout_implementation='upscale_in_train')

    ft = fluid.layers.fc(feature,
                         hidden_size * num_heads,
                         bias_attr=False,
                         param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = fluid.layers.create_parameter(shape=[num_heads, hidden_size],
                                           dtype='float32',
                                           name=name + '_gat_l_A')
    right_a = fluid.layers.create_parameter(shape=[num_heads, hidden_size],
                                            dtype='float32',
                                            name=name + '_gat_r_A')
    reshape_ft = fluid.layers.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = fluid.layers.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = fluid.layers.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(send_attention,
                  nfeat_list=[("h", ft), ("left_a", left_a_value),
                              ("right_a", right_a_value)])
    output = gw.recv(msg, reduce_attention)
    bias = fluid.layers.create_parameter(shape=[hidden_size * num_heads],
                                         dtype='float32',
                                         is_bias=True,
                                         name=name + '_bias')
    bias.stop_gradient = True
    output = fluid.layers.elementwise_add(output, bias, act=activation)
    return output
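The attention score in the layer above is leaky_relu(a_l . W h_src + a_r . W h_dst), normalized with a softmax over each destination node's incoming edges. The numpy sketch below walks through that computation for one destination node with two toy neighbours and a single head.

# Hedged numpy sketch of the GAT attention coefficients computed above,
# for a single head and one destination node with two incoming edges.
import numpy as np

hidden_size = 4
src_ft = np.random.randn(2, hidden_size).astype('float32')   # W*h of the two source nodes
dst_ft = np.random.randn(hidden_size).astype('float32')      # W*h of the destination node
left_a = np.random.randn(hidden_size).astype('float32')      # per-head attention vectors
right_a = np.random.randn(hidden_size).astype('float32')

scores = src_ft @ left_a + dst_ft @ right_a                  # "left_a" + "right_a" terms per edge
scores = np.where(scores > 0, scores, 0.2 * scores)          # leaky_relu with alpha=0.2
alpha = np.exp(scores) / np.exp(scores).sum()                # softmax over the incoming edges

dst_out = (alpha[:, None] * src_ft).sum(axis=0)              # attention-weighted sum of source features
print(alpha, dst_out)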
Esempio n. 22
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# Program to generate the parameter
# The original 'weight' is filled with value 1 and has shape (4, 8)
ones = np.ones((4, 8)).astype('float32')
main_prog = fluid.Program()
start_prog = fluid.Program()
with fluid.program_guard(main_prog, start_prog):
    input = fluid.data('input', shape=[-1, 4], dtype='float32')
    output = layers.fc(
        input,
        8,
        param_attr=fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.NumpyArrayInitializer(ones)))

exe = fluid.Executor(fluid.CPUPlace())
# initialize all parameters
exe.run(start_prog)

# simulate saving model
fluid.io.save_persistables(exe, dirname="old", main_program=main_prog)

#############################################################################
# The following section illustrates how the user can adjust the parameter   #
#############################################################################

# The target 'weight' is the concatenation of the original 'weight' and a
# supplementary weight of shape (4, 8) filled with zeros
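The adjustment step itself is not shown in the snippet; the sketch below is one hedged way it could look, assuming the zero supplement is appended along the output dimension (so the fc widens from 8 to 16 units) and that the new program reuses the parameter name 'weight'. It relies on `np`, `layers`, `fluid` and `exe` defined earlier in this example.

# Hedged sketch of the missing adjustment step, under the assumptions above.
# 1. read the current (4, 8) value of 'weight' out of the global scope
old_value = np.array(fluid.global_scope().find_var('weight').get_tensor())

# 2. build a new program whose 'weight' has the enlarged target shape (4, 16)
new_main_prog = fluid.Program()
new_start_prog = fluid.Program()
with fluid.program_guard(new_main_prog, new_start_prog):
    new_input = fluid.data('input', shape=[-1, 4], dtype='float32')
    new_output = layers.fc(
        new_input,
        16,  # assumed new width: the original 8 columns plus the zero supplement
        param_attr=fluid.ParamAttr(name='weight'))
exe.run(new_start_prog)

# 3. overwrite the freshly initialized parameter with [old value | zeros]
new_value = np.concatenate([old_value, np.zeros((4, 8), dtype='float32')], axis=1)
fluid.global_scope().find_var('weight').get_tensor().set(new_value, fluid.CPUPlace())

# 4. save the adjusted model
fluid.io.save_persistables(exe, dirname="new", main_program=new_main_prog)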
Esempio n. 23
def compute_span_end_logits(input_tensor,
                            span_mask,
                            flat_start_positions,
                            args,
                            name=""):
    input_shape = list(input_tensor.shape)
    span_mask_shape = list(span_mask.shape)

    batch_size = args.start_top_k * args.batch_size
    seq_length = span_mask_shape[1]
    width = input_shape[-1]

    start_vectors = gather_indexes(input_tensor, flat_start_positions)
    start_vectors = fluid.layers.reshape(x=start_vectors, shape=[-1, 1, width])
    start_vectors = fluid.layers.expand(x=start_vectors,
                                        expand_times=[1, seq_length, 1])
    concat_input = fluid.layers.concat(input=[start_vectors, input_tensor],
                                       axis=2)

    weights = fluid.ParamAttr(name=name + "conditional_fc_weights",
                              initializer=create_initializer(0.02))

    bias = fluid.ParamAttr(name=name + "conditional_fc_bias")

    concat_input_reshape = fluid.layers.reshape(x=concat_input,
                                                shape=[-1, 2 * width])

    conditional_tensor = fluid.layers.fc(input=concat_input_reshape,
                                         size=width,
                                         act="gelu",
                                         name=name + "span_end_conditional",
                                         param_attr=weights,
                                         bias_attr=bias)

    conditional_tensor_reshape = fluid.layers.reshape(
        x=conditional_tensor, shape=[-1, seq_length, width])

    conditional_tensor = fluid.layers.layer_norm(
        input=conditional_tensor_reshape,
        begin_norm_axis=2,
        param_attr=fluid.ParamAttr(name=name + "conditional_layernorm_gamma",
                                   initializer=create_initializer(0.02)),
        bias_attr=fluid.ParamAttr(name=name + "conditional_layernorm_beta"))

    end_weights = fluid.layers.create_parameter(
        name=name + "span_end_weights",
        shape=[width],
        dtype='float32',
        default_initializer=create_initializer(0.02))

    template_var = fluid.layers.fill_constant_batch_size_like(
        conditional_tensor,
        shape=list(conditional_tensor.shape),
        dtype='float32',
        value=0)

    end_weights = fluid.layers.reshape(x=end_weights, shape=[1, width])
    end_weights = fluid.layers.expand(x=end_weights,
                                      expand_times=[seq_length, 1])
    end_weights = fluid.layers.elementwise_add(template_var,
                                               end_weights,
                                               axis=-1)

    raw_scores = fluid.layers.reduce_sum(conditional_tensor * end_weights,
                                         dim=-1)
    raw_scores += (1.0 -
                   fluid.layers.cast(x=span_mask, dtype='float32')) * -10000.0

    logits = fluid.layers.reshape(x=raw_scores, shape=[-1, seq_length])

    return logits
Esempio n. 24
    def get_pretraining_output(self, mask_label, mask_pos, labels):
        """Get the loss & accuracy for pretraining"""

        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        # extract the first token feature in each sentence
        next_sent_feat = self.get_pooled_output()
        reshaped_emb_out = fluid.layers.reshape(x=self._enc_out,
                                                shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
            input=mask_feat,
            size=self._emb_size,
            act=self._hidden_act,
            param_attr=fluid.ParamAttr(name=self.model_name +
                                       'mask_lm_trans_fc.w_0',
                                       initializer=self._param_initializer),
            bias_attr=fluid.ParamAttr(name=self.model_name +
                                      'mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(mask_trans_feat,
                                            'n',
                                            name=self.model_name +
                                            'mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(
            name=self.model_name + "mask_lm_out_fc.b_0",
            initializer=fluid.initializer.Constant(value=0.0))
        if self._weight_sharing:
            fc_out = fluid.layers.matmul(
                x=mask_trans_feat,
                y=fluid.default_main_program().global_block().var(
                    self._word_emb_name),
                transpose_y=True)
            fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
                                                    dtype=self._dtype,
                                                    attr=mask_lm_out_bias_attr,
                                                    is_bias=True)

        else:
            fc_out = fluid.layers.fc(
                input=mask_trans_feat,
                size=self._voc_size,
                param_attr=fluid.ParamAttr(
                    name=self.model_name + "mask_lm_out_fc.w_0",
                    initializer=self._param_initializer),
                bias_attr=mask_lm_out_bias_attr)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
            logits=fc_out, label=mask_label)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

        next_sent_fc_out = fluid.layers.fc(
            input=next_sent_feat,
            size=2,
            param_attr=fluid.ParamAttr(name=self.model_name +
                                       "next_sent_fc.w_0",
                                       initializer=self._param_initializer),
            bias_attr=self.model_name + "next_sent_fc.b_0")

        next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
            logits=next_sent_fc_out, label=labels, return_softmax=True)

        next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax,
                                              label=labels)

        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

        loss = mean_next_sent_loss + mean_mask_lm_loss
        return next_sent_acc, mean_mask_lm_loss, loss
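The gather at the start of the method selects masked-token features from the flattened encoder output. The toy numpy sketch below shows why mask_pos must index into the [batch*seq_len, emb_size] view rather than use per-sentence positions.

# Hedged numpy illustration of the masked-token gather above: enc_out is
# flattened to [batch*seq_len, emb_size], so mask_pos holds flat indices
# (batch_idx * seq_len + token_idx), not per-sentence positions.
import numpy as np

batch, seq_len, emb_size = 2, 4, 3                     # toy sizes
enc_out = np.arange(batch * seq_len * emb_size, dtype='float32').reshape(
    batch, seq_len, emb_size)

reshaped = enc_out.reshape(-1, emb_size)               # [batch*seq_len, emb_size]
mask_pos = np.array([1, 4 + 2])                        # token 1 of sentence 0, token 2 of sentence 1
mask_feat = reshaped[mask_pos]                         # same role as fluid.layers.gather

assert np.allclose(mask_feat[0], enc_out[0, 1])
assert np.allclose(mask_feat[1], enc_out[1, 2])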
Esempio n. 25
def train(use_cuda, save_dirname=None, is_local=True):
    # define network topology
    word = fluid.layers.data(
        name='word_data', shape=[1], dtype='int64', lod_level=1)
    predicate = fluid.layers.data(
        name='verb_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n2 = fluid.layers.data(
        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n1 = fluid.layers.data(
        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_0 = fluid.layers.data(
        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p1 = fluid.layers.data(
        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p2 = fluid.layers.data(
        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
    mark = fluid.layers.data(
        name='mark_data', shape=[1], dtype='int64', lod_level=1)
    feature_out = db_lstm(**locals())
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(crf_cost)

    # TODO(qiao)
    # check other optimizers and check why out will be NAN
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=100000,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    # TODO(qiao)
    # add dependency track and move this config before optimizer
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(
        feed_list=[
            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
        ],
        place=place)
    exe = fluid.Executor(place)

    def train_loop(main_program):
        exe.run(fluid.default_startup_program())
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(
            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
            place)

        start_time = time.time()
        batch_id = 0
        for pass_id in range(PASS_NUM):
            for data in train_data():
                cost = exe.run(main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                cost = cost[0]

                if batch_id % 10 == 0:
                    print("avg_cost:" + str(cost))
                    if batch_id != 0:
                        print("seconds per batch: " + str((time.time(
                        ) - start_time) / batch_id))
                    # Set the threshold low to speed up the CI test
                    if float(cost) < 80.0:
                        if save_dirname is not None:
                            # TODO(liuyiqun): Change the target to crf_decode
                            fluid.io.save_inference_model(save_dirname, [
                                'word_data', 'verb_data', 'ctx_n2_data',
                                'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
                                'ctx_p2_data', 'mark_data'
                            ], [feature_out], exe)
                        return

                batch_id = batch_id + 1

        raise RuntimeError(
            "This model should have called save_inference_model and returned, "
            "but it never reached that point; please check!")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
Esempio n. 26
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.initializer import NumpyArrayInitializer

img = Image.open(r'D:\softwaresavfile\Github\machine_learning\计算视觉基础\test.jpg')
with fluid.dygraph.guard():
    # set up the convolution kernel weights
    w = np.array([[-1,-1,-1], [-1,8,-1], [-1,-1,-1]], dtype='float32')/8
    w = w.reshape([1, 1, 3, 3])
    # the input has 3 channels, so expand the kernel shape from [1, 1, 3, 3] to [1, 3, 3, 3]
    w = np.repeat(w, 3, axis=1)
    # create the convolution operator with 1 output channel and a 3x3 kernel,
    # using the values prepared above to initialize the kernel weights
    conv = Conv2D(num_channels=3, num_filters=1, filter_size=[3, 3],
            param_attr=fluid.ParamAttr(
              initializer=NumpyArrayInitializer(value=w)))

    # convert the loaded image into a float32 numpy.ndarray
    x = np.array(img).astype('float32')
    # the image is read as an ndarray of shape [H, W, 3];
    # move the channel dimension to the front
    x = np.transpose(x, (2,0,1))

    print("image height and width =", img.height, img.width)
    # reshape the data into [N, C, H, W] format
    x = x.reshape(1, 3, img.height, img.width)
    x = fluid.dygraph.to_variable(x)
    y = conv(x)
    out = y.numpy()

plt.figure(figsize=(20, 10))
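The snippet stops right after opening the figure; a hedged completion, assuming the intent was to compare the input photo with the edge map produced by the Laplacian-style kernel, might look like this.

# Hedged completion of the plotting step (not part of the original snippet):
# show the original photo next to the single-channel convolution output.
plt.subplot(1, 2, 1)
plt.title('input image')
plt.imshow(img)
plt.subplot(1, 2, 2)
plt.title('convolution output (edges)')
plt.imshow(out.squeeze(), cmap='gray')
plt.show()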
Esempio n. 27
def train(conf_dict, data_reader, use_cuda=False):
    """
    Training of so labeling model
    """
    # input data layer
    word = fluid.layers.data(
        name='word_data', shape=[1], dtype='int64', lod_level=1)
    postag = fluid.layers.data(
        name='token_pos', shape=[1], dtype='int64', lod_level=1)
    p_word = fluid.layers.data(
        name='p_word', shape=[1], dtype='int64', lod_level=1)
    # label
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    # embedding + lstm
    feature_out = spo_model.db_lstm(data_reader, word, \
            postag, p_word, conf_dict)

    # loss function
    # crf layer
    mix_hidden_lr = float(conf_dict['mix_hidden_lr'])
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(crf_cost)

    # optimizer
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=2e-3)

    optimizer.minimize(avg_cost)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    train_batch_reader = paddle.batch(
        paddle.reader.shuffle(data_reader.get_train_reader(), buf_size=8192),
        batch_size=conf_dict['batch_size'])

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    feeder = fluid.DataFeeder(feed_list=[word, postag, p_word, target], place=place)
    exe = fluid.Executor(place)

    save_dirname = conf_dict['spo_model_save_dir']

    def train_loop(main_program, trainer_id=0):
        """start train loop"""
        exe.run(fluid.default_startup_program())

        start_time = time.time()
        batch_id = 0
        for pass_id in six.moves.xrange(conf_dict['pass_num']):
            pass_start_time = time.time()
            cost_sum, cost_counter = 0, 0
            for data in train_batch_reader():
                cost = exe.run(main_program, feed=feeder.feed(data), fetch_list=[avg_cost])
                cost = cost[0]
                cost_sum += cost
                cost_counter += 1
                if batch_id % 10 == 0 and batch_id != 0:
                    sys.stderr.write("batch %d finished, seconds per batch: %02f\n" % (
                        batch_id, (time.time() - start_time) / batch_id))

                # cost expected, training over
                if float(cost) < 1:
                    save_path = os.path.join(save_dirname, 'final')
                    fluid.io.save_inference_model(save_path, ['word_data', 'token_pos', 'p_word'],
                                                  [feature_out], exe, params_filename='params')
                    return
                batch_id = batch_id + 1

            # save the model once each pass ends
            pass_avg_cost = cost_sum / cost_counter if cost_counter > 0 else 0.0
            sys.stderr.write("%d pass end, cost time: %02f, avg_cost: %f\n" % (
                pass_id, time.time() - pass_start_time, pass_avg_cost))
            save_path = os.path.join(save_dirname, 'pass_%04d-%f' %
                                    (pass_id, pass_avg_cost))
            fluid.io.save_inference_model(save_path, ['word_data', 'token_pos', 'p_word'],
                                          [feature_out], exe, params_filename='params')

        else:
            # pass times complete and the training is over
            save_path = os.path.join(save_dirname, 'final')
            fluid.io.save_inference_model(save_path, ['word_data', 'token_pos', 'p_word'],
                                          [feature_out], exe, params_filename='params')
        return

    train_loop(fluid.default_main_program())


def main(conf_dict, use_cuda=False):
    """Train main function"""
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    data_generator = spo_data_reader.DataReader(
        wordemb_dict_path=conf_dict['word_idx_path'],
        postag_dict_path=conf_dict['postag_dict_path'],
        label_dict_path=conf_dict['so_label_dict_path'],
        p_eng_dict_path=conf_dict['label_dict_path'],
        train_data_list_path=conf_dict['spo_train_data_path'],
        test_data_list_path=conf_dict['spo_test_data_path'])
    
    train(conf_dict, data_generator, use_cuda=use_cuda)


if __name__ == '__main__':
    # Load the configuration file
    parser = argparse.ArgumentParser()
    parser.add_argument("--conf_path", type=str,
        help="conf_file_path_for_model. (default: %(default)s)",
        required=True)
    args = parser.parse_args()
    conf_dict = conf_lib.load_conf(args.conf_path)
    use_gpu = True if conf_dict.get('use_gpu', 'False') == 'True' else False
    main(conf_dict, use_cuda=use_gpu)
Esempio n. 28
    def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
            is_sparse=False)
        position_emb_out = fluid.layers.embedding(
            input=position_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                       initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(
            sentence_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._dtype,
            param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                       initializer=self._param_initializer))

        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

        emb_out = pre_process_layer(emb_out,
                                    'nd',
                                    self._prepostprocess_dropout,
                                    name='pre_encoder')

        if self._dtype == core.VarDesc.VarType.FP16:
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        self_attn_mask = fluid.layers.matmul(x=input_mask,
                                             y=input_mask,
                                             transpose_y=True)

        self_attn_mask = fluid.layers.scale(x=self_attn_mask,
                                            scale=10000.0,
                                            bias=-1.0,
                                            bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
                                                   self._n_head,
                                                   axis=1)
        n_head_self_attn_mask.stop_gradient = True

        self._enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=0,
            hidden_act=self._hidden_act,
            preprocess_cmd="",
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
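The scale(x, scale=10000.0, bias=-1.0, bias_after_scale=False) call above turns the pairwise mask product into an additive attention bias: pairs of real tokens get 0, pairs touching a padded position get -10000. A tiny numpy sketch of that arithmetic:

# Hedged numpy sketch of the additive attention bias built above:
# 10000 * (mask_i * mask_j - 1) is 0 for pairs of real tokens and -10000
# whenever either position is padding.
import numpy as np

input_mask = np.array([[1., 1., 1., 0.]], dtype='float32').reshape(1, 4, 1)

pair_mask = input_mask @ np.transpose(input_mask, (0, 2, 1))   # [1, 4, 4]
attn_bias = 10000.0 * (pair_mask - 1.0)                        # scale=10000, bias=-1 applied before scaling

print(attn_bias[0])
# rows/columns touching the padded last token are -10000, the rest are 0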
Esempio n. 29
def create_model(pyreader_name, bert_config, max_wn_concept_length, max_nell_concept_length, wn_concept_embedding_mat, nell_concept_embedding_mat, is_training=False, freeze=False):
    if is_training:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, max_wn_concept_length, 1],
                    [-1, args.max_seq_len, max_nell_concept_length, 1],
                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
            dtypes=[
                'int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
            lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, start_positions,
         end_positions) = fluid.layers.read_file(pyreader)
    else:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, max_wn_concept_length, 1],
                    [-1, args.max_seq_len, max_nell_concept_length, 1],
                    [-1, args.max_seq_len, 1], [-1, 1]],
            dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32', 'int64'],
            lod_levels=[0, 0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)

    '''1st Layer: BERT Layer'''
    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    enc_out = bert.get_sequence_output()
    if freeze:
        enc_out.stop_gradient=True
    logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))

    '''2nd layer: Memory Layer'''
    # get memory embedding
    wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
    wn_concept_dim = wn_concept_embedding_mat.shape[1]
    nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
    nell_concept_dim = nell_concept_embedding_mat.shape[1]    
    wn_memory_embs = fluid.layers.embedding(wn_concept_ids,
                                         size=(wn_concept_vocab_size, wn_concept_dim),
                                         param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
                                                                    do_model_average=False,
                                                                    trainable=False),
                                         dtype='float32')
    nell_memory_embs = fluid.layers.embedding(nell_concept_ids,
                                         size=(nell_concept_vocab_size, nell_concept_dim),
                                         param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
                                                                    do_model_average=False,
                                                                    trainable=False),
                                         dtype='float32')    
    
    # get memory length
    wn_concept_ids_reduced = fluid.layers.equal(wn_concept_ids,
        fluid.layers.fill_constant(shape=[1], value=0, dtype="int64"))  # [batch_size, sent_size, concept_size, 1]
    wn_concept_ids_reduced = fluid.layers.cast(wn_concept_ids_reduced, dtype="float32")  # [batch_size, sent_size, concept_size, 1]
    wn_concept_ids_reduced = fluid.layers.scale(
        fluid.layers.elementwise_sub(
            wn_concept_ids_reduced,
            fluid.layers.fill_constant([1], "float32", 1)
        ),
        scale=-1
    )
    wn_mem_length = fluid.layers.reduce_sum(wn_concept_ids_reduced, dim=2)  # [batch_size, sent_size, 1]    

    nell_concept_ids_reduced = fluid.layers.equal(nell_concept_ids,
        fluid.layers.fill_constant(shape=[1], value=0, dtype="int64"))  # [batch_size, sent_size, concept_size, 1]
    nell_concept_ids_reduced = fluid.layers.cast(nell_concept_ids_reduced, dtype="float32")  # [batch_size, sent_size, concept_size, 1]
    nell_concept_ids_reduced = fluid.layers.scale(
        fluid.layers.elementwise_sub(
            nell_concept_ids_reduced,
            fluid.layers.fill_constant([1], "float32", 1)
        ),
        scale=-1
    )
    nell_mem_length = fluid.layers.reduce_sum(nell_concept_ids_reduced, dim=2)  # [batch_size, sent_size, 1]      

    # select and integrate
    wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length, wn_concept_dim, mem_method='raw', prefix='wn')
    wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs, wn_mem_length, ignore_no_memory_token=True)

    nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length, nell_concept_dim, mem_method='raw', prefix='nell')
    nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs, nell_mem_length, ignore_no_memory_token=True)

    memory_output = fluid.layers.concat([enc_out, wn_memory_output, nell_memory_output], axis=2)

    '''3rd layer: Self-Matching Layer'''
    # calculate input dim for self-matching layer
    memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim    
    logger.info("memory_output_size: {}".format(memory_output_size))

    # do matching
    self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
        memory_output_size, dropout_rate=0.0, 
        cat_mul=True, cat_sub=True, cat_twotime=True,
        cat_twotime_mul=False, cat_twotime_sub=True)  # [bs, sq, concat_hs]
    att_output = self_att_layer.forward(memory_output, input_mask)  # [bs, sq, concat_hs]

    '''4th layer: Output Layer'''
    logits = fluid.layers.fc(
        input=att_output,
        size=2,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_squad_out_w",
            initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=bert_config['initializer_range'])),
        bias_attr=fluid.ParamAttr(
            name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))

    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if is_training:

        def compute_loss(logits, positions):
            loss = fluid.layers.softmax_with_cross_entropy(
                logits=logits, label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0
        if args.use_fp16 and args.loss_scaling > 1.0:
            total_loss = total_loss * args.loss_scaling

        return pyreader, total_loss, num_seqs
    else:
        return pyreader, unique_id, start_logits, end_logits, num_seqs
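The memory-length computation in the example above uses an equal / cast / scale trick: equal(ids, 0) marks padding slots, negating (x - 1) flips that into a real-concept indicator, and the sum over the concept axis counts how many concepts each token actually has. A small numpy rendering:

# Hedged numpy sketch of the memory-length computation used above.
import numpy as np

# toy concept ids for 1 sentence of 2 tokens, up to 3 concepts each, 0 = padding
concept_ids = np.array([[[2, 5, 0],
                         [7, 0, 0]]], dtype='int64')[..., None]   # [1, 2, 3, 1]

is_pad = (concept_ids == 0).astype('float32')      # equal(ids, 0) then cast
is_concept = -1.0 * (is_pad - 1.0)                 # scale(x - 1, scale=-1): 1 for real concepts
mem_length = is_concept.sum(axis=2)                # reduce_sum over the concept axis -> [1, 2, 1]

print(mem_length[..., 0])                          # [[2., 1.]]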
Esempio n. 30
    def dyanmic_gru_op(self, **kwargs):
        role = kwargs['role']
        data = kwargs['data']
        data_share = kwargs['data_share'][role]
        weight = kwargs['weight']
        weight_share = kwargs['weight_share'][role]
        return_results = kwargs['return_results']
        return_results_cheb = kwargs['return_results_cheb']
        expected_result = kwargs['expect_results']
        pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))

        hidden_dim = 1

        data_paddle = fluid.data(name='input_paddle',
                                 shape=[3, 3],
                                 dtype='float32',
                                 lod_level=1)
        ldata_paddle = fluid.create_lod_tensor(data, [[3]], fluid.CPUPlace())
        w_param_attrs = fluid.ParamAttr(
            name='gru_weight',
            learning_rate=0.5,
            initializer=fluid.initializer.NumpyArrayInitializer(weight),
            trainable=True)
        hidden_paddle = fluid.layers.dynamic_gru(input=data_paddle,
                                                 size=hidden_dim,
                                                 param_attr=w_param_attrs,
                                                 gate_activation='sigmoid',
                                                 candidate_activation='relu')

        data_mpc = fluid.data(name='input_mpc',
                              shape=[3, 2, 3],
                              dtype='int64',
                              lod_level=1)
        # move the batch dimension to axis 0
        data_share_trans = np.transpose(data_share, [1, 0, 2])
        ldata_mpc = fluid.create_lod_tensor(data_share_trans, [[3]],
                                            fluid.CPUPlace())
        w_param_attrs1 = fluid.ParamAttr(
            name='mpc_gru_weight',
            learning_rate=0.5,
            initializer=pfl_mpc.initializer.NumpyArrayInitializer(
                weight_share),
            trainable=True)
        w_param_attrs2 = fluid.ParamAttr(
            name='mpc_gru_weight_cheb',
            learning_rate=0.5,
            initializer=pfl_mpc.initializer.NumpyArrayInitializer(
                weight_share),
            trainable=True)
        hidden_mpc = pfl_mpc.layers.dynamic_gru(input=data_mpc,
                                                size=hidden_dim,
                                                param_attr=w_param_attrs1)
        hidden_mpc_cheb = pfl_mpc.layers.dynamic_gru(
            input=data_mpc,
            size=hidden_dim,
            param_attr=w_param_attrs2,
            gate_activation='sigmoid_chebyshev')

        exe = fluid.Executor(place=fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        results = exe.run(
            feed={
                'input_paddle': ldata_paddle,
                'input_mpc': ldata_mpc
            },
            fetch_list=[hidden_paddle, hidden_mpc, hidden_mpc_cheb],
            return_numpy=False)
        return_results.append(np.array(results[1]))
        return_results_cheb.append(np.array(results[2]))
        expected_result.append(np.array(results[0]))