Code example #1
 def test_calc_gradient(self):
     x = layers.create_parameter(dtype="float32", shape=[5, 10])
     y = layers.create_parameter(dtype="float32", shape=[10, 8])
     mul_out = layers.mul(x=x, y=y)
     mean_out = layers.mean(mul_out)
     a = calc_gradient(mean_out, mul_out)
     b = calc_gradient(mean_out, x)
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
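A minimal NumPy sketch (not part of the snippet above; shapes taken from it) of what these gradients evaluate to: mean_out = mean(x @ y), so d(mean_out)/d(mul_out) is the constant 1/(5*8), and d(mean_out)/d(x) follows by the chain rule.

import numpy as np

x = np.random.randn(5, 10).astype("float32")
y = np.random.randn(10, 8).astype("float32")

g_mul_out = np.full((5, 8), 1.0 / (5 * 8), dtype="float32")  # d mean / d mul_out
g_x = g_mul_out @ y.T                                        # chain rule: d mean / d x
assert g_x.shape == x.shape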
Code example #2
    def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train'):
        softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \
                    default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale))
        if mode == 'train':

            #fluid.layers.Print(self.tar_emb)
            #fluid.layers.Print(enc_last_hidden)
            #fluid.layers.Print(enc_last_cell)
            dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \
                    self.hidden_size, num_layers=self.num_layers, \
                    batch_first=self.batch_first, \
                    dropout_prob=self.dropout, \
                    param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \
                    bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ))

            dec_output = layers.matmul(dec_output, softmax_weight)

            return dec_output
        else:
            print("mode not supprt", mode)
Code example #3
 def fully_con_layer(self, x, n, channel, name):
     """Fully connected layer"""
     #  bt_init = fluid.initializer.ConstantInitializer(value=0.01)
     bt_init = fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0)
     bt = fl.create_parameter(
         shape=[n, 1],
         dtype="float32",
         attr=fluid.ParamAttr(name="%s_bt" % name,
                              trainable=True,
                              initializer=bt_init),
     )
     x_conv = fl.conv2d(input=x,
                        num_filters=1,
                        filter_size=[1, 1],
                        stride=[1, 1],
                        padding="SAME",
                        data_format="NHWC",
                        param_attr=fluid.ParamAttr(name="%s_conv2d" % name))
     x_conv = x_conv + bt
     return x_conv
Code example #4
    def _calc_bow_logits(self, enc_out, bow_pos):
        """Get the logits of generation."""
        bow_feat = layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1])
        bow_feat = layers.reshape(x=bow_feat, shape=[-1, self.hidden_size])
        bow_pos = layers.cast(x=bow_pos, dtype="int32")
        bow_feat = layers.gather(input=bow_feat, index=bow_pos)

        bow_trans_feat = layers.fc(
            input=bow_feat,
            size=self.emb_size,
            act=self.hidden_act,
            param_attr=fluid.ParamAttr(name="bow_trans_fc.w_0",
                                       initializer=self.param_initializer),
            bias_attr=fluid.ParamAttr(name="bow_trans_fc.b_0"))

        bow_trans_feat = pre_process_layer(bow_trans_feat,
                                           self.post_cls_cmd,
                                           name="bow_trans")

        if self.weight_sharing:
            fc_out = layers.matmul(
                x=bow_trans_feat,
                y=fluid.default_main_program().global_block().var(
                    self.token_emb_name),
                transpose_y=True)
            if self.cls_bias:
                fc_out += layers.create_parameter(
                    shape=[self.vocab_size],
                    dtype=self.dtype,
                    attr=fluid.ParamAttr(name="bow_out_fc.b_0"),
                    is_bias=True)
        else:
            bow_out_bias_attr = fluid.ParamAttr(
                name="bow_out_fc.b_0") if self.cls_bias else False
            fc_out = layers.fc(input=bow_trans_feat,
                               size=self.vocab_size,
                               param_attr=fluid.ParamAttr(
                                   name="bow_out_fc.w_0",
                                   initializer=self.param_initializer),
                               bias_attr=bow_out_bias_attr)
        return fc_out
Code example #5
    def _calc_logits(self, enc_out, checkpoints=None, seq_pos=None):
        """Get the logits of generation."""
        enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size])
        if seq_pos is not None:
            seq_pos = layers.cast(x=seq_pos, dtype="int32")
            seq_feat = layers.gather(input=enc_out, index=seq_pos)
        else:
            seq_feat = enc_out

        seq_trans_feat = layers.fc(
            input=seq_feat,
            size=self.emb_size,
            act=self.hidden_act,
            param_attr=fluid.ParamAttr(name="mask_lm_trans_fc.w_0", initializer=self.param_initializer),
            bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0"))

        seq_trans_feat = pre_process_layer(seq_trans_feat, self.post_cls_cmd, name="mask_lm_trans")

        if checkpoints is not None:
            checkpoints.append(seq_trans_feat)

        if self.weight_sharing:
            fc_out = layers.matmul(
                x=seq_trans_feat,
                y=fluid.default_main_program().global_block().var(self.token_emb_name),
                transpose_y=True)
            if self.cls_bias:
                fc_out += layers.create_parameter(
                    shape=[self.vocab_size],
                    dtype=self.dtype,
                    attr=fluid.ParamAttr(name="mask_lm_out_fc.b_0"),
                    is_bias=True)
        else:
            seq_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0") if self.cls_bias else False
            fc_out = layers.fc(
                input=seq_trans_feat,
                size=self.vocab_size,
                param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", initializer=self.param_initializer),
                bias_attr=seq_out_bias_attr)
        return fc_out
Code example #6
    def __init__(self, cfg, name=None):
        super(ErnieModelForPretraining, self).__init__(cfg, name=name)
        initializer = F.initializer.TruncatedNormal(
            scale=cfg['initializer_range'])
        d_model = cfg['hidden_size']
        d_vocab = cfg['vocab_size']

        self.pooler_heads = D.LayerList([NSPHead(cfg, name=name)])
        self.mlm = _build_linear(d_model,
                                 d_model,
                                 append_name(name, 'mask_lm_trans_fc'),
                                 initializer,
                                 act=cfg['hidden_act'])
        self.mlm_ln = _build_ln(d_model,
                                name=append_name(name, 'mask_lm_trans'))
        self.mlm_bias = L.create_parameter(
            dtype='float32',
            shape=[d_vocab],
            attr=F.ParamAttr(name=append_name(name, 'mask_lm_out_fc.b_0'),
                             initializer=F.initializer.Constant(value=0.0)),
            is_bias=True,
        )
Code example #7
def gin_layer(gw, node_features, edge_features, train_eps, name):
    def send_func(src_feat, dst_feat, edge_feat):
        """Send"""
        return src_feat["h"] + edge_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=F.ParamAttr(name="%s_eps" % name),
        default_initializer=F.initializer.ConstantInitializer(value=0.0))
    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_func,
                  nfeat_list=[("h", node_features)],
                  efeat_list=[("h", edge_features)])

    node_feat = gw.recv(msg, "sum") + node_features * (epsilon + 1.0)

    #  if apply_func is not None:
    #      node_feat = apply_func(node_feat, name)
    return node_feat
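For reference, a minimal NumPy sketch of the GIN aggregation this layer computes, on a hypothetical toy edge list: each destination node sums src_feat + edge_feat over its incoming edges (the "sum" recv) and adds (1 + epsilon) times its own features.

import numpy as np

node_feat = np.random.randn(4, 8).astype("float32")  # 4 nodes, feature dim 8
edges = [(0, 1), (2, 1), (1, 3)]                     # hypothetical (src, dst) pairs
edge_feat = np.random.randn(len(edges), 8).astype("float32")
epsilon = 0.0                                        # fixed when train_eps is False

out = (1.0 + epsilon) * node_feat
for eid, (src, dst) in enumerate(edges):
    out[dst] += node_feat[src] + edge_feat[eid]      # send_func + "sum" recv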
Code example #8
    def _create_mask_variables(cls, main_program, startup_program, params):
        r"""
        Create sparse mask Tensors according to supported layers in :attr:`main_program`.
        This function is called in the second step of `ASPHelper._minimize`.

        Args:
            main_program (Program): Program with model definition and its parameters.
            startup_program (Program): Program for initializing parameters.
            params (list): Variable parameters.
        """
        asp_info = cls._get_program_asp_info(main_program)
        with program_guard(main_program, startup_program):
            for param in params:
                if ASPHelper._is_supported_layer(main_program, param.name):
                    if param.name not in asp_info.mask_vars:
                        mask_param = layers.create_parameter(
                            name=ASPHelper._get_mask_name(param.name),
                            shape=param.shape,
                            dtype=param.dtype,
                            default_initializer=ConstantInitializer(value=1.0))
                        mask_param.stop_gradient = True
                        mask_param.trainable = False
                        asp_info.update_mask_vars(param.name, mask_param)
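A minimal NumPy sketch, with hypothetical shapes and an arbitrary stand-in pattern, of how such a mask is used downstream: a ones-initialized tensor of the parameter's shape is later overwritten with a binary sparsity pattern and multiplied into the weight, while the mask itself stays frozen (stop_gradient=True, trainable=False).

import numpy as np

param = np.random.randn(8, 8).astype("float32")
mask = np.ones_like(param)    # matches ConstantInitializer(value=1.0)
mask[:, ::2] = 0.0            # stand-in for a real n:m sparse pattern
pruned = param * mask         # masked entries are zeroed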
Code example #9
    def __init__(self, cfg, name=None):
        cfg['return_additional_info'] = True
        cfg['has_pooler'] = False
        super(ErnieModelForGeneration, self).__init__(cfg, name=name)
        initializer = F.initializer.TruncatedNormal(
            scale=cfg['initializer_range'])
        d_model = cfg['hidden_size']
        d_vocab = cfg['vocab_size']

        self.mlm = _build_linear(d_model,
                                 d_model,
                                 append_name(name, 'mask_lm_trans_fc'),
                                 initializer,
                                 act=cfg['hidden_act'])
        self.mlm_ln = _build_ln(d_model,
                                name=append_name(name, 'mask_lm_trans'))
        self.mlm_bias = L.create_parameter(
            dtype='float32',
            shape=[d_vocab],
            attr=F.ParamAttr(name=append_name(name, 'mask_lm_out_fc.b_0'),
                             initializer=F.initializer.Constant(value=0.0)),
            is_bias=True,
        )
Code example #10
File: gnn_block.py  Project: xiaoyao4573/PaddleHelix
def gcn_layer(gw, feature, edge_features, act, name):
    """tbd"""
    def send_func(src_feat, dst_feat, edge_feat):
        """tbd"""
        return src_feat["h"] + edge_feat["h"]

    size = feature.shape[-1]

    msg = gw.send(send_func,
                  nfeat_list=[("h", feature)],
                  efeat_list=[("h", edge_features)])

    output = gw.recv(msg, mean_recv)
    output = layers.fc(output,
                       size=size,
                       bias_attr=False,
                       param_attr=fluid.ParamAttr(name=name))

    bias = layers.create_parameter(shape=[size],
                                   dtype='float32',
                                   is_bias=True,
                                   name=name + '_bias')
    output = layers.elementwise_add(output, bias, act=act)
    return output
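A minimal NumPy sketch of the message passing above on a hypothetical toy graph: each message is src_feat + edge_feat, recv averages the incoming messages per node (mean_recv), and the fc plus bias applied afterwards are omitted here.

import numpy as np

feature = np.random.randn(3, 4).astype("float32")  # 3 nodes, dim 4
edges = [(0, 2), (1, 2)]                           # hypothetical (src, dst) pairs
edge_feat = np.random.randn(len(edges), 4).astype("float32")

agg = np.zeros_like(feature)
cnt = np.zeros(3)
for eid, (src, dst) in enumerate(edges):
    agg[dst] += feature[src] + edge_feat[eid]      # send_func
    cnt[dst] += 1
nonzero = cnt > 0
agg[nonzero] /= cnt[nonzero][:, None]              # mean over incoming edges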
Code example #11
def network(batch_size, items_num, hidden_size, step, rate):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = layers.data(
        name="items",
        shape=[batch_size, -1, 1],
        dtype="int64",
        append_batch_size=False)  #[bs, uniq_max, 1]
    seq_index = layers.data(
        name="seq_index",
        shape=[batch_size, -1],
        dtype="int64",
        append_batch_size=False)  #[-1(seq_max)*batch_size, 1]
    last_index = layers.data(
        name="last_index",
        shape=[batch_size],
        dtype="int64",
        append_batch_size=False)  #[batch_size, 1]
    adj_in = layers.data(
        name="adj_in",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    adj_out = layers.data(
        name="adj_out",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    mask = layers.data(
        name="mask",
        shape=[batch_size, -1, 1],
        dtype="float32",
        append_batch_size=False)
    label = layers.data(
        name="label",
        shape=[batch_size, 1],
        dtype="int64",
        append_batch_size=False)

    items_emb = layers.embedding(
        input=items,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]
    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(
            x=pre_state, shape=[batch_size, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(
                x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = pre_state
    seq_index = layers.reshape(seq_index, shape=[-1])
    seq = layers.gather(final_state, seq_index)  #[batch_size*-1(seq_max), h]
    last = layers.gather(final_state, last_index)  #[batch_size, h]

    seq = layers.reshape(
        seq, shape=[batch_size, -1, hidden_size])  #[batch_size, -1(seq_max), h]
    last = layers.reshape(
        last, shape=[batch_size, hidden_size])  #[batch_size, h]

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, -1(seq_max), h]
    last_fc = layers.fc(input=last,
                        name="last_fc",
                        size=hidden_size,
                        bias_attr=False,
                        act=None,
                        num_flatten_dims=1,
                        param_attr=fluid.ParamAttr(
                            initializer=fluid.initializer.Uniform(
                                low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  #[-1(seq_max), batch_size, h]
    add = layers.elementwise_add(seq_fc_t,
                                 last_fc)  #[-1(seq_max), batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[-1(seq_max), batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[-1(seq_max), batch_size, h] 
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  #[batch_size, -1(seq_max), h]

    weight = layers.fc(input=add_sigmoid,
                       name="weight_fc",
                       size=1,
                       act=None,
                       num_flatten_dims=2,
                       bias_attr=False,
                       param_attr=fluid.ParamAttr(
                           initializer=fluid.initializer.Uniform(
                               low=-stdv, high=stdv)))  #[batch_size, -1, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
    global_attention = layers.reduce_sum(weight_mask, dim=1)

    final_attention = layers.concat(
        [global_attention, last], axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="fina_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1, 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")

    all_emb = layers.embedding(
        input=all_vocab,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    #fluid.layers.Print(loss)
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, data_feed, [items_emb, all_emb]
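A minimal NumPy sketch of the soft-attention readout at the end of this network, with the seq_fc / last_fc projections folded away for brevity (an assumption, not the exact graph): each position's gate is sigmoid(seq + last + b), mapped to a scalar score, masked, and used to pool the sequence into global_attention.

import numpy as np

bs, seq_len, h = 2, 5, 8
seq = np.random.randn(bs, seq_len, h)
last = np.random.randn(bs, h)
b = np.zeros(h)
w_att = np.random.randn(h, 1)
mask = np.ones((bs, seq_len, 1))

gate = 1.0 / (1.0 + np.exp(-(seq + last[:, None, :] + b)))
weight = (gate @ w_att) * mask                 # [bs, seq_len, 1]
global_attention = (seq * weight).sum(axis=1)  # [bs, h]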
Code example #12
    def __init__(
        self,
        dlatent_size,  # Disentangled latent (W) dimensionality.
        resolution=1024,  # Output resolution (1024 x 1024 by default).
        fmap_base=8192,  # Overall multiplier for the number of feature maps.
        num_channels=3,  # Number of output color channels.
        structure='fixed',  # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
        fmap_max=512,  # Maximum number of feature maps in any layer.
        fmap_decay=1.0,  # log2 feature map reduction when doubling the resolution.
        f=None,  # (Heavy overhead; if you don't have enough resources, pass f=None.) Low-pass filter to apply when resampling activations. None = no filtering.
        use_pixel_norm=False,  # Enable pixelwise feature vector normalization?
        use_instance_norm=True,  # Enable instance normalization?
        use_wscale=True,  # Enable equalized learning rate?
        use_noise=True,  # Enable noise inputs?
        use_style=True  # Enable style inputs?
    ):  # batch size.
        """
        Synthesis network of the generator (the second part of the generator).
        Parameters:
        dlatent_size: 512, disentangled latent (W) dimensionality.
        resolution: output resolution, 1024 x 1024.
        fmap_base:
        num_channels:
        structure: only the 'fixed' mode is supported.
        fmap_max:
        """
        super(G_synthesis, self).__init__()

        self.nf = lambda stage: min(
            int(fmap_base / (2.0**(stage * fmap_decay))), fmap_max)
        self.structure = structure

        # The "- 2" means we start from a feature map whose height and width equal 4.
        # For resolution = 1024 this gives num_layers = 18.
        self.resolution_log2 = int(np.log2(resolution))
        num_layers = self.resolution_log2 * 2 - 2

        self.num_layers = num_layers

        # Noise inputs.
        self.noise_inputs = []
        for layer_idx in range(num_layers):  # 0 .. num_layers-1
            res = layer_idx // 2 + 2
            shape = [1, 1, 2**res, 2**res]
            self.noise_inputs.append(layers.randn(shape))

        # Blur2d
        self.blur = Blur2d(f)

        # torgb: fixed mode
        # channel 16 -> channel 8
        self.channel_shrinkage = Conv2d(self.nf(self.resolution_log2 - 2),
                                        self.nf(self.resolution_log2),
                                        3,
                                        use_wscale=use_wscale)
        # channel 8 -> channel 3
        self.torgb = Conv2d(self.nf(self.resolution_log2),
                            num_channels,
                            1,
                            gain=1,
                            use_wscale=use_wscale)

        # Initial Input Block
        self.const_input = layers.create_parameter(
            (1, self.nf(1), 4, 4),
            'float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        self.bias = layers.create_parameter(
            (self.nf(1), ),
            'float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        self.adaIn1 = LayerEpilogue(self.nf(1), dlatent_size, use_wscale,
                                    use_noise, use_pixel_norm,
                                    use_instance_norm, use_style)
        self.conv1 = Conv2d(self.nf(1),
                            self.nf(1),
                            3,
                            gain=1,
                            use_wscale=use_wscale)
        self.adaIn2 = LayerEpilogue(self.nf(1), dlatent_size, use_wscale,
                                    use_noise, use_pixel_norm,
                                    use_instance_norm, use_style)

        # Common Block
        # 4 x 4 -> 8 x 8
        res = 3
        self.GBlock1 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 8 x 8 -> 16 x 16
        res = 4
        self.GBlock2 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 16 x 16 -> 32 x 32
        res = 5
        self.GBlock3 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 32 x 32 -> 64 x 64
        res = 6
        self.GBlock4 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 64 x 64 -> 128 x 128
        res = 7
        self.GBlock5 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 128 x 128 -> 256 x 256
        res = 8
        self.GBlock6 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 256 x 256 -> 512 x 512
        res = 9
        self.GBlock7 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)

        # 512 x 512 -> 1024 x 1024
        res = 10
        self.GBlock8 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                              use_instance_norm, self.noise_inputs)
Code example #13
File: bidaf_model.py  Project: wbj0110/models
def point_network_decoder(p_vec, q_vec, hidden_size, args):
    """Output layer - pointer network"""
    tag = 'pn_decoder_'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

    random_attn = layers.create_parameter(
        shape=[1, hidden_size],
        dtype='float32',
        default_initializer=init_random)
    random_attn = layers.fc(
        input=random_attn,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b'))
    random_attn = layers.reshape(random_attn, shape=[-1])
    U = layers.fc(input=q_vec,
                  param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'),
                  bias_attr=False,
                  size=hidden_size,
                  act=None) + random_attn
    U = layers.tanh(U)

    logits = layers.fc(input=U,
                       param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                       bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'),
                       size=1,
                       act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(
        input=pooled_vec,
        param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'),
        size=hidden_size,
        act=None)

    def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args):
        tag = para_name + "custom_dynamic_rnn_"

        def static_rnn(step,
                       p_vec=p_vec,
                       init_state=None,
                       para_name='',
                       args=args):
            tag = para_name + "static_rnn_"
            ctx = layers.fc(
                input=p_vec,
                param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
                bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
                size=hidden_size,
                act=None)

            beta = []
            c_prev = init_state
            m_prev = init_state
            for i in range(step):
                m_prev0 = layers.fc(
                    input=m_prev,
                    size=hidden_size,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
                m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.tanh(Fk)
                logits = layers.fc(
                    input=Fk,
                    size=1,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')

                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=hidden_size,
                    para_name=tag,
                    args=args)
                m_prev = hidden_t
                c_prev = cell_t
                beta.append(scores)
            return beta

        return static_rnn(
            2, p_vec=p_vec, init_state=init_state, para_name=para_name)

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "fw_",
                                    args)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "bw_",
                                    args)

    start_prob = layers.elementwise_add(
        x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2
    end_prob = layers.elementwise_add(
        x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2

    return start_prob, end_prob
Code example #14
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
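A minimal NumPy sketch of one cell step of this static LSTM: a single matmul over [input, pre_hidden] produces all four gates, split in the order i, j, f, o, followed by c = pre_cell * sigmoid(f) + sigmoid(i) * tanh(j) and m = tanh(c) * sigmoid(o).

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

hidden_size = 4
w = np.random.randn(hidden_size * 2, hidden_size * 4).astype("float32")
b = np.zeros(hidden_size * 4, dtype="float32")
x = np.random.randn(1, hidden_size).astype("float32")
h_prev = np.zeros((1, hidden_size), dtype="float32")
c_prev = np.zeros((1, hidden_size), dtype="float32")

gates = np.concatenate([x, h_prev], axis=1) @ w + b
i, j, f, o = np.split(gates, 4, axis=1)
c = c_prev * sigmoid(f) + sigmoid(i) * np.tanh(j)
m = np.tanh(c) * sigmoid(o)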
Code example #15
File: gnn_block.py  Project: xiaoyao4573/PaddleHelix
def gat_layer(gw,
              feature,
              edge_features,
              hidden_size,
              act,
              name,
              num_heads=1,
              feat_drop=0.1,
              attn_drop=0.1,
              is_test=False):
    """tbd"""
    def send_attention(src_feat, dst_feat, edge_feat):
        """tbd"""
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = layers.leaky_relu(output, alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"] + edge_feat["h"]}

    def reduce_attention(msg):
        """tbd"""
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = layers.reshape(h, [-1, num_heads, hidden_size])
        alpha = layers.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = layers.dropout(alpha,
                                   dropout_prob=attn_drop,
                                   is_test=is_test,
                                   dropout_implementation="upscale_in_train")
        h = h * alpha
        h = layers.reshape(h, [-1, num_heads * hidden_size])
        h = layers.lod_reset(h, old_h)
        return layers.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = layers.dropout(feature,
                                 dropout_prob=feat_drop,
                                 is_test=is_test,
                                 dropout_implementation='upscale_in_train')

    ft = layers.fc(feature,
                   hidden_size * num_heads,
                   bias_attr=False,
                   param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = layers.create_parameter(shape=[num_heads, hidden_size],
                                     dtype='float32',
                                     name=name + '_gat_l_A')
    right_a = layers.create_parameter(shape=[num_heads, hidden_size],
                                      dtype='float32',
                                      name=name + '_gat_r_A')
    reshape_ft = layers.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = layers.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = layers.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(send_attention,
                  nfeat_list=[("h", ft), ("left_a", left_a_value),
                              ("right_a", right_a_value)],
                  efeat_list=[("h", edge_features)])
    output = gw.recv(msg, reduce_attention)
    bias = layers.create_parameter(shape=[hidden_size * num_heads],
                                   dtype='float32',
                                   is_bias=True,
                                   name=name + '_bias')
    bias.stop_gradient = True
    output = layers.elementwise_add(output, bias, act=act)
    return output
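A minimal NumPy sketch of the per-edge attention score computed above: each head's score is a dot product of the projected node features with the learned vectors left_a / right_a, the source and destination halves are added per edge, and leaky_relu is applied (the per-node softmax and dropout are omitted).

import numpy as np

num_heads, hidden_size = 2, 4
ft = np.random.randn(3, num_heads * hidden_size)  # 3 nodes after the fc
left_a = np.random.randn(num_heads, hidden_size)
right_a = np.random.randn(num_heads, hidden_size)

rft = ft.reshape(-1, num_heads, hidden_size)
left_val = (rft * left_a).sum(-1)                 # [num_nodes, num_heads]
right_val = (rft * right_a).sum(-1)

src, dst = 0, 1                                   # one hypothetical edge
score = left_val[src] + right_val[dst]
alpha = np.where(score > 0, score, 0.2 * score)   # leaky_relu, alpha=0.2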
Code example #16
File: graphsum_model.py  Project: zzg-971030/Research
    def decode(self,
               dec_input,
               enc_words_output,
               enc_sents_output,
               caches=None,
               gather_idx=None):
        """Decoding to generate output text"""

        trg_word, trg_pos, trg_slf_attn_bias, trg_src_words_attn_bias, \
        trg_src_sents_attn_bias, graph_attn_bias = dec_input

        dec_res = self._gen_dec_input(trg_word, trg_pos, trg_slf_attn_bias,
                                      trg_src_words_attn_bias,
                                      trg_src_sents_attn_bias, graph_attn_bias)

        emb_out, trg_slf_attn_bias, trg_src_words_attn_bias, trg_src_sents_attn_bias, graph_attn_bias = \
            dec_res.emb_out, dec_res.trg_slf_attn_bias, dec_res.trg_src_words_attn_bias, \
            dec_res.trg_src_sents_attn_bias, dec_res.graph_attn_bias

        # (batch_size, tgt_len, emb_dim)
        dec_output = graph_decoder(
            dec_input=emb_out,  # (batch_size, tgt_len, emb_dim)
            enc_words_output=
            enc_words_output,  # (batch_size, n_blocks, n_tokens, emb_dim)
            enc_sents_output=enc_sents_output,  # (batch_size, n_blocks, emb_dim)
            dec_slf_attn_bias=
            trg_slf_attn_bias,  # (batch_size, n_head, tgt_len, tgt_len)
            dec_enc_words_attn_bias=
            trg_src_words_attn_bias,  # (batch_size, n_blocks, n_head, tgt_len, n_tokens)
            dec_enc_sents_attn_bias=
            trg_src_sents_attn_bias,  # (batch_size, n_head, tgt_len, n_blocks)
            graph_attn_bias=
            graph_attn_bias,  # (batch_size, n_head, n_blocks, n_blocks)
            pos_win=self.pos_win,
            n_layer=self._dec_n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=self._prepostprocess_dropout,
            hidden_act=self._hidden_act,
            preprocess_cmd=self._preprocess_command,
            postprocess_cmd=self._postprocess_command,
            param_initializer=self._param_initializer,
            caches=caches,
            gather_idx=gather_idx,
            name='graph_decoder')

        # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
        # (batch_size*tgt_len, emb_dim)
        dec_output = layers.reshape(dec_output,
                                    shape=[-1, self._emb_size],
                                    inplace=True)

        if self._dtype == "float16":
            dec_output = fluid.layers.cast(x=dec_output, dtype=self._emb_dtype)

        if self._weight_sharing:
            out = layers.matmul(
                x=dec_output,
                y=fluid.default_main_program().global_block().var(
                    self._word_emb_name),
                transpose_y=True)
            bias = layers.create_parameter(
                shape=[self.voc_size],
                dtype=self._emb_dtype,
                attr=fluid.ParamAttr(
                    name='generator.bias',
                    initializer=fluid.initializer.Constant(value=0.0)),
                is_bias=True)
            predict = layers.elementwise_add(x=out, y=bias, axis=-1)
        else:
            predict = layers.fc(
                input=dec_output,
                size=self.voc_size,
                param_attr=fluid.ParamAttr(
                    name="generator.w",
                    initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
                bias_attr=fluid.ParamAttr(
                    name='generator.bias',
                    initializer=fluid.initializer.Constant(value=0.0)))

        return predict
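A minimal NumPy sketch of the weight-sharing branch above: the output projection reuses the word-embedding matrix transposed, plus a separate bias, so the logits share parameters with the input embedding.

import numpy as np

voc_size, emb_dim = 100, 16
word_emb = np.random.randn(voc_size, emb_dim).astype("float32")
dec_output = np.random.randn(8, emb_dim).astype("float32")  # [batch*tgt_len, emb]
bias = np.zeros(voc_size, dtype="float32")

predict = dec_output @ word_emb.T + bias                    # [batch*tgt_len, voc]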
Code example #17
File: lm_model.py  Project: baojun-nervana/benchmark
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_py_reader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        #real_res = layers.concat(res, 0)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        '''
        else:
            real_res = rnnout[-1]
            for i in range( num_layers ):

            m1, c1, m2, c2 = rnnout
            real_res = m2
            m1.stop_gradient = True
            c1.stop_gradient = True
            c2.stop_gradient = True
        '''

        #layers.Print( first_hidden, message="22", summarize=10)
        #layers.Print( rnnout[1], message="11", summarize=10)
        #real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)
        '''
        last_hidden = layers.concat( hidden_array, 1 )
        last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size])
        last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2])
        last_cell = layers.concat( cell_array, 1)
        last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size])
        last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2])
        '''

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'])
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size // fluid.core.get_cuda_device_count()
    if use_py_reader:
        feed_shapes = [[batch_size_each, num_steps, 1],
                       [batch_size_each * num_steps, 1]]
        py_reader = fluid.layers.py_reader(capacity=16,
                                           shapes=feed_shapes,
                                           dtypes=['int64', 'int64'])
        x, y = fluid.layers.read_file(py_reader)
    else:
        x = layers.data(name="x",
                        shape=[batch_size_each, num_steps, 1],
                        dtype='int64',
                        append_batch_size=False)
        y = layers.data(name="y",
                        shape=[batch_size_each * num_steps, 1],
                        dtype='int64',
                        append_batch_size=False)

    init_hidden = layers.data(name="init_hidden",
                              shape=[num_layers, batch_size_each, hidden_size],
                              dtype='float32',
                              append_batch_size=False)
    init_cell = layers.data(name="init_cell",
                            shape=[num_layers, batch_size_each, hidden_size],
                            dtype='float32',
                            append_batch_size=False)

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden = layers.reshape(init_hidden,
                                 shape=[num_layers, -1, hidden_size])
    init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
                                                             high=init_scale)))

    x_emb = layers.reshape(x_emb,
                           shape=[-1, num_steps, hidden_size],
                           inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(x_emb,
                               dropout_prob=dropout,
                               dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(x_emb,
                                                      len=num_steps,
                                                      init_hidden=init_hidden,
                                                      init_cell=init_cell)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden, init_cell,  num_steps, hidden_size, num_layers, \
                is_bidirec=False, \
                default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) )
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(rnn_out,
                             shape=[-1, num_steps, hidden_size],
                             inplace=True)

    softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \
            default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \
            default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection,
                                shape=[-1, vocab_size],
                                inplace=True)

    loss = layers.softmax_with_cross_entropy(logits=projection,
                                             label=y,
                                             soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_py_reader:
        return loss, last_hidden, last_cell, feeding_list, py_reader
    else:
        return loss, last_hidden, last_cell, feeding_list
Code example #18
    def __init__(self,
                 batch_size,
                 channels=1,
                 bottleneck=32,
                 params=[0, 1, 1, 1, 1],
                 n_iter=20,
                 last=False,
                 bn=True):
        super(FlowLayer, self).__init__()
        self.batch_size = batch_size
        self.bottleneck = Conv2D(channels,
                                 bottleneck,
                                 stride=1,
                                 padding=0,
                                 filter_size=1,
                                 bias_attr=fluid.ParamAttr(trainable=False))

        self.unbottleneck = Conv2D(bottleneck * 2,
                                   channels,
                                   stride=1,
                                   padding=(1, 1),
                                   filter_size=(3, 3),
                                   bias_attr=fluid.ParamAttr(trainable=False))
        self.bn = BatchNorm(channels) if bn else None
        channels = bottleneck

        self.conv4Ix = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=3,
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-0.5, 0, 0.5]]] * channels] * channels)),
                trainable=params[0] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.conv4Iy = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=3,
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-0.5], [0], [0.5]]] * channels] * channels)),
                trainable=params[0] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.conv4px = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=(1, 2),
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-1, 1]]] * channels] * channels)),
                trainable=params[1] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.conv4py = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=(2, 1),
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-1], [1]]] * channels] * channels)),
                trainable=params[1] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.conv4u = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=(1, 2),
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-1, 1]]] * channels] * channels)),
                trainable=params[1] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.conv4v = Conv2D(
            channels,
            channels,
            padding=0,
            stride=1,
            filter_size=(2, 1),
            param_attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([[[[-1], [1]]] * channels] * channels)),
                trainable=params[1] == 1),
            bias_attr=fluid.ParamAttr(trainable=False),
            groups=1)

        self.n_iter = n_iter
        self.channels = channels

        self.theta = layers.create_parameter(
            shape=[1],
            dtype='float32',
            attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([0.3])),
                trainable=params[2] == 1))
        self.lamda = layers.create_parameter(
            shape=[1],
            dtype='float32',
            attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([0.15])),
                trainable=params[3] == 1))
        self.tau = layers.create_parameter(
            shape=[1],
            dtype='float32',
            attr=fluid.ParamAttr(
                learning_rate=0.01,
                initializer=fluid.initializer.NumpyArrayInitializer(
                    np.array([0.25])),
                trainable=params[4] == 1))
Code example #19
File: network.py  Project: liangzuan1983/kddcupdebias
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(name="items", shape=[bs, -1],
                       dtype="int64")  #[batch_size, uniq_max]
    seq_index = fluid.data(name="seq_index", shape=[bs, -1, 2],
                           dtype="int32")  #[batch_size, seq_max, 2]
    last_index = fluid.data(name="last_index", shape=[bs, 2],
                            dtype="int32")  #[batch_size, 2]
    adj_in = fluid.data(name="adj_in", shape=[bs, -1, -1],
                        dtype="float32")  #[batch_size, seq_max, seq_max]
    adj_out = fluid.data(name="adj_out", shape=[bs, -1, -1],
                         dtype="float32")  #[batch_size, seq_max, seq_max]
    mask = fluid.data(name="mask", shape=[bs, -1, 1],
                      dtype="float32")  #[batch_size, seq_max, 1]
    label = fluid.data(name="label", shape=[bs, 1],
                       dtype="int64")  #[batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(capacity=256,
                                                   feed_list=datas,
                                                   iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(name="emb",
                                   initializer=fluid.initializer.Uniform(
                                       low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]

    pre_state = items_emb
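    # `step` rounds of gated message passing over the session graph: the in/out
    # adjacency matrices aggregate neighbor states, and a GRU cell updates each
    # node's hidden state.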
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(input=gru_fc,
                                                hidden=layers.reshape(
                                                    x=pre_state,
                                                    shape=[-1, hidden_size]),
                                                size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

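    # Soft attention over the session: score every position against the last
    # clicked item, mask padding, and sum the weighted sequence into a single
    # global session vector.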
    seq_fc_t = layers.transpose(seq_fc, perm=[1, 0,
                                              2])  #[seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  #[seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[seq_max, batch_size, h]
    add_sigmoid = layers.transpose(add_sigmoid,
                                   perm=[1, 0, 2])  #[batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight,
                                         axis=0)  #[batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  #[batch_size, h]

    final_attention = layers.concat([global_attention, last],
                                    axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(shape=[items_num - 1],
                                         value=0,
                                         dtype="int64",
                                         persistable=True,
                                         name="all_vocab")

    all_emb = fluid.embedding(input=all_vocab,
                              param_attr=fluid.ParamAttr(
                                  name="emb",
                                  initializer=fluid.initializer.Uniform(
                                      low=-stdv, high=stdv)),
                              size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(x=final_attention_fc, y=all_emb,
                           transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(logits=logits,
                                                label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=50)
    return loss, acc, py_reader, feed_datas, logits
Code example #20
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell
Code example #21
File: lm_model.py  Project: wbj0110/models
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(input=x,
                             size=[vocab_size, emb_size],
                             dtype='float32',
                             is_sparse=False,
                             param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
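    # Stacked LSTMP layers; from the second layer on, a residual connection adds
    # the layer input back to its output, with dropout applied between layers.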
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
Code example #22
def gat(gw,
        feature,
        hidden_size,
        activation,
        name,
        num_heads=8,
        feat_drop=0.6,
        attn_drop=0.6,
        is_test=False):
    """Implementation of graph attention networks (GAT)

    This is an implementation of the paper GRAPH ATTENTION NETWORKS
    (https://arxiv.org/abs/1710.10903).

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        hidden_size: The hidden size for gat.

        activation: The activation for the output.

        name: Gat layer names.

        num_heads: The head number in gat.

        feat_drop: Dropout rate for feature.

        attn_drop: Dropout rate for attention.

        is_test: Whether in the test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads)
    """
    def send_attention(src_feat, dst_feat, edge_feat):
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = L.leaky_relu(output, alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"]}

    def reduce_attention(msg):
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = L.reshape(h, [-1, num_heads, hidden_size])
        alpha = L.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = L.dropout(alpha,
                              dropout_prob=attn_drop,
                              is_test=is_test,
                              dropout_implementation="upscale_in_train")
        h = h * alpha
        h = L.reshape(h, [-1, num_heads * hidden_size])
        h = L.lod_reset(h, old_h)
        return L.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = L.dropout(feature,
                            dropout_prob=feat_drop,
                            is_test=is_test,
                            dropout_implementation='upscale_in_train')

    ft = L.fc(feature,
              hidden_size * num_heads,
              bias_attr=False,
              param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = L.create_parameter(shape=[num_heads, hidden_size],
                                dtype='float32',
                                name=name + '_gat_l_A')
    right_a = L.create_parameter(shape=[num_heads, hidden_size],
                                 dtype='float32',
                                 name=name + '_gat_r_A')
    reshape_ft = L.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = L.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = L.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(send_attention,
                  nfeat_list=[("h", ft), ("left_a", left_a_value),
                              ("right_a", right_a_value)])
    output = gw.recv(msg, reduce_attention)
    bias = L.create_parameter(shape=[hidden_size * num_heads],
                              dtype='float32',
                              is_bias=True,
                              name=name + '_bias')
    bias.stop_gradient = True
    output = L.elementwise_add(output, bias, act=activation)
    return output
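
A minimal usage sketch for the gat() helper above (hedged: `gw` is assumed to be a
PGL graph wrapper and "feat" an illustrative node-feature key; neither is taken
from the original source):

# Hypothetical usage; `gw`, the "feat" key and all sizes are assumptions.
node_repr = gat(gw,
                gw.node_feat["feat"],   # (num_nodes, feature_size)
                hidden_size=8,
                activation="relu",
                name="gat_0",
                num_heads=8)            # -> (num_nodes, 8 * 8)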
Code example #23
def gin(gw,
        feature,
        hidden_size,
        activation,
        name,
        init_eps=0.0,
        train_eps=False):
    """Implementation of Graph Isomorphism Network (GIN) layer.

    This is an implementation of the paper How Powerful are Graph Neural Networks?
    (https://arxiv.org/pdf/1810.00826.pdf).

    In their implementation, all MLPs have 2 layers. Batch normalization is applied
    on every hidden layer.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        name: GIN layer names.

        hidden_size: The hidden size for gin.

        activation: The activation for the output.

        init_eps: float, optional
            Initial :math:`\epsilon` value, default is 0.

        train_eps: bool, optional
            if True, :math:`\epsilon` will be a learnable parameter.

    Return:
        A tensor with shape (num_nodes, hidden_size).
    """
    def send_src_copy(src_feat, dst_feat, edge_feat):
        return src_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=fluid.ParamAttr(name="%s_eps" % name),
        default_initializer=fluid.initializer.ConstantInitializer(
            value=init_eps))

    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)

    output = L.fc(output,
                  size=hidden_size,
                  act=None,
                  param_attr=fluid.ParamAttr(name="%s_w_0" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_0" % name))

    output = L.layer_norm(
        output,
        begin_norm_axis=1,
        param_attr=fluid.ParamAttr(
            name="norm_scale_%s" % (name),
            initializer=fluid.initializer.Constant(1.0)),
        bias_attr=fluid.ParamAttr(name="norm_bias_%s" % (name),
                                  initializer=fluid.initializer.Constant(0.0)),
    )

    if activation is not None:
        output = getattr(L, activation)(output)

    output = L.fc(output,
                  size=hidden_size,
                  act=activation,
                  param_attr=fluid.ParamAttr(name="%s_w_1" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_1" % name))

    return output
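
A short, hedged sketch of stacking two gin() layers (the `gw`/`feature` inputs and
all names/sizes are illustrative assumptions, not from the original project):

# Hypothetical two-layer GIN stack built with the helper above.
h = gin(gw, feature, hidden_size=64, activation="relu",
        name="gin_0", init_eps=0.0, train_eps=False)
h = gin(gw, h, hidden_size=64, activation="relu",
        name="gin_1", init_eps=0.0, train_eps=True)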
Code example #24
File: networks.py  Project: wyhlaowang/UGATIT-Paddle
    def __init__(self, num_features, eps=1e-5):
        super(ILN, self).__init__()
        self.eps = eps
        self.rho = layers.create_parameter(
            shape=[1, num_features, 1, 1], dtype='float32',
            default_initializer=fluid.initializer.Constant(0.0))
        self.gamma = layers.create_parameter(
            shape=[1, num_features, 1, 1], dtype='float32',
            default_initializer=fluid.initializer.Constant(1.0))
        self.beta = layers.create_parameter(
            shape=[1, num_features, 1, 1], dtype='float32',
            default_initializer=fluid.initializer.Constant(0.0))
Code example #25
    def forward(self):
        """Build the GATNE net.
        """
        param_attr_init = fluid.initializer.Uniform(
            low=-1.0, high=1.0, seed=np.random.randint(100))
        embed_param_attrs = fluid.ParamAttr(name='Base_node_embed',
                                            initializer=param_attr_init)

        # node_embeddings
        base_node_embed = fl.embedding(
            input=fl.reshape(self.train_inputs, shape=[-1, 1]),
            size=[self.num_nodes, self.embedding_size],
            param_attr=embed_param_attrs)

        node_features = []
        for edge_type in self.edge_types:
            param_attr_init = fluid.initializer.Uniform(
                low=-1.0, high=1.0, seed=np.random.randint(100))
            embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' %
                                                edge_type,
                                                initializer=param_attr_init)

            features = fl.embedding(
                input=self.gw[edge_type].node_feat['index'],
                size=[self.num_nodes, self.embedding_u_size],
                param_attr=embed_param_attrs)

            node_features.append(features)

        # mp_output: list of embedding(self.num_nodes, dim)
        mp_output = self.message_passing(self.gw, self.edge_types,
                                         node_features)

        # U : (num_type[m], num_nodes, dim[s])
        node_type_embed = fl.stack(mp_output, axis=0)

        # U : (num_nodes, num_type[m], dim[s])
        node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2])

        #gather node_type_embed from train_inputs
        node_type_embed = fl.gather(node_type_embed, self.train_inputs)

        # M_r
        trans_weights = fl.create_parameter(
            shape=[
                self.edge_type_count, self.embedding_u_size,
                self.embedding_size // self.att_head
            ],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w')

        # W_r
        trans_weights_s1 = fl.create_parameter(
            shape=[self.edge_type_count, self.embedding_u_size, self.dim_a],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s1')

        # w_r
        trans_weights_s2 = fl.create_parameter(
            shape=[self.edge_type_count, self.dim_a, self.att_head],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s2')

        trans_w = fl.gather(trans_weights, self.train_types)
        trans_w_s1 = fl.gather(trans_weights_s1, self.train_types)
        trans_w_s2 = fl.gather(trans_weights_s2, self.train_types)

        attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2)
        node_type_embed = fl.matmul(attention, node_type_embed)
        node_embed = base_node_embed + fl.reshape(
            fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size])

        self.last_node_embed = fl.l2_normalize(node_embed, axis=1)

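        # Negative-sampling objective: score the positive and sampled negative
        # nodes against the final node embedding with shared 'nce_weight'
        # parameters, then apply logistic losses.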
        nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer(
            loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))
        nce_weight_attrs = fluid.ParamAttr(name='nce_weight',
                                           initializer=nce_weight_initializer)

        weight_pos = fl.embedding(input=self.train_labels,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        weight_neg = fl.embedding(input=self.train_negs,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1])
        pos_logits = fl.matmul(tmp_node_embed, weight_pos,
                               transpose_y=True)  # [B, 1, 1]

        neg_logits = fl.matmul(tmp_node_embed, weight_neg,
                               transpose_y=True)  # [B, 1, neg_num]

        pos_score = fl.squeeze(pos_logits, axes=[1])
        pos_score = fl.clip(pos_score, min=-10, max=10)
        pos_score = -1.0 * fl.logsigmoid(pos_score)

        neg_score = fl.squeeze(neg_logits, axes=[1])
        neg_score = fl.clip(neg_score, min=-10, max=10)
        neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)

        neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)
        self.loss = fl.reduce_mean(pos_score + neg_score)
Code example #26
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_py_reader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
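        # LSTM variant 1: unrolled in time-major order with the PaddingRNN
        # control-flow helper; per-layer gate weights are created explicitly below.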
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):
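        # LSTM variant 2: statically unrolled over `len` time steps; gates are
        # computed with plain matmul/split ops, optionally using fused
        # elementwise activations when available.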

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
                    # operation, like:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # For now the set of unary operations supported by this fused
                    # op is limited; we will extend it to support more unary operations
                    # and do this kind of fusion automatically in future versions of paddle.fluid.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
    if use_py_reader:
        feed_shapes = [[batch_size_each, num_steps, 1],
                       [batch_size_each * num_steps, 1]]
        py_reader = fluid.layers.py_reader(
            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
        x, y = fluid.layers.read_file(py_reader)
    else:
        x = layers.data(
            name="x",
            shape=[batch_size_each, num_steps, 1],
            dtype='int64',
            append_batch_size=False)
        y = layers.data(
            name="y",
            shape=[batch_size_each * num_steps, 1],
            dtype='int64',
            append_batch_size=False)

    init_hidden = layers.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)
    init_cell = layers.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = layers.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size])
    init_cell_reshape = layers.reshape(
        init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale)))

    x_emb = layers.reshape(
        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
    if dropout != None and dropout > 0.0:
        x_emb = layers.dropout(
            x_emb,
            dropout_prob=dropout,
            dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden_reshape,
            init_cell_reshape,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    elif rnn_model == "basic_lstm":
        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
                forget_bias = 0.0)
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(
        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(
        projection, shape=[-1, vocab_size], inplace=True)

    loss = layers.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # Feed last_hidden and last_cell back into init_hidden and init_cell so they
    # can be used directly in the next batch. This avoids fetching last_hidden and
    # last_cell and re-feeding init_hidden and init_cell at every training step.
    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_py_reader:
        return loss, last_hidden, last_cell, feeding_list, py_reader
    else:
        return loss, last_hidden, last_cell, feeding_list
Code example #27
    def _build_decoder(self,
                       enc_last_hidden,
                       enc_last_cell,
                       mode='train',
                       beam_size=10):
        softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \
                    default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale))
        if mode == 'train':
            dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \
                    self.hidden_size, num_layers=self.num_layers, \
                    batch_first=self.batch_first, \
                    dropout_prob=self.dropout, \
                    param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \
                    bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ))

            dec_output = layers.matmul(dec_output, softmax_weight)

            return dec_output
        elif mode == 'beam_search' or mode == 'greedy_search':
            dec_unit_list = []
            name = 'basic_lstm'
            for i in range(self.num_layers):
                new_name = name + "_layers_" + str(i)
                dec_unit_list.append(
                    BasicLSTMUnit(new_name, self.hidden_size, dtype='float32'))

            def decoder_step(current_in, pre_hidden_array, pre_cell_array):
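                # One decoding step through the stacked LSTM cells; returns the
                # top-layer output plus the updated per-layer hidden/cell states.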
                new_hidden_array = []
                new_cell_array = []

                step_in = current_in
                for i in range(self.num_layers):
                    pre_hidden = pre_hidden_array[i]
                    pre_cell = pre_cell_array[i]

                    new_hidden, new_cell = dec_unit_list[i](step_in,
                                                            pre_hidden,
                                                            pre_cell)

                    new_hidden_array.append(new_hidden)
                    new_cell_array.append(new_cell)

                    step_in = new_hidden

                return step_in, new_hidden_array, new_cell_array

            if mode == 'beam_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([beam_size, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(
                        layers.expand(enc_last_hidden[i], [beam_size, 1]))
                    pre_cell_array.append(
                        layers.expand(enc_last_cell[i], [beam_size, 1]))

                eos_ids = layers.fill_constant([beam_size],
                                               dtype='int64',
                                               value=2)
                init_score = np.zeros((beam_size)).astype('float32')
                init_score[1:] = -INF
                pre_score = layers.assign(init_score)
                #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
                tokens = layers.fill_constant([beam_size, 1],
                                              dtype='int64',
                                              value=1)

                enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])

                pre_tokens = layers.fill_constant([beam_size, 1],
                                                  dtype='int64',
                                                  value=1)

                finished_seq = layers.fill_constant([beam_size, 1],
                                                    dtype='int64',
                                                    value=0)
                finished_scores = layers.fill_constant([beam_size],
                                                       dtype='float32',
                                                       value=-INF)
                finished_flag = layers.fill_constant([beam_size],
                                                     dtype='float32',
                                                     value=0.0)

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True

                parent_idx = layers.fill_constant([1], dtype='int32', value=0)
                while_op = layers.While(cond)

                def compute_topk_scores_and_seq(sequences,
                                                scores,
                                                scores_to_gather,
                                                flags,
                                                beam_size,
                                                select_beam=None,
                                                generate_id=None):
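                    # Flatten the candidate scores, take the global top-`beam_size`,
                    # and gather the matching sequences, scores, finished flags,
                    # source-beam indices and generated token ids.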
                    scores = layers.reshape(scores, shape=[1, -1])
                    _, topk_indexs = layers.topk(scores, k=beam_size)

                    topk_indexs = layers.reshape(topk_indexs, shape=[-1])

                    # gather result

                    top_seq = layers.gather(sequences, topk_indexs)
                    topk_flags = layers.gather(flags, topk_indexs)
                    topk_gather_scores = layers.gather(scores_to_gather,
                                                       topk_indexs)

                    if select_beam:
                        topk_beam = layers.gather(select_beam, topk_indexs)
                    else:
                        topk_beam = select_beam

                    if generate_id:
                        topk_id = layers.gather(generate_id, topk_indexs)
                    else:
                        topk_id = generate_id
                    return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

                def grow_alive(curr_seq, curr_scores, curr_log_probs,
                               curr_finished, select_beam, generate_id):
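                    # Keep the `beam_size` best still-unfinished candidates;
                    # finished ones are pushed to -INF so they drop out of the
                    # alive set.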
                    curr_scores += curr_finished * -INF
                    return compute_topk_scores_and_seq(curr_seq,
                                                       curr_scores,
                                                       curr_log_probs,
                                                       curr_finished,
                                                       beam_size,
                                                       select_beam,
                                                       generate_id=generate_id)

                def grow_finished(finished_seq, finished_scores, finished_flag,
                                  curr_seq, curr_scores, curr_finished):
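                    # Merge candidates that just finished into the pool of already
                    # finished hypotheses (padding the stored sequences by one
                    # token) and keep the `beam_size` best of the combined set.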
                    finished_seq = layers.concat([
                        finished_seq,
                        layers.fill_constant(
                            [beam_size, 1], dtype='int64', value=1)
                    ],
                                                 axis=1)
                    curr_scores += (1.0 - curr_finished) * -INF
                    #layers.Print( curr_scores, message="curr scores")
                    curr_finished_seq = layers.concat([finished_seq, curr_seq],
                                                      axis=0)
                    curr_finished_scores = layers.concat(
                        [finished_scores, curr_scores], axis=0)
                    curr_finished_flags = layers.concat(
                        [finished_flag, curr_finished], axis=0)

                    return compute_topk_scores_and_seq(curr_finished_seq,
                                                       curr_finished_scores,
                                                       curr_finished_scores,
                                                       curr_finished_flags,
                                                       beam_size)

                def is_finished(alive_log_prob, finished_scores,
                                finished_in_finished):
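                    # Continue-search condition: the loop keeps running while fewer
                    # than `beam_size` hypotheses have finished; the length-penalty
                    # bound computed here (`bound_is_met`) is not used in the
                    # returned condition.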

                    max_out_len = 200
                    max_length_penalty = layers.pow(
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=((5.0 + max_out_len) /
                                                    6.0)), alpha)

                    lower_bound_alive_score = layers.slice(
                        alive_log_prob, starts=[0], ends=[1],
                        axes=[0]) / max_length_penalty

                    lowest_score_of_finished_in_finished = finished_scores * finished_in_finished
                    lowest_score_of_finished_in_finished += (
                        1.0 - finished_in_finished) * -INF
                    lowest_score_of_finished_in_finished = layers.reduce_min(
                        lowest_score_of_finished_in_finished)

                    met = layers.less_than(
                        lower_bound_alive_score,
                        lowest_score_of_finished_in_finished)
                    met = layers.cast(met, 'float32')
                    bound_is_met = layers.reduce_sum(met)

                    finished_eos_num = layers.reduce_sum(finished_in_finished)

                    finish_cond = layers.less_than(
                        finished_eos_num,
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=beam_size))

                    return finish_cond

                def grow_top_k(step_idx, alive_seq, alive_log_prob,
                               parent_idx):
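                    # Expand every alive hypothesis by one token: embed the previous
                    # ids, run one decoder step, apply a length penalty to the
                    # accumulated log-probabilities, and take the top `beam_size`
                    # (beam, token) pairs.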
                    pre_ids = alive_seq

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    current_log = layers.elementwise_add(x=layers.log(logits),
                                                         y=alive_log_prob,
                                                         axis=0)
                    base_1 = layers.cast(step_idx, 'float32') + 6.0
                    base_1 /= 6.0
                    length_penalty = layers.pow(base_1, alpha)

                    len_pen = layers.pow(
                        ((5. + layers.cast(step_idx + 1, 'float32')) / 6.),
                        alpha)

                    current_log = layers.reshape(current_log, shape=[1, -1])

                    current_log = current_log / length_penalty
                    topk_scores, topk_indices = layers.topk(input=current_log,
                                                            k=beam_size)

                    topk_scores = layers.reshape(topk_scores, shape=[-1])

                    topk_log_probs = topk_scores * length_penalty

                    generate_id = layers.reshape(
                        topk_indices, shape=[-1]) % self.tar_vocab_size

                    selected_beam = layers.reshape(
                        topk_indices, shape=[-1]) // self.tar_vocab_size

                    topk_finished = layers.equal(generate_id, eos_ids)

                    topk_finished = layers.cast(topk_finished, 'float32')

                    generate_id = layers.reshape(generate_id, shape=[-1, 1])

                    pre_tokens_list = layers.gather(tokens, selected_beam)

                    full_tokens_list = layers.concat(
                        [pre_tokens_list, generate_id], axis=1)


                    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                            dec_att_out, new_hidden_array, new_cell_array

                with while_op.block():
                    topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \
                        grow_top_k(  step_idx, pre_tokens, pre_score, parent_idx)
                    alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                        topk_seq, topk_scores, topk_log_probs, topk_finished,
                        topk_beam, topk_generate_id)

                    finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                        finished_seq, finished_scores, finished_flag, topk_seq,
                        topk_scores, topk_finished)

                    finished_cond = is_finished(alive_log_prob,
                                                finished_scores_2,
                                                finished_flags_2)

                    layers.increment(x=step_idx, value=1.0, in_place=True)

                    layers.assign(alive_beam, parent_idx)
                    layers.assign(alive_id, pre_tokens)
                    layers.assign(alive_log_prob, pre_score)
                    layers.assign(alive_seq, tokens)
                    layers.assign(finished_seq_2, finished_seq)
                    layers.assign(finished_scores_2, finished_scores)
                    layers.assign(finished_flags_2, finished_flag)

                    # update init_hidden, init_cell, input_feed
                    new_feed = layers.gather(attention_out, parent_idx)
                    layers.assign(new_feed, pre_feed)
                    for i in range(self.num_layers):
                        new_hidden_var = layers.gather(new_hidden_array[i],
                                                       parent_idx)
                        layers.assign(new_hidden_var, pre_hidden_array[i])
                        new_cell_var = layers.gather(new_cell_array[i],
                                                     parent_idx)
                        layers.assign(new_cell_var, pre_cell_array[i])

                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond,
                                       y=finished_cond,
                                       out=cond)

                tokens_with_eos = tokens

                all_seq = layers.concat([tokens_with_eos, finished_seq],
                                        axis=0)
                all_score = layers.concat([pre_score, finished_scores], axis=0)
                _, topk_index = layers.topk(all_score, k=beam_size)
                topk_index = layers.reshape(topk_index, shape=[-1])
                final_seq = layers.gather(all_seq, topk_index)
                final_score = layers.gather(all_score, topk_index)

                return final_seq
            elif mode == 'greedy_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([1, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(enc_last_hidden[i])
                    pre_cell_array.append(enc_last_cell[i])
                    #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0)  )
                    #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True
                while_op = layers.While(cond)

                with while_op.block():

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    logits = layers.log(logits)

                    current_log = layers.elementwise_add(logits, score, axis=0)

                    topk_score, topk_indices = layers.topk(input=current_log,
                                                           k=1)

                    new_ids = layers.concat([full_ids, topk_indices])
                    layers.assign(new_ids, full_ids)
                    #layers.Print( full_ids, message="ful ids")
                    layers.assign(topk_score, score)
                    layers.assign(topk_indices, pre_ids)
                    layers.assign(dec_att_out, pre_feed)
                    for i in range(self.num_layers):
                        layers.assign(new_hidden_array[i], pre_hidden_array[i])
                        layers.assign(new_cell_array[i], pre_cell_array[i])

                    layers.increment(x=step_idx, value=1.0, in_place=True)

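                    # continue while the newly generated token is not EOS
                    # and the maximum length has not been reached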
                    eos_met = layers.not_equal(topk_indices, eos_ids)
                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond, y=eos_met, out=cond)

                return full_ids

            raise Exception("error")
        else:
            print("mode not supprt", mode)
Code example #28
    # NOTE: the parameter `len` (the number of unrolled steps) shadows the
    # Python builtin of the same name within this function.
    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
                    # operation, like:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # Now the set of unary operations supported in this fused op
                    # is limited, and we will extend it to support more unary
                    # operations and do this kind of fusion automatically in a
                    # future version of paddle.fluid.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
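
The per-step gate math above follows the standard LSTM cell: with the gate pre-activations split into i, j, f, o, the update is c' = sigmoid(f) * c + sigmoid(i) * tanh(j) and h' = tanh(c') * sigmoid(o). A plain NumPy sketch of one layer's step, assuming the same [2*hidden, 4*hidden] weight and [4*hidden] bias layout as weight_1/bias_1 above:

import numpy as np

def lstm_step(x, h, c, W, b):
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    gates = np.concatenate([x, h], axis=1) @ W + b   # [batch, 4*hidden]
    i, j, f, o = np.split(gates, 4, axis=1)
    c_new = sigmoid(f) * c + sigmoid(i) * np.tanh(j)
    h_new = np.tanh(c_new) * sigmoid(o)
    return h_new, c_new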
Code example #29
    def temporal_conv_layer(self, x, Kt, c_in, c_out, name, act_func='relu'):
        """Temporal convolution layer"""
        _, T, n, _ = x.shape
        if c_in > c_out:
            x_input = fl.conv2d(input=x,
                                num_filters=c_out,
                                filter_size=[1, 1],
                                stride=[1, 1],
                                padding="SAME",
                                data_format="NHWC",
                                param_attr=fluid.ParamAttr(name="%s_conv2d_1" %
                                                           name))
        elif c_in < c_out:
            # if the size of input channel is less than the output,
            # padding x to the same size of output channel.
            pad = fl.fill_constant_batch_size_like(
                input=x,
                shape=[-1, T, n, c_out - c_in],
                dtype="float32",
                value=0.0)
            x_input = fl.concat([x, pad], axis=3)
        else:
            x_input = x

        #  x_input = x_input[:, Kt - 1:T, :, :]
        if act_func == 'GLU':
            # gated linear unit
            bt_init = fluid.initializer.ConstantInitializer(value=0.0)
            bt = fl.create_parameter(
                shape=[2 * c_out],
                dtype="float32",
                attr=fluid.ParamAttr(name="%s_bt" % name,
                                     trainable=True,
                                     initializer=bt_init),
            )
            x_conv = fl.conv2d(input=x,
                               num_filters=2 * c_out,
                               filter_size=[Kt, 1],
                               stride=[1, 1],
                               padding="SAME",
                               data_format="NHWC",
                               param_attr=fluid.ParamAttr(name="%s_conv2d_wt" %
                                                          name))
            x_conv = x_conv + bt
            return (x_conv[:, :, :, 0:c_out] + x_input) * fl.sigmoid(
                x_conv[:, :, :, -c_out:])
        else:
            bt_init = fluid.initializer.ConstantInitializer(value=0.0)
            bt = fl.create_parameter(
                shape=[c_out],
                dtype="float32",
                attr=fluid.ParamAttr(name="%s_bt" % name,
                                     trainable=True,
                                     initializer=bt_init),
            )
            x_conv = fl.conv2d(input=x,
                               num_filters=c_out,
                               filter_size=[Kt, 1],
                               stride=[1, 1],
                               padding="SAME",
                               data_format="NHWC",
                               param_attr=fluid.ParamAttr(name="%s_conv2d_wt" %
                                                          name))
            x_conv = x_conv + bt
            if act_func == "linear":
                return x_conv
            elif act_func == "sigmoid":
                return fl.sigmoid(x_conv)
            elif act_func == "relu":
                return fl.relu(x_conv + x_input)
            else:
                raise ValueError(
                    f'ERROR: activation function "{act_func}" is not defined.')
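
The GLU branch above gates a residual connection: the 2*c_out-channel convolution output is split channel-wise into halves P and Q, and the layer returns (P + x_input) * sigmoid(Q). A NumPy sketch of that gating, assuming NHWC tensors of matching shape:

import numpy as np

def glu_gate(x_conv, x_input, c_out):
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    P, Q = x_conv[..., :c_out], x_conv[..., c_out:]  # split the doubled channels
    return (P + x_input) * sigmoid(Q)                # gated residual output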
Code example #30
    def net(self, inputs, is_infer=False):
        if is_infer:
            bs = self.evaluate_batch_size
        else:
            bs = self.train_batch_size

        stdv = 1.0 / math.sqrt(self.hidden_size)

        def embedding_layer(input,
                            table_name,
                            emb_dim,
                            initializer_instance=None):
            emb = fluid.embedding(
                input=input,
                size=[self.dict_size, emb_dim],
                param_attr=fluid.ParamAttr(
                    name=table_name, initializer=initializer_instance))
            return emb

        sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
        items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
                                    sparse_initializer)
        pre_state = items_emb
        for i in range(self.step):
            pre_state = layers.reshape(
                x=pre_state, shape=[bs, -1, self.hidden_size])
            state_in = layers.fc(
                input=pre_state,
                name="state_in",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
            state_out = layers.fc(
                input=pre_state,
                name="state_out",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

            state_adj_in = layers.matmul(inputs[3],
                                         state_in)  # [batch_size, uniq_max, h]
            state_adj_out = layers.matmul(
                inputs[4], state_out)  # [batch_size, uniq_max, h]

            gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

            gru_input = layers.reshape(
                x=gru_input, shape=[-1, self.hidden_size * 2])
            gru_fc = layers.fc(input=gru_input,
                               name="gru_fc",
                               size=3 * self.hidden_size,
                               bias_attr=False)
            pre_state, _, _ = fluid.layers.gru_unit(
                input=gru_fc,
                hidden=layers.reshape(
                    x=pre_state, shape=[-1, self.hidden_size]),
                size=3 * self.hidden_size)

        final_state = layers.reshape(
            pre_state, shape=[bs, -1, self.hidden_size])
        seq = layers.gather_nd(final_state, inputs[1])
        last = layers.gather_nd(final_state, inputs[2])

        seq_fc = layers.fc(
            input=seq,
            name="seq_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
        last_fc = layers.fc(input=last,
                            name="last_fc",
                            size=self.hidden_size,
                            bias_attr=False,
                            act=None,
                            num_flatten_dims=1,
                            param_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Uniform(
                                    low=-stdv, high=stdv)))  # [batch_size, h]

        seq_fc_t = layers.transpose(
            seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
        add = layers.elementwise_add(seq_fc_t,
                                     last_fc)  # [seq_max, batch_size, h]
        b = layers.create_parameter(
            shape=[self.hidden_size],
            dtype='float32',
            default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
        add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

        add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
        add_sigmoid = layers.transpose(
            add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

        weight = layers.fc(
            input=add_sigmoid,
            name="weight_fc",
            size=1,
            act=None,
            num_flatten_dims=2,
            bias_attr=False,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
        weight *= inputs[5]
        weight_mask = layers.elementwise_mul(
            seq, weight, axis=0)  # [batch_size, seq_max, h]
        global_attention = layers.reduce_sum(
            weight_mask, dim=1)  # [batch_size, h]

        final_attention = layers.concat(
            [global_attention, last], axis=1)  # [batch_size, 2*h]
        final_attention_fc = layers.fc(
            input=final_attention,
            name="final_attention_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        # all_vocab = layers.create_global_var(
        #     shape=[items_num - 1],
        #     value=0,
        #     dtype="int64",
        #     persistable=True,
        #     name="all_vocab")
        all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
        all_vocab = fluid.layers.cast(
            x=fluid.layers.assign(all_vocab), dtype='int64')

        all_emb = fluid.embedding(
            input=all_vocab,
            param_attr=fluid.ParamAttr(
                name="emb",
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)),
            size=[self.dict_size, self.hidden_size])  # [all_vocab, h]

        logits = layers.matmul(
            x=final_attention_fc, y=all_emb,
            transpose_y=True)  # [batch_size, all_vocab]
        softmax = layers.softmax_with_cross_entropy(
            logits=logits, label=inputs[6])  # [batch_size, 1]
        self.loss = layers.reduce_mean(softmax)  # [1]
        self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)

        self._cost = self.loss
        if is_infer:
            self._infer_results['acc'] = self.acc
            self._infer_results['loss'] = self.loss
            return

        self._metrics["LOSS"] = self.loss
        self._metrics["train_acc"] = self.acc
Code example #31
File: SSAN_encoder.py Project: ly-888/Research

import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         structure_mask,
                         with_ent_structure,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activiation to mask certain selected positions so that
    they will not considered in attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: queries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')

        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of inpunt tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of inpunt tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, biaffine_transformation, biaffine_transformation_bias,
                                     structure_mask, with_ent_structure, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        if with_ent_structure:
            # TRANSFORMATION
            # 1.reshape input
            # q: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden]
            # transformation: [dependencies(5), n_head, hidden, hidden] -> [1, dependencies(5), n_head, hidden, hidden]
            # k: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden]
            q_ = layers.unsqueeze(scaled_q, [1])
            q_ = layers.expand(q_, [1, biaffine_transformation.shape[0], 1, 1, 1])
            biaffine_transformation_ = layers.unsqueeze(biaffine_transformation, [0])
            biaffine_transformation_ = layers.expand(biaffine_transformation_, [q_.shape[0], 1, 1, 1, 1])
            k_ = layers.unsqueeze(k, [1])
            k_ = layers.expand(k_, [1, biaffine_transformation.shape[0], 1, 1, 1])

            # 2.implement matmul
            # q * transformation: [bs, dependencies(5), n_head, seq, hidden]
            # q * transformation * k: [bs, dependencies(5), n_head, seq, seq]
            structured_bias = layers.matmul(x=q_, y=biaffine_transformation_)
            structured_bias = layers.matmul(x=structured_bias, y=k_, transpose_y=True)

            structured_bias = layers.elementwise_add(structured_bias, biaffine_transformation_bias, axis=1)

            # mask & apply
            structured_bias = structured_bias * structure_mask
            structured_bias = layers.reduce_sum(structured_bias, dim=1)
            product += structured_bias

        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(
                cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(
                cache["v"], shape=[0, 0, d_model]), v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    biaffine_transformation = layers.create_parameter(
        [5, n_head, d_key, d_key], core.VarDesc.VarType.FP32,
        name=name + '_biaffine_transformation', attr=None,
        is_bias=False, default_initializer=param_initializer)
    biaffine_transformation_bias = layers.create_parameter(
        [5, n_head], core.VarDesc.VarType.FP32,
        name=name + '_biaffine_transformation_bias', attr=None,
        is_bias=False, default_initializer=param_initializer)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias,
                                                  biaffine_transformation, biaffine_transformation_bias,
                                                  structure_mask, with_ent_structure, d_key, dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
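
The structured bias computed in scaled_dot_product_attention can be written compactly: for each of the 5 dependency types d, bias_d = q_scaled @ W_d @ k^T + b_d, and the final logits are q k^T / sqrt(d_key) + sum_d mask_d * bias_d + attn_bias. A NumPy sketch of that term under the shapes used above:

import numpy as np

def structured_bias(q_scaled, k, W, b, mask):
    # q_scaled, k: [bs, n_head, seq, d]; W: [5, n_head, d, d]; b: [5, n_head]
    # mask: [bs, 5, n_head, seq, seq] selects which dependency applies per pair
    bias = np.einsum('bhsd,fhde,bhte->bfhst', q_scaled, W, k)
    bias += b[None, :, :, None, None]
    return (bias * mask).sum(axis=1)  # [bs, n_head, seq, seq]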