Example 1
    def label_embed_input(self, feature):
        label = F.data(name="label", shape=[None, 1], dtype="int64")
        label_idx = F.data(name='label_idx', shape=[None], dtype="int64")
        label = L.reshape(label, shape=[-1])
        label = L.gather(label, label_idx, overwrite=False)

        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        feature = L.layer_norm(feature,
                               name='layer_norm_feature_input1',
                               param_attr=lay_norm_attr,
                               bias_attr=lay_norm_bias)

        embed_attr = F.ParamAttr(
            initializer=F.initializer.NormalInitializer(loc=0.0, scale=1.0))
        embed = F.embedding(input=label,
                            size=(self.out_size, self.embed_size),
                            param_attr=embed_attr)
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        embed = L.layer_norm(embed,
                             name='layer_norm_feature_input2',
                             param_attr=lay_norm_attr,
                             bias_attr=lay_norm_bias)
        embed = L.relu(embed)

        feature_label = L.gather(feature, label_idx, overwrite=False)
        feature_label = feature_label + embed
        feature = L.scatter(feature, label_idx, feature_label, overwrite=True)

        return feature
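
The gather/add/scatter pattern above injects the label embedding only at the rows selected by label_idx and leaves every other node feature untouched. A minimal NumPy sketch of the same data movement (array shapes and names are illustrative, not from the original):

import numpy as np

# feature: all node features; label_idx: nodes with visible labels;
# embed: the label embeddings computed for those nodes.
feature = np.zeros((5, 4), dtype="float32")
label_idx = np.array([1, 3])
embed = np.ones((2, 4), dtype="float32")

feature_label = feature[label_idx] + embed   # L.gather(feature, label_idx) + embed
feature[label_idx] = feature_label           # L.scatter(..., overwrite=True)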
Example 2
def pre_post_process_layer(prev_out,
                           out,
                           process_cmd,
                           dropout_rate=0.,
                           epsilon=1e-5,
                           name=""):
    """Add a pre-process or post-process between sub layers.

    Add residual connection, layer normalization and droput to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + "_layer_norm_scale",
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + "_layer_norm_bias",
                    initializer=fluid.initializer.Constant(0.)),
                epsilon=epsilon)
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(out,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
    return out
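
A hedged usage sketch for the helper above, assuming PaddlePaddle 1.x static-graph mode with the usual fluid/layers imports; the Transformer reference code typically binds the command string with functools.partial, roughly as below (tensor names and sizes are illustrative):

from functools import partial

import paddle.fluid as fluid        # the helper expects fluid/layers in scope
import paddle.fluid.layers as layers

# "n" runs layer_norm only (pre-process); "dan" runs dropout, residual add,
# then layer_norm (post-process).
pre_process_layer = partial(pre_post_process_layer, None)   # no residual input
post_process_layer = pre_post_process_layer

x = layers.data(name="x", shape=[8, 16], dtype="float32",
                append_batch_size=False)
normed = pre_process_layer(x, "n", name="enc_pre")
sub_out = layers.fc(normed, size=16, name="enc_fc")
out = post_process_layer(x, sub_out, "dan", dropout_rate=0.1, name="enc_post")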
Example 3
 def mlp(self, features, name):
     h = features
     dim = features.shape[-1]
     dim_list = [dim * 2, dim]
     for i in range(2):
         h = L.fc(h,
                  size=dim_list[i],
                  name="%s_fc_%s" % (name, i),
                  act=None)
         if self.args.norm_type == "layer_norm":
             log.info("norm_type is %s" % self.args.norm_type)
             h = L.layer_norm(
                 h,
                 begin_norm_axis=1,
                 param_attr=F.ParamAttr(
                     name="norm_scale_%s_%s" % (name, i),
                     initializer=F.initializer.Constant(1.0)),
                 bias_attr=F.ParamAttr(
                     name="norm_bias_%s_%s" % (name, i),
                     initializer=F.initializer.Constant(0.0)),
             )
         else:
             log.info("using batch_norm")
             h = L.batch_norm(h)
         h = pgl.layers.graph_norm(self.graph_wrapper, h)
         h = L.relu(h)
     return h
Example 4
    def embed_input(self, feature, name, norm=True):
        fan_in = feature.shape[-1]
        bias_bound = 1.0 / math.sqrt(fan_in)
        fc_bias_attr = F.ParamAttr(
            initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                         high=bias_bound))

        negative_slope = math.sqrt(5)
        gain = math.sqrt(2.0 / (1 + negative_slope**2))
        std = gain / math.sqrt(fan_in)
        weight_bound = math.sqrt(3.0) * std
        fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
            low=-weight_bound, high=weight_bound))

        feature = L.fc(feature,
                       self.embed_size,
                       act=None,
                       param_attr=fc_w_attr,
                       bias_attr=fc_bias_attr,
                       name=name + "_node_feature_encoder")

        if norm:
            lay_norm_attr = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=1))
            lay_norm_bias = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=0))
            feature = L.layer_norm(feature,
                                   name=name + '_layer_norm_feature_input',
                                   param_attr=lay_norm_attr,
                                   bias_attr=lay_norm_bias)

        return feature
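
The bound arithmetic above mirrors a Kaiming-uniform scheme with negative_slope = sqrt(5) (the same default PyTorch uses for nn.Linear); working the numbers through shows the weight bound collapses to 1/sqrt(fan_in), the same as the bias bound. A quick check with an illustrative fan_in:

import math

fan_in = 256                                         # illustrative feature width
negative_slope = math.sqrt(5)
gain = math.sqrt(2.0 / (1 + negative_slope ** 2))    # sqrt(2/6) = sqrt(1/3)
weight_bound = math.sqrt(3.0) * gain / math.sqrt(fan_in)
assert abs(weight_bound - 1.0 / math.sqrt(fan_in)) < 1e-12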
Example 5
    def output_layer(self, x, T, name, act_func='GLU'):
        """Output layer"""
        _, _, n, channel = x.shape

        # maps multi-steps to one.
        x_i = self.temporal_conv_layer(x=x,
                                       Kt=T,
                                       c_in=channel,
                                       c_out=channel,
                                       name="%s_in" % name,
                                       act_func=act_func)
        x_ln = fl.layer_norm(x_i)
        x_o = self.temporal_conv_layer(x=x_ln,
                                       Kt=1,
                                       c_in=channel,
                                       c_out=channel,
                                       name="%s_out" % name,
                                       act_func='sigmoid')

        # maps multi-channels to one.
        x_fc = self.fully_con_layer(x=x_o,
                                    n=n,
                                    channel=channel,
                                    name="%s_fc" % name)
        return x_fc
Example 6
def pre_post_process_layer(prev_out,
                           out,
                           process_cmd,
                           dropout_rate=0.,
                           name='',
                           is_test=False):

    for cmd in process_cmd:
        if cmd == "a":  # 两个输入相加
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # 进行normalization
            out_type = out.dtype
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # 进行dropout
            if dropout_rate:
                out = layers.dropout(out,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=is_test)
    return out
Example 7
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out
Example 8
def self_attention_and_residual(feature, size, input_mask, name, maxlen):
    query = L.fc(feature, size, name=name + "_query", num_flatten_dims=2)
    key = L.fc(feature, size, name=name + "_key", num_flatten_dims=2)
    value = L.fc(feature, size, name=name + "_value", num_flatten_dims=2)
    attention = L.softmax(L.matmul(query, key, transpose_y=True) + input_mask)
    output = L.matmul(attention, value)
    output = L.fc(output, size, name=name + "_model", num_flatten_dims=2)
    output = L.relu(output + feature)
    output = L.layer_norm(output, begin_norm_axis=2, name=name + '_ln')
    return output
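
A minimal build-time sketch of how the block above might be called, assuming the snippet's L alias for paddle.fluid.layers (shapes and tensor names are illustrative): feature is (batch, maxlen, hidden), and input_mask is added to the raw attention logits, so it should hold 0 at valid positions and a large negative value (e.g. -1e4) at padded ones.

import paddle.fluid.layers as L

feature = L.data(name="feat", shape=[4, 32, 64], dtype="float32",
                 append_batch_size=False)
input_mask = L.data(name="mask", shape=[4, 32, 32], dtype="float32",
                    append_batch_size=False)
out = self_attention_and_residual(feature, size=64, input_mask=input_mask,
                                  name="enc0", maxlen=32)   # (4, 32, 64)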
Example 9
    def embed_input(self, feature):

        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        feature = L.layer_norm(feature,
                               name='layer_norm_feature_input',
                               param_attr=lay_norm_attr,
                               bias_attr=lay_norm_bias)

        return feature
Example 10
def layer_norm(feature, name=""):
    lay_norm_attr = F.ParamAttr(
        name="attr_%s" % name,
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        name="bias_%s" % name,
        initializer=F.initializer.ConstantInitializer(value=0))

    feature = L.layer_norm(feature,
                           param_attr=lay_norm_attr,
                           bias_attr=lay_norm_bias)

    return feature
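
A minimal usage sketch for this wrapper, assuming the snippets' L alias for paddle.fluid.layers and PaddlePaddle 1.x static-graph mode (tensor name and sizes are illustrative):

import paddle.fluid.layers as L

x = L.data(name="node_feat", shape=[128, 64], dtype="float32",
           append_batch_size=False)
# begin_norm_axis defaults to 1, so a (num_nodes, feature_size) input is
# normalized over its feature dimension.
y = layer_norm(x, name="encoder_out")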
Example 11
    def get_batch_feature(self, feature):
        batch_nodes = F.data(name='batch_nodes_0', shape=[None], dtype="int64")
        #         feature=L.index_select(feature, batch_nodes)
        feature = L.gather(feature, batch_nodes, overwrite=False)

        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        feature = L.layer_norm(feature,
                               name='layer_norm_feature_input',
                               param_attr=lay_norm_attr,
                               bias_attr=lay_norm_bias)

        #         feature=L.dropout(feature, dropout_prob=self.dropout, dropout_implementation='upscale_in_train',
        #                          is_test=test)
        return feature
Example 12
    def forward(self):
        """forward"""
        features_list = [self.gw.node_feat["attr"]]

        for i in range(self.num_layers):
            h = gin(self.gw,
                    features_list[i],
                    hidden_size=self.hidden_size,
                    activation="relu",
                    name="gin_%s" % (i),
                    init_eps=0.0,
                    train_eps=self.train_eps)

            h = fl.layer_norm(
                h,
                begin_norm_axis=1,
                param_attr=fluid.ParamAttr(
                    name="norm_scale_%s" % (i),
                    initializer=fluid.initializer.Constant(1.0)),
                bias_attr=fluid.ParamAttr(
                    name="norm_bias_%s" % (i),
                    initializer=fluid.initializer.Constant(0.0)), )

            h = fl.relu(h)

            features_list.append(h)

        output = 0
        for i, h in enumerate(features_list):
            pooled_h = pgl.layers.graph_pooling(self.gw, h, self.pool_type)
            drop_h = fl.dropout(
                pooled_h,
                self.dropout_prob,
                dropout_implementation="upscale_in_train")
            output += fl.fc(drop_h,
                            size=self.num_class,
                            act=None,
                            param_attr=fluid.ParamAttr(name="final_fc_%s" %
                                                       (i)))

        # calculate loss
        self.loss = fl.softmax_with_cross_entropy(output, self.labels)
        self.loss = fl.reduce_mean(self.loss)
        self.acc = fl.accuracy(fl.softmax(output), self.labels)
Example 13
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.

    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(out,
                                    begin_norm_axis=len(out.shape) - 1,
                                    param_attr=fluid.initializer.Constant(1.),
                                    bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
Example 14
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.

    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
Example 15
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., seed=0,
                           name='', debug=False):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    out: (b, seq-len, hidden size)
    """
    # logger.info(f"pre_post_process_layer out: {prev_out}")
    debug_dict = {}
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    seed=seed,
                    is_test=False)
        
        if debug:
            debug_dict[f"out_{cmd}"] = out
    if debug:
        debug_dict[f"out"] = out
        return debug_dict
    else:
        return out
Example 16
    def st_conv_block(self,
                      x,
                      Ks,
                      Kt,
                      channels,
                      name,
                      keep_prob,
                      act_func='GLU'):
        """Spatio-Temporal convolution block"""
        c_si, c_t, c_oo = channels

        x_s = self.temporal_conv_layer(x,
                                       Kt,
                                       c_si,
                                       c_t,
                                       "%s_tconv_in" % name,
                                       act_func=act_func)
        x_t = self.spatio_conv_layer(x_s, Ks, c_t, c_t, "%s_sonv" % name)
        x_o = self.temporal_conv_layer(x_t, Kt, c_t, c_oo,
                                       "%s_tconv_out" % name)

        x_ln = fl.layer_norm(x_o)
        return fl.dropout(x_ln, dropout_prob=(1.0 - keep_prob))
Example 17
    def forward(self, graph_wrapper, is_test=False):
        """
        Build the network.
        """
        node_features = self._mol_encoder(graph_wrapper, name=self.name)

        features_list = [node_features]
        for layer in range(self.layer_num):
            edge_features = self._bond_encoder(
                    graph_wrapper, 
                    name='%s_layer%s' % (self.name, layer))
            if self.gnn_type == "gcn":
                feat = gcn_layer(
                        graph_wrapper,
                        features_list[layer],
                        edge_features,
                        act="relu",
                        name="%s_layer%s_gcn" % (self.name, layer))
            elif self.gnn_type == "gat":
                feat = gat_layer(
                        graph_wrapper, 
                        features_list[layer],
                        edge_features,
                        self.embed_dim,
                        act="relu",
                        name="%s_layer%s_gat" % (self.name, layer))
            else:
                feat = gin_layer(
                        graph_wrapper,
                        features_list[layer],
                        edge_features,
                        name="%s_layer%s_gin" % (self.name, layer))

            if self.norm_type == 'batch_norm':
                feat = layers.batch_norm(
                        feat, 
                        param_attr=fluid.ParamAttr(
                            name="%s_layer%s_batch_norm_scale" % (self.name, layer),
                            initializer=fluid.initializer.Constant(1.0)),
                        bias_attr=fluid.ParamAttr(
                            name="%s_layer%s_batch_norm_bias" % (self.name, layer),
                            initializer=fluid.initializer.Constant(0.0)),
                        moving_mean_name="%s_layer%s_batch_norm_moving_avearage" % (self.name, layer),
                        moving_variance_name="%s_layer%s_batch_norm_moving_variance" % (self.name, layer),
                        is_test=is_test)
            elif self.norm_type == 'layer_norm':
                feat = layers.layer_norm(
                        feat, 
                        param_attr=fluid.ParamAttr(
                            name="%s_layer%s_layer_norm_scale" % (self.name, layer),
                            initializer=fluid.initializer.Constant(1.0)),
                        bias_attr=fluid.ParamAttr(
                            name="%s_layer%s_layer_norm_bias" % (self.name, layer),
                            initializer=fluid.initializer.Constant(0.0)))
            else:
                raise ValueError('%s not supported.' % self.norm_type)

            if self.graph_norm:
                feat = pgl.layers.graph_norm(graph_wrapper, feat)

            if layer < self.layer_num - 1:
                feat = layers.relu(feat)
            feat = layers.dropout(
                    feat,
                    self.dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=is_test)

            # residual
            if self.residual:
                feat = feat + features_list[layer]

            features_list.append(feat)

        if self.JK == "sum":
            node_repr = layers.reduce_sum(features_list, axis=0)
        elif self.JK == "mean":
            node_repr = layers.reduce_mean(features_list, axis=0)
        elif self.JK == "last":
            node_repr = features_list[-1]
        else:
            node_repr = features_list[-1]
        return node_repr
Example 18
    def _build_net(self):
        # ConvLSTM2D
        rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
            rnn_input=self.input,
            init_hidden=None,
            init_cell=None,
            padding=1,
            hidden_h=self.h,
            hidden_w=self.w,
            filters=self.filters,
            filter_size=self.filter_size,
            sequence_length=self.input_seqlen)

        # Layer Norm
        bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

        # ConvLSTM2D
        rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
            rnn_input=bn,
            init_hidden=None,
            init_cell=None,
            padding=1,
            hidden_h=self.h,
            hidden_w=self.w,
            filters=self.filters,
            filter_size=self.filter_size,
            sequence_length=self.input_seqlen)

        # Layer Norm
        bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

        # ConvLSTM2D
        rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
            rnn_input=bn,
            init_hidden=None,
            init_cell=None,
            padding=1,
            hidden_h=self.h,
            hidden_w=self.w,
            filters=self.filters,
            filter_size=self.filter_size,
            sequence_length=self.input_seqlen)

        # Layer Norm
        bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

        # ConvLSTM2D
        rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
            rnn_input=bn,
            init_hidden=None,
            init_cell=None,
            padding=1,
            hidden_h=self.h,
            hidden_w=self.w,
            filters=self.filters,
            filter_size=self.filter_size,
            sequence_length=self.input_seqlen)

        # Layer Norm
        bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

        # Transpose : (batch x C x D x H x W)
        tr = layers.transpose(bn, [0, 4, 1, 2, 3])

        # Conv3D
        conv3d = layers.conv3d(input=tr,
                               num_filters=2,
                               filter_size=3,
                               padding=1)
        # conv3d : (batch x C x D x H x W)

        conv3d = layers.transpose(conv3d, [0, 2, 3, 4, 1])
        # conv3d: (batch x D x H x W x C)

        return conv3d
Example 19
    def get_gat_layer(self,
                      i,
                      gw,
                      feature,
                      hidden_size,
                      num_heads,
                      concat=True,
                      layer_norm=True,
                      relu=True,
                      gate=False):

        fan_in = feature.shape[-1]
        bias_bound = 1.0 / math.sqrt(fan_in)
        fc_bias_attr = F.ParamAttr(
            initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                         high=bias_bound))

        negative_slope = math.sqrt(5)
        gain = math.sqrt(2.0 / (1 + negative_slope**2))
        std = gain / math.sqrt(fan_in)
        weight_bound = math.sqrt(3.0) * std
        fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
            low=-weight_bound, high=weight_bound))

        if concat:
            skip_feature = L.fc(feature,
                                hidden_size * num_heads,
                                param_attr=fc_w_attr,
                                name='fc_skip_' + str(i),
                                bias_attr=fc_bias_attr)

        else:
            skip_feature = L.fc(feature,
                                hidden_size,
                                param_attr=fc_w_attr,
                                name='fc_skip_' + str(i),
                                bias_attr=fc_bias_attr)

        out_feat = transformer_gat_pgl(
            gw,
            feature,
            hidden_size,
            'gat_' + str(i),
            num_heads,
            concat=concat,
        )
        #         out_feat= out_feat + skip_feature

        if gate:

            fan_in = out_feat.shape[-1] * 3
            bias_bound = 1.0 / math.sqrt(fan_in)
            fc_bias_attr = F.ParamAttr(
                initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                             high=bias_bound))

            negative_slope = math.sqrt(5)
            gain = math.sqrt(2.0 / (1 + negative_slope**2))
            std = gain / math.sqrt(fan_in)
            weight_bound = math.sqrt(3.0) * std
            fc_w_attr = F.ParamAttr(
                initializer=F.initializer.UniformInitializer(
                    low=-weight_bound, high=weight_bound))

            gate_f = L.fc([skip_feature, out_feat, out_feat - skip_feature],
                          1,
                          param_attr=fc_w_attr,
                          name='gate_' + str(i),
                          bias_attr=fc_bias_attr)

            gate_f = L.sigmoid(gate_f)

            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)

        else:
            out_feat = out_feat + skip_feature

        if layer_norm:
            lay_norm_attr = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=1))
            lay_norm_bias = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=0))
            out_feat = L.layer_norm(out_feat,
                                    name='layer_norm_' + str(i),
                                    param_attr=lay_norm_attr,
                                    bias_attr=lay_norm_bias)
        if relu:
            out_feat = L.relu(out_feat)


#         if dropout>1e-15:
#             out_feat = L.dropout(out_feat, dropout_prob=dropout,
#                                  dropout_implementation='upscale_in_train', is_test=test)
#         sub_node_index=F.data(name='sub_node_index_'+str(i), shape=[None], dtype="int64")
# #         out_feat=L.index_select(out_feat, sub_node_index)
#         out_feat=L.gather(out_feat, sub_node_index)
        return out_feat
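
The gate branch above learns one scalar per node from [skip, out, out - skip] and mixes the skip path with the attention output. A small NumPy sketch of that combination (w and b are illustrative stand-ins for the fc parameters):

import numpy as np

def gated_skip(skip, out, w, b):
    # skip, out: (num_nodes, dim); w: (3 * dim, 1); b: (1,)
    z = np.concatenate([skip, out, out - skip], axis=-1) @ w + b
    gate = 1.0 / (1.0 + np.exp(-z))              # L.sigmoid(gate_f)
    return gate * skip + (1.0 - gate) * out      # skip * g + out * (1 - g)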
Example 20
def gin(gw,
        feature,
        hidden_size,
        activation,
        name,
        init_eps=0.0,
        train_eps=False):
    """Implementation of Graph Isomorphism Network (GIN) layer.

    This is an implementation of the paper How Powerful are Graph Neural Networks?
    (https://arxiv.org/pdf/1810.00826.pdf).

    In their implementation, all MLPs have 2 layers. Batch normalization is applied
    on every hidden layer.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        name: GIN layer names.

        hidden_size: The hidden size for gin.

        activation: The activation for the output.

        init_eps: float, optional
            Initial :math:`\epsilon` value, default is 0.

        train_eps: bool, optional
            if True, :math:`\epsilon` will be a learnable parameter.

    Return:
        A tensor with shape (num_nodes, hidden_size).
    """
    def send_src_copy(src_feat, dst_feat, edge_feat):
        return src_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=fluid.ParamAttr(name="%s_eps" % name),
        default_initializer=fluid.initializer.ConstantInitializer(
            value=init_eps))

    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)

    output = L.fc(output,
                  size=hidden_size,
                  act=None,
                  param_attr=fluid.ParamAttr(name="%s_w_0" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_0" % name))

    output = L.layer_norm(
        output,
        begin_norm_axis=1,
        param_attr=fluid.ParamAttr(
            name="norm_scale_%s" % (name),
            initializer=fluid.initializer.Constant(1.0)),
        bias_attr=fluid.ParamAttr(name="norm_bias_%s" % (name),
                                  initializer=fluid.initializer.Constant(0.0)),
    )

    if activation is not None:
        output = getattr(L, activation)(output)

    output = L.fc(output,
                  size=hidden_size,
                  act=activation,
                  param_attr=fluid.ParamAttr(name="%s_w_1" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_1" % name))

    return output
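
For reference, a dense NumPy sketch of the aggregation that gw.send / gw.recv perform above before the two fc layers (PGL does this with sparse message passing; the adjacency matrix here is only for illustration):

import numpy as np

def gin_aggregate(adj, h, eps=0.0):
    # adj[i, j] = 1 if there is an edge j -> i; h: (num_nodes, feature_size)
    # recv(msg, "sum") sums incoming neighbor features; the node then adds
    # (1 + eps) times its own feature.
    return adj @ h + (1.0 + eps) * h

adj = np.array([[0, 1], [1, 0]], dtype="float32")
h = np.array([[1.0, 2.0], [3.0, 4.0]], dtype="float32")
print(gin_aggregate(adj, h))   # [[4. 6.] [4. 6.]]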
Example 21
    def get_gat_layer(self,
                      i,
                      gw,
                      feature,
                      hidden_size,
                      num_heads,
                      concat=True,
                      layer_norm=True,
                      relu=True,
                      gate=False,
                      edge_feature=None):

        fan_in = feature.shape[-1]
        bias_bound = 1.0 / math.sqrt(fan_in)
        fc_bias_attr = F.ParamAttr(
            initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                         high=bias_bound))

        negative_slope = math.sqrt(5)
        gain = math.sqrt(2.0 / (1 + negative_slope**2))
        std = gain / math.sqrt(fan_in)
        weight_bound = math.sqrt(3.0) * std
        fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
            low=-weight_bound, high=weight_bound))

        if concat:
            skip_feature = L.fc(feature,
                                hidden_size * num_heads,
                                param_attr=fc_w_attr,
                                name='fc_skip_' + str(i),
                                bias_attr=fc_bias_attr)
        else:
            skip_feature = L.fc(feature,
                                hidden_size,
                                param_attr=fc_w_attr,
                                name='fc_skip_' + str(i),
                                bias_attr=fc_bias_attr)
        out_feat = transformer_gat_pgl(gw,
                                       feature,
                                       hidden_size,
                                       'gat_' + str(i),
                                       num_heads,
                                       concat=concat,
                                       edge_feature=edge_feature)
        #         out_feat= out_feat + skip_feature

        if gate:

            fan_in = out_feat.shape[-1] * 3
            bias_bound = 1.0 / math.sqrt(fan_in)
            fc_bias_attr = F.ParamAttr(
                initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                             high=bias_bound))

            negative_slope = math.sqrt(5)
            gain = math.sqrt(2.0 / (1 + negative_slope**2))
            std = gain / math.sqrt(fan_in)
            weight_bound = math.sqrt(3.0) * std
            fc_w_attr = F.ParamAttr(
                initializer=F.initializer.UniformInitializer(
                    low=-weight_bound, high=weight_bound))

            gate_f = L.fc([skip_feature, out_feat, out_feat - skip_feature],
                          1,
                          param_attr=fc_w_attr,
                          name='gate_' + str(i),
                          bias_attr=fc_bias_attr)

            gate_f = L.sigmoid(gate_f)

            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)

        else:
            out_feat = out_feat + skip_feature

        if layer_norm:
            lay_norm_attr = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=1))
            lay_norm_bias = F.ParamAttr(
                initializer=F.initializer.ConstantInitializer(value=0))
            out_feat = L.layer_norm(out_feat,
                                    name='layer_norm_' + str(i),
                                    param_attr=lay_norm_attr,
                                    bias_attr=lay_norm_bias)
        if relu:
            out_feat = L.relu(out_feat)
        return out_feat
Example 22
def graph_transformer(name,
                      gw,
                      feature,
                      hidden_size,
                      num_heads=4,
                      attn_drop=False,
                      edge_feature=None,
                      concat=True,
                      skip_feat=True,
                      gate=False,
                      layer_norm=True,
                      relu=True,
                      is_test=False):
    """Implementation of graph Transformer from UniMP

    This is an implementation of the paper Unified Message Passing Model for Semi-Supervised Classification
    (https://arxiv.org/abs/2009.03509).

    Args:
        name: Graph Transformer layer name.
        
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        hidden_size: The hidden size for graph transformer.

        num_heads: The head number in graph transformer.

        attn_drop: Whether to apply dropout to the attention weights.
        
        edge_feature: A tensor with shape (num_edges, feature_size).
        
        concat: Whether to reshape the (num_nodes, num_heads, hidden_size) output
            to (num_nodes, hidden_size * num_heads) by concatenating the heads,
            or to (num_nodes, hidden_size) by averaging over them.

        skip_feat: Whether to use a skip connection.

        gate: Whether to combine skip_feat and the output with a learned gate weight.

        layer_norm: Whether to apply layer_norm to the output.

        relu: Whether to apply relu activation to the output.

        is_test: Whether in the test phase.

    Return:
        A tuple (out_feat, alpha, checkpoints), where out_feat has shape
        (num_nodes, hidden_size * num_heads) if concat else (num_nodes, hidden_size).
    """
    def send_attention(src_feat, dst_feat, edge_feat):
        if edge_feat is None or not edge_feat:
            output = src_feat["k_h"] * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            #             alpha = paddle_helper.sequence_softmax(output)
            return {
                "alpha": output,
                "v": src_feat["v_h"]
            }  # batch x h     batch x h x feat
        else:
            edge_feat = edge_feat["edge"]
            edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size])
            output = (src_feat["k_h"] + edge_feat) * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            #             alpha = paddle_helper.sequence_softmax(output)
            return {
                "alpha": output,
                "v": (src_feat["v_h"] + edge_feat)
            }  # batch x h     batch x h x feat

    class Reduce_attention():
        def __init__(self, ):
            self.alpha = None

        def __call__(self, msg):
            alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
            if attn_drop:
                old_h = alpha
                dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
                u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                                     min=0.,
                                     max=1.)
                keeped = L.cast(u > dropout, dtype="float32")
                self_attn_mask = L.scale(x=keeped,
                                         scale=10000.0,
                                         bias=-1.0,
                                         bias_after_scale=False)
                n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                                axis=1)
                n_head_self_attn_mask.stop_gradient = True
                alpha = n_head_self_attn_mask + alpha
                alpha = L.lod_reset(alpha, old_h)

            h = msg["v"]
            alpha = paddle_helper.sequence_softmax(alpha)

            self.alpha = alpha
            old_h = h
            h_mean = L.sequence_pool(h, "average")
            h = h * alpha
            h = L.lod_reset(h, old_h)
            h = L.sequence_pool(h, "sum")

            h = h * 0.8 + h_mean * 0.2

            if concat:
                h = L.reshape(h, [-1, num_heads * hidden_size])
            else:
                h = L.reduce_mean(h, dim=1)
            return h

    reduce_attention = Reduce_attention()

    q = linear(feature,
               hidden_size * num_heads,
               name=name + '_q_weight',
               init_type='gcn')
    k = linear(feature,
               hidden_size * num_heads,
               name=name + '_k_weight',
               init_type='gcn')
    v = linear(feature,
               hidden_size * num_heads,
               name=name + '_v_weight',
               init_type='gcn')

    reshape_q = L.reshape(q, [-1, num_heads, hidden_size])
    reshape_k = L.reshape(k, [-1, num_heads, hidden_size])
    reshape_v = L.reshape(v, [-1, num_heads, hidden_size])

    msg = gw.send(send_attention,
                  nfeat_list=[("q_h", reshape_q), ("k_h", reshape_k),
                              ("v_h", reshape_v)],
                  efeat_list=edge_feature)
    out_feat = gw.recv(msg, reduce_attention)
    checkpoints = [out_feat]

    if skip_feat:
        if concat:

            out_feat, cks = appnp(gw, out_feat, k_hop=3, name=name + "_appnp")
            #             out_feat, cks = appnp(gw, out_feat, k_hop=3)
            checkpoints.append(out_feat)

            #             The UniMP-xxlarge will come soon.
            #             out_feat, cks = appnp(gw, out_feat, k_hop=6)
            #             out_feat, cks = appnp(gw, out_feat, k_hop=9)
            #             checkpoints = checkpoints + cks

            skip_feature = linear(feature,
                                  hidden_size * num_heads,
                                  name=name + '_skip_weight',
                                  init_type='lin')
        else:

            skip_feature = linear(feature,
                                  hidden_size,
                                  name=name + '_skip_weight',
                                  init_type='lin')

        if gate:
            temp_output = L.concat(
                [skip_feature, out_feat, out_feat - skip_feature], axis=-1)
            gate_f = L.sigmoid(
                linear(temp_output,
                       1,
                       name=name + '_gate_weight',
                       init_type='lin'))
            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
        else:
            out_feat = skip_feature + out_feat

    if layer_norm:
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        out_feat = L.layer_norm(out_feat,
                                name=name + '_layer_norm',
                                param_attr=lay_norm_attr,
                                bias_attr=lay_norm_bias,
                                scale=False,
                                shift=False)
    if relu:
        out_feat = L.relu(out_feat)

    return out_feat, reduce_attention.alpha, checkpoints
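
For reference, a single-head NumPy sketch of what send_attention and Reduce_attention compute for one destination node, ignoring edge features and attention dropout (array names are illustrative):

import numpy as np

def node_attention(q_dst, k_src, v_src, hidden_size):
    # q_dst: (hidden,); k_src, v_src: (num_in_edges, hidden)
    scores = (k_src * q_dst).sum(-1) / np.sqrt(hidden_size)   # scaled dot product
    alpha = np.exp(scores - scores.max())
    alpha /= alpha.sum()                                      # softmax over in-edges
    h_sum = (alpha[:, None] * v_src).sum(0)                   # attention-weighted sum
    h_mean = v_src.mean(0)                                    # plain average of v
    return 0.8 * h_sum + 0.2 * h_mean                         # same 0.8 / 0.2 mix as above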