def label_embed_input(self, feature):
    label = F.data(name="label", shape=[None, 1], dtype="int64")
    label_idx = F.data(name='label_idx', shape=[None], dtype="int64")
    label = L.reshape(label, shape=[-1])
    label = L.gather(label, label_idx, overwrite=False)

    lay_norm_attr = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=0))
    feature = L.layer_norm(feature,
                           name='layer_norm_feature_input1',
                           param_attr=lay_norm_attr,
                           bias_attr=lay_norm_bias)

    embed_attr = F.ParamAttr(
        initializer=F.initializer.NormalInitializer(loc=0.0, scale=1.0))
    embed = F.embedding(input=label,
                        size=(self.out_size, self.embed_size),
                        param_attr=embed_attr)

    lay_norm_attr = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=0))
    embed = L.layer_norm(embed,
                         name='layer_norm_feature_input2',
                         param_attr=lay_norm_attr,
                         bias_attr=lay_norm_bias)
    embed = L.relu(embed)

    # add the label embedding onto the features of the labelled nodes only
    feature_label = L.gather(feature, label_idx, overwrite=False)
    feature_label = feature_label + embed
    feature = L.scatter(feature, label_idx, feature_label, overwrite=True)
    return feature
def pre_post_process_layer(prev_out,
                           out,
                           process_cmd,
                           dropout_rate=0.,
                           epsilon=1e-5,
                           name=""):
    """Add a pre-process or post-process between sub-layers.

    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + "_layer_norm_scale",
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + "_layer_norm_bias",
                    initializer=fluid.initializer.Constant(0.)),
                epsilon=epsilon)
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(out,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
    return out
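# A minimal, hypothetical usage sketch of pre_post_process_layer: the
# process_cmd string selects which ops run and in what order ("n" = norm,
# "d" = dropout, "a" = residual add). The partial-based wrapper and the
# transformer_sublayer helper below are illustrative, not part of this file.
from functools import partial

pre_process_layer = partial(pre_post_process_layer, None)  # no residual input


def transformer_sublayer(x, sublayer_fn, dropout_rate, name):
    # Pre-norm the input, run the sub-layer, then dropout + residual + norm.
    normed = pre_process_layer(x, "n", name=name + "_pre")
    out = sublayer_fn(normed)
    return pre_post_process_layer(x, out, "dan", dropout_rate,
                                  name=name + "_post")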
def mlp(self, features, name):
    h = features
    dim = features.shape[-1]
    dim_list = [dim * 2, dim]
    for i in range(2):
        h = L.fc(h, size=dim_list[i], name="%s_fc_%s" % (name, i), act=None)
        if self.args.norm_type == "layer_norm":
            log.info("norm_type is %s" % self.args.norm_type)
            h = L.layer_norm(
                h,
                begin_norm_axis=1,
                param_attr=F.ParamAttr(
                    name="norm_scale_%s_%s" % (name, i),
                    initializer=F.initializer.Constant(1.0)),
                bias_attr=F.ParamAttr(
                    name="norm_bias_%s_%s" % (name, i),
                    initializer=F.initializer.Constant(0.0)),
            )
        else:
            log.info("using batch_norm")
            h = L.batch_norm(h)
        h = pgl.layers.graph_norm(self.graph_wrapper, h)
        h = L.relu(h)
    return h
def embed_input(self, feature, name, norm=True):
    fan_in = feature.shape[-1]
    bias_bound = 1.0 / math.sqrt(fan_in)
    fc_bias_attr = F.ParamAttr(
        initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                     high=bias_bound))

    negative_slope = math.sqrt(5)
    gain = math.sqrt(2.0 / (1 + negative_slope**2))
    std = gain / math.sqrt(fan_in)
    weight_bound = math.sqrt(3.0) * std
    fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
        low=-weight_bound, high=weight_bound))

    feature = L.fc(feature,
                   self.embed_size,
                   act=None,
                   param_attr=fc_w_attr,
                   bias_attr=fc_bias_attr,
                   name=name + "_node_feature_encoder")

    if norm:
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        feature = L.layer_norm(feature,
                               name=name + '_layer_norm_feature_input',
                               param_attr=lay_norm_attr,
                               bias_attr=lay_norm_bias)
    return feature
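# Note on the bounds in embed_input: with negative_slope = sqrt(5),
# gain = sqrt(2 / (1 + 5)) = sqrt(1/3), so sqrt(3) * gain = 1 and the weight
# bound collapses to 1 / sqrt(fan_in), the same value as the bias bound (this
# mirrors the Kaiming-uniform default used for linear layers elsewhere).
# A quick numeric sanity check with an arbitrary fan_in:
import math

fan_in = 128  # illustrative value
gain = math.sqrt(2.0 / (1 + math.sqrt(5)**2))
weight_bound = math.sqrt(3.0) * gain / math.sqrt(fan_in)
bias_bound = 1.0 / math.sqrt(fan_in)
print(weight_bound, bias_bound)  # both ~0.0884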
def output_layer(self, x, T, name, act_func='GLU'):
    """Output layer"""
    _, _, n, channel = x.shape

    # maps multi-steps to one.
    x_i = self.temporal_conv_layer(x=x,
                                   Kt=T,
                                   c_in=channel,
                                   c_out=channel,
                                   name="%s_in" % name,
                                   act_func=act_func)
    x_ln = fl.layer_norm(x_i)
    x_o = self.temporal_conv_layer(x=x_ln,
                                   Kt=1,
                                   c_in=channel,
                                   c_out=channel,
                                   name="%s_out" % name,
                                   act_func='sigmoid')

    # maps multi-channels to one.
    x_fc = self.fully_con_layer(x=x_o, n=n, channel=channel,
                                name="%s_fc" % name)
    return x_fc
def pre_post_process_layer(prev_out,
                           out,
                           process_cmd,
                           dropout_rate=0.,
                           name='',
                           is_test=False):
    for cmd in process_cmd:
        if cmd == "a":  # add the two inputs (residual connection)
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # apply layer normalization
            out_type = out.dtype
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # apply dropout
            if dropout_rate:
                out = layers.dropout(out,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=is_test)
    return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out
def self_attention_and_residual(feature, size, input_mask, name, maxlen):
    query = L.fc(feature, size, name=name + "_query", num_flatten_dims=2)
    key = L.fc(feature, size, name=name + "_key", num_flatten_dims=2)
    value = L.fc(feature, size, name=name + "_value", num_flatten_dims=2)
    attention = L.softmax(L.matmul(query, key, transpose_y=True) + input_mask)
    output = L.matmul(attention, value)
    output = L.fc(output, size, name=name + "_model", num_flatten_dims=2)
    output = L.relu(output + feature)
    output = L.layer_norm(output, begin_norm_axis=2, name=name + '_ln')
    return output
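# Hypothetical construction of the additive input_mask consumed above:
# positions holding real tokens get 0, padding positions get a large negative
# value so softmax drives their attention weights toward 0. pad_mask is assumed
# to be a float32 tensor of shape (batch, maxlen) with 1.0 for real tokens;
# feature, size and maxlen are placeholders from the surrounding model.
input_mask = L.unsqueeze((pad_mask - 1.0) * 1e4, axes=[1])  # (batch, 1, maxlen)
output = self_attention_and_residual(feature, size, input_mask,
                                     name="enc_l0", maxlen=maxlen)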
def embed_input(self, feature):
    lay_norm_attr = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=0))
    feature = L.layer_norm(feature,
                           name='layer_norm_feature_input',
                           param_attr=lay_norm_attr,
                           bias_attr=lay_norm_bias)
    return feature
def layer_norm(feature, name=""):
    lay_norm_attr = F.ParamAttr(
        name="attr_%s" % name,
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        name="bias_%s" % name,
        initializer=F.initializer.ConstantInitializer(value=0))
    feature = L.layer_norm(feature,
                           param_attr=lay_norm_attr,
                           bias_attr=lay_norm_bias)
    return feature
def get_batch_feature(self, feature):
    batch_nodes = F.data(name='batch_nodes_0', shape=[None], dtype="int64")
    # feature = L.index_select(feature, batch_nodes)
    feature = L.gather(feature, batch_nodes, overwrite=False)

    lay_norm_attr = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=1))
    lay_norm_bias = F.ParamAttr(
        initializer=F.initializer.ConstantInitializer(value=0))
    feature = L.layer_norm(feature,
                           name='layer_norm_feature_input',
                           param_attr=lay_norm_attr,
                           bias_attr=lay_norm_bias)
    # feature = L.dropout(feature, dropout_prob=self.dropout,
    #                     dropout_implementation='upscale_in_train',
    #                     is_test=test)
    return feature
def forward(self):
    """forward"""
    features_list = [self.gw.node_feat["attr"]]

    for i in range(self.num_layers):
        h = gin(self.gw,
                features_list[i],
                hidden_size=self.hidden_size,
                activation="relu",
                name="gin_%s" % (i),
                init_eps=0.0,
                train_eps=self.train_eps)
        h = fl.layer_norm(
            h,
            begin_norm_axis=1,
            param_attr=fluid.ParamAttr(
                name="norm_scale_%s" % (i),
                initializer=fluid.initializer.Constant(1.0)),
            bias_attr=fluid.ParamAttr(
                name="norm_bias_%s" % (i),
                initializer=fluid.initializer.Constant(0.0)),
        )
        h = fl.relu(h)
        features_list.append(h)

    output = 0
    for i, h in enumerate(features_list):
        pooled_h = pgl.layers.graph_pooling(self.gw, h, self.pool_type)
        drop_h = fl.dropout(pooled_h,
                            self.dropout_prob,
                            dropout_implementation="upscale_in_train")
        output += fl.fc(drop_h,
                        size=self.num_class,
                        act=None,
                        param_attr=fluid.ParamAttr(name="final_fc_%s" % (i)))

    # calculate loss
    self.loss = fl.softmax_with_cross_entropy(output, self.labels)
    self.loss = fl.reduce_mean(self.loss)
    self.acc = fl.accuracy(fl.softmax(output), self.labels)
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(out,
                                    begin_norm_axis=len(out.shape) - 1,
                                    param_attr=fluid.initializer.Constant(1.),
                                    bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
    """
    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout:
                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
    return out
def pre_post_process_layer(prev_out,
                           out,
                           process_cmd,
                           dropout_rate=0.,
                           seed=0,
                           name='',
                           debug=False):
    """
    Add residual connection, layer normalization and dropout to the out
    tensor optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.

    out: (b, seq-len, hidden size)
    """
    # logger.info(f"pre_post_process_layer out: {prev_out}")
    debug_dict = {}
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(out,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     seed=seed,
                                     is_test=False)
        if debug:
            debug_dict[f"out_{cmd}"] = out

    if debug:
        debug_dict["out"] = out
        return debug_dict
    else:
        return out
def st_conv_block(self, x, Ks, Kt, channels, name, keep_prob, act_func='GLU'):
    """Spatio-Temporal convolution block"""
    c_si, c_t, c_oo = channels
    x_s = self.temporal_conv_layer(x,
                                   Kt,
                                   c_si,
                                   c_t,
                                   "%s_tconv_in" % name,
                                   act_func=act_func)
    x_t = self.spatio_conv_layer(x_s, Ks, c_t, c_t, "%s_sonv" % name)
    x_o = self.temporal_conv_layer(x_t, Kt, c_t, c_oo, "%s_tconv_out" % name)
    x_ln = fl.layer_norm(x_o)
    return fl.dropout(x_ln, dropout_prob=(1.0 - keep_prob))
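# A hypothetical forward pass chaining st_conv_block with the output_layer
# defined earlier (an STGCN-style model); the kernel sizes, channel lists and
# two-block depth are illustrative only.
def build_stgcn(self, x, keep_prob):
    # x: (batch, T, n_nodes, c_in); each st_conv_block shrinks T by 2 * (Kt - 1)
    h = self.st_conv_block(x, Ks=3, Kt=3, channels=[1, 32, 64],
                           name="st_block1", keep_prob=keep_prob)
    h = self.st_conv_block(h, Ks=3, Kt=3, channels=[64, 32, 128],
                           name="st_block2", keep_prob=keep_prob)
    # collapse the remaining temporal steps and map channels to a single output
    return self.output_layer(h, T=h.shape[1], name="output")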
def forward(self, graph_wrapper, is_test=False):
    """
    Build the network.
    """
    node_features = self._mol_encoder(graph_wrapper, name=self.name)

    features_list = [node_features]
    for layer in range(self.layer_num):
        edge_features = self._bond_encoder(
            graph_wrapper, name='%s_layer%s' % (self.name, layer))
        if self.gnn_type == "gcn":
            feat = gcn_layer(graph_wrapper,
                             features_list[layer],
                             edge_features,
                             act="relu",
                             name="%s_layer%s_gcn" % (self.name, layer))
        elif self.gnn_type == "gat":
            feat = gat_layer(graph_wrapper,
                             features_list[layer],
                             edge_features,
                             self.embed_dim,
                             act="relu",
                             name="%s_layer%s_gat" % (self.name, layer))
        else:
            feat = gin_layer(graph_wrapper,
                             features_list[layer],
                             edge_features,
                             name="%s_layer%s_gin" % (self.name, layer))

        if self.norm_type == 'batch_norm':
            feat = layers.batch_norm(
                feat,
                param_attr=fluid.ParamAttr(
                    name="%s_layer%s_batch_norm_scale" % (self.name, layer),
                    initializer=fluid.initializer.Constant(1.0)),
                bias_attr=fluid.ParamAttr(
                    name="%s_layer%s_batch_norm_bias" % (self.name, layer),
                    initializer=fluid.initializer.Constant(0.0)),
                moving_mean_name="%s_layer%s_batch_norm_moving_avearage" %
                (self.name, layer),
                moving_variance_name="%s_layer%s_batch_norm_moving_variance" %
                (self.name, layer),
                is_test=is_test)
        elif self.norm_type == 'layer_norm':
            feat = layers.layer_norm(
                feat,
                param_attr=fluid.ParamAttr(
                    name="%s_layer%s_layer_norm_scale" % (self.name, layer),
                    initializer=fluid.initializer.Constant(1.0)),
                bias_attr=fluid.ParamAttr(
                    name="%s_layer%s_layer_norm_bias" % (self.name, layer),
                    initializer=fluid.initializer.Constant(0.0)))
        else:
            raise ValueError('%s not supported.' % self.norm_type)

        if self.graph_norm:
            feat = pgl.layers.graph_norm(graph_wrapper, feat)

        if layer < self.layer_num - 1:
            feat = layers.relu(feat)
        feat = layers.dropout(feat,
                              self.dropout_rate,
                              dropout_implementation="upscale_in_train",
                              is_test=is_test)

        # residual
        if self.residual:
            feat = feat + features_list[layer]
        features_list.append(feat)

    if self.JK == "sum":
        node_repr = layers.reduce_sum(features_list, axis=0)
    elif self.JK == "mean":
        node_repr = layers.reduce_mean(features_list, axis=0)
    elif self.JK == "last":
        node_repr = features_list[-1]
    else:
        node_repr = features_list[-1]
    return node_repr
def _build_net(self):
    # ConvLSTM2D
    rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
        rnn_input=self.input,
        init_hidden=None,
        init_cell=None,
        padding=1,
        hidden_h=self.h,
        hidden_w=self.w,
        filters=self.filters,
        filter_size=self.filter_size,
        sequence_length=self.input_seqlen)
    # Layer Norm (over the last, channel axis)
    bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

    # ConvLSTM2D
    rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
        rnn_input=bn,
        init_hidden=None,
        init_cell=None,
        padding=1,
        hidden_h=self.h,
        hidden_w=self.w,
        filters=self.filters,
        filter_size=self.filter_size,
        sequence_length=self.input_seqlen)
    # Layer Norm
    bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

    # ConvLSTM2D
    rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
        rnn_input=bn,
        init_hidden=None,
        init_cell=None,
        padding=1,
        hidden_h=self.h,
        hidden_w=self.w,
        filters=self.filters,
        filter_size=self.filter_size,
        sequence_length=self.input_seqlen)
    # Layer Norm
    bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

    # ConvLSTM2D
    rnn_out, last_hidden = ConvLSTM.convlstm2d_rnn(
        rnn_input=bn,
        init_hidden=None,
        init_cell=None,
        padding=1,
        hidden_h=self.h,
        hidden_w=self.w,
        filters=self.filters,
        filter_size=self.filter_size,
        sequence_length=self.input_seqlen)
    # Layer Norm
    bn = layers.layer_norm(rnn_out, begin_norm_axis=4)

    # Transpose: (batch x C x D x H x W)
    tr = layers.transpose(bn, [0, 4, 1, 2, 3])

    # Conv3D
    conv3d = layers.conv3d(input=tr, num_filters=2, filter_size=3, padding=1)
    # conv3d: (batch x C x D x H x W)
    conv3d = layers.transpose(conv3d, [0, 2, 3, 4, 1])
    # conv3d: (batch x D x H x W x C)
    return conv3d
def get_gat_layer(self,
                  i,
                  gw,
                  feature,
                  hidden_size,
                  num_heads,
                  concat=True,
                  layer_norm=True,
                  relu=True,
                  gate=False):
    fan_in = feature.shape[-1]
    bias_bound = 1.0 / math.sqrt(fan_in)
    fc_bias_attr = F.ParamAttr(
        initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                     high=bias_bound))

    negative_slope = math.sqrt(5)
    gain = math.sqrt(2.0 / (1 + negative_slope**2))
    std = gain / math.sqrt(fan_in)
    weight_bound = math.sqrt(3.0) * std
    fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
        low=-weight_bound, high=weight_bound))

    if concat:
        skip_feature = L.fc(feature,
                            hidden_size * num_heads,
                            param_attr=fc_w_attr,
                            name='fc_skip_' + str(i),
                            bias_attr=fc_bias_attr)
    else:
        skip_feature = L.fc(feature,
                            hidden_size,
                            param_attr=fc_w_attr,
                            name='fc_skip_' + str(i),
                            bias_attr=fc_bias_attr)

    out_feat = transformer_gat_pgl(
        gw,
        feature,
        hidden_size,
        'gat_' + str(i),
        num_heads,
        concat=concat,
    )
    # out_feat = out_feat + skip_feature

    if gate:
        fan_in = out_feat.shape[-1] * 3
        bias_bound = 1.0 / math.sqrt(fan_in)
        fc_bias_attr = F.ParamAttr(
            initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                         high=bias_bound))

        negative_slope = math.sqrt(5)
        gain = math.sqrt(2.0 / (1 + negative_slope**2))
        std = gain / math.sqrt(fan_in)
        weight_bound = math.sqrt(3.0) * std
        fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
            low=-weight_bound, high=weight_bound))

        gate_f = L.fc([skip_feature, out_feat, out_feat - skip_feature],
                      1,
                      param_attr=fc_w_attr,
                      name='gate_' + str(i),
                      bias_attr=fc_bias_attr)
        gate_f = L.sigmoid(gate_f)
        out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
    else:
        out_feat = out_feat + skip_feature

    if layer_norm:
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        out_feat = L.layer_norm(out_feat,
                                name='layer_norm_' + str(i),
                                param_attr=lay_norm_attr,
                                bias_attr=lay_norm_bias)
    if relu:
        out_feat = L.relu(out_feat)

    # if dropout > 1e-15:
    #     out_feat = L.dropout(out_feat, dropout_prob=dropout,
    #                          dropout_implementation='upscale_in_train',
    #                          is_test=test)
    # sub_node_index = F.data(name='sub_node_index_' + str(i), shape=[None],
    #                         dtype="int64")
    # # out_feat = L.index_select(out_feat, sub_node_index)
    # out_feat = L.gather(out_feat, sub_node_index)

    return out_feat
def gin(gw,
        feature,
        hidden_size,
        activation,
        name,
        init_eps=0.0,
        train_eps=False):
    """Implementation of Graph Isomorphism Network (GIN) layer.

    This is an implementation of the paper How Powerful are Graph Neural
    Networks? (https://arxiv.org/pdf/1810.00826.pdf).
    In their implementation, all MLPs have 2 layers. Batch normalization is
    applied on every hidden layer.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or
            :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, feature_size).
        name: GIN layer names.
        hidden_size: The hidden size for gin.
        activation: The activation for the output.
        init_eps: float, optional. Initial :math:`\epsilon` value, default is 0.
        train_eps: bool, optional. If True, :math:`\epsilon` will be a
            learnable parameter.

    Return:
        A tensor with shape (num_nodes, hidden_size).
    """

    def send_src_copy(src_feat, dst_feat, edge_feat):
        return src_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=fluid.ParamAttr(name="%s_eps" % name),
        default_initializer=fluid.initializer.ConstantInitializer(
            value=init_eps))
    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)

    output = L.fc(output,
                  size=hidden_size,
                  act=None,
                  param_attr=fluid.ParamAttr(name="%s_w_0" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_0" % name))
    output = L.layer_norm(
        output,
        begin_norm_axis=1,
        param_attr=fluid.ParamAttr(
            name="norm_scale_%s" % (name),
            initializer=fluid.initializer.Constant(1.0)),
        bias_attr=fluid.ParamAttr(
            name="norm_bias_%s" % (name),
            initializer=fluid.initializer.Constant(0.0)),
    )

    if activation is not None:
        output = getattr(L, activation)(output)

    output = L.fc(output,
                  size=hidden_size,
                  act=activation,
                  param_attr=fluid.ParamAttr(name="%s_w_1" % name),
                  bias_attr=fluid.ParamAttr(name="%s_b_1" % name))
    return output
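# A minimal usage sketch for the gin layer above, assuming a PGL 1.x graph
# wrapper `gw` whose node feature "attr" is already defined; the depth and
# hidden size are illustrative.
h = gw.node_feat["attr"]
for i in range(2):
    h = gin(gw, h, hidden_size=64, activation="relu",
            name="gin_%d" % i, init_eps=0.0, train_eps=False)
graph_repr = pgl.layers.graph_pooling(gw, h, "average")  # graph-level readout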
def get_gat_layer(self,
                  i,
                  gw,
                  feature,
                  hidden_size,
                  num_heads,
                  concat=True,
                  layer_norm=True,
                  relu=True,
                  gate=False,
                  edge_feature=None):
    fan_in = feature.shape[-1]
    bias_bound = 1.0 / math.sqrt(fan_in)
    fc_bias_attr = F.ParamAttr(
        initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                     high=bias_bound))

    negative_slope = math.sqrt(5)
    gain = math.sqrt(2.0 / (1 + negative_slope**2))
    std = gain / math.sqrt(fan_in)
    weight_bound = math.sqrt(3.0) * std
    fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
        low=-weight_bound, high=weight_bound))

    if concat:
        skip_feature = L.fc(feature,
                            hidden_size * num_heads,
                            param_attr=fc_w_attr,
                            name='fc_skip_' + str(i),
                            bias_attr=fc_bias_attr)
    else:
        skip_feature = L.fc(feature,
                            hidden_size,
                            param_attr=fc_w_attr,
                            name='fc_skip_' + str(i),
                            bias_attr=fc_bias_attr)

    out_feat = transformer_gat_pgl(gw,
                                   feature,
                                   hidden_size,
                                   'gat_' + str(i),
                                   num_heads,
                                   concat=concat,
                                   edge_feature=edge_feature)
    # out_feat = out_feat + skip_feature

    if gate:
        fan_in = out_feat.shape[-1] * 3
        bias_bound = 1.0 / math.sqrt(fan_in)
        fc_bias_attr = F.ParamAttr(
            initializer=F.initializer.UniformInitializer(low=-bias_bound,
                                                         high=bias_bound))

        negative_slope = math.sqrt(5)
        gain = math.sqrt(2.0 / (1 + negative_slope**2))
        std = gain / math.sqrt(fan_in)
        weight_bound = math.sqrt(3.0) * std
        fc_w_attr = F.ParamAttr(initializer=F.initializer.UniformInitializer(
            low=-weight_bound, high=weight_bound))

        gate_f = L.fc([skip_feature, out_feat, out_feat - skip_feature],
                      1,
                      param_attr=fc_w_attr,
                      name='gate_' + str(i),
                      bias_attr=fc_bias_attr)
        gate_f = L.sigmoid(gate_f)
        out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
    else:
        out_feat = out_feat + skip_feature

    if layer_norm:
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        out_feat = L.layer_norm(out_feat,
                                name='layer_norm_' + str(i),
                                param_attr=lay_norm_attr,
                                bias_attr=lay_norm_bias)
    if relu:
        out_feat = L.relu(out_feat)

    return out_feat
def graph_transformer(name,
                      gw,
                      feature,
                      hidden_size,
                      num_heads=4,
                      attn_drop=False,
                      edge_feature=None,
                      concat=True,
                      skip_feat=True,
                      gate=False,
                      layer_norm=True,
                      relu=True,
                      is_test=False):
    """Implementation of the graph Transformer layer from UniMP.

    This is an implementation of the paper Unified Message Passing Model for
    Semi-Supervised Classification (https://arxiv.org/abs/2009.03509).

    Args:
        name: Graph Transformer layer names.
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or
            :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, feature_size).
        hidden_size: The hidden size for graph transformer.
        num_heads: The head number in graph transformer.
        attn_drop: Dropout rate for attention.
        edge_feature: A tensor with shape (num_edges, feature_size).
        concat: Reshape the output (num_nodes, num_heads, hidden_size) by
            concat (num_nodes, hidden_size * num_heads) or mean
            (num_nodes, hidden_size).
        skip_feat: Whether to use a skip connection.
        gate: Whether to combine skip_feat and the output with a gate weight.
        layer_norm: Whether to use layer_norm on the output.
        relu: Whether to use relu activation on the output.
        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads)
        or (num_nodes, hidden_size).
    """

    def send_attention(src_feat, dst_feat, edge_feat):
        if edge_feat is None or not edge_feat:
            output = src_feat["k_h"] * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            # alpha = paddle_helper.sequence_softmax(output)
            # alpha: batch x h, v: batch x h x feat
            return {"alpha": output, "v": src_feat["v_h"]}
        else:
            edge_feat = edge_feat["edge"]
            edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size])
            output = (src_feat["k_h"] + edge_feat) * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            # alpha = paddle_helper.sequence_softmax(output)
            # alpha: batch x h, v: batch x h x feat
            return {"alpha": output, "v": (src_feat["v_h"] + edge_feat)}

    class Reduce_attention():
        def __init__(self):
            self.alpha = None

        def __call__(self, msg):
            alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
            if attn_drop:
                old_h = alpha
                dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
                u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                                     min=0.,
                                     max=1.)
                keeped = L.cast(u > dropout, dtype="float32")
                self_attn_mask = L.scale(x=keeped,
                                         scale=10000.0,
                                         bias=-1.0,
                                         bias_after_scale=False)
                n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                                axis=1)
                n_head_self_attn_mask.stop_gradient = True
                alpha = n_head_self_attn_mask + alpha
                alpha = L.lod_reset(alpha, old_h)

            h = msg["v"]
            alpha = paddle_helper.sequence_softmax(alpha)
            self.alpha = alpha
            old_h = h

            h_mean = L.sequence_pool(h, "average")
            h = h * alpha
            h = L.lod_reset(h, old_h)
            h = L.sequence_pool(h, "sum")
            h = h * 0.8 + h_mean * 0.2

            if concat:
                h = L.reshape(h, [-1, num_heads * hidden_size])
            else:
                h = L.reduce_mean(h, dim=1)
            return h

    reduce_attention = Reduce_attention()

    q = linear(feature,
               hidden_size * num_heads,
               name=name + '_q_weight',
               init_type='gcn')
    k = linear(feature,
               hidden_size * num_heads,
               name=name + '_k_weight',
               init_type='gcn')
    v = linear(feature,
               hidden_size * num_heads,
               name=name + '_v_weight',
               init_type='gcn')

    reshape_q = L.reshape(q, [-1, num_heads, hidden_size])
    reshape_k = L.reshape(k, [-1, num_heads, hidden_size])
    reshape_v = L.reshape(v, [-1, num_heads, hidden_size])

    msg = gw.send(send_attention,
                  nfeat_list=[("q_h", reshape_q), ("k_h", reshape_k),
                              ("v_h", reshape_v)],
                  efeat_list=edge_feature)
    out_feat = gw.recv(msg, reduce_attention)
    checkpoints = [out_feat]

    if skip_feat:
        if concat:
            out_feat, cks = appnp(gw, out_feat, k_hop=3, name=name + "_appnp")
            # out_feat, cks = appnp(gw, out_feat, k_hop=3)
            checkpoints.append(out_feat)

            # The UniMP-xxlarge will come soon.
            # out_feat, cks = appnp(gw, out_feat, k_hop=6)
            # out_feat, cks = appnp(gw, out_feat, k_hop=9)
            # checkpoints = checkpoints + cks

            skip_feature = linear(feature,
                                  hidden_size * num_heads,
                                  name=name + '_skip_weight',
                                  init_type='lin')
        else:
            skip_feature = linear(feature,
                                  hidden_size,
                                  name=name + '_skip_weight',
                                  init_type='lin')

        if gate:
            temp_output = L.concat(
                [skip_feature, out_feat, out_feat - skip_feature], axis=-1)
            gate_f = L.sigmoid(
                linear(temp_output, 1, name=name + '_gate_weight',
                       init_type='lin'))
            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
        else:
            out_feat = skip_feature + out_feat

    if layer_norm:
        lay_norm_attr = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=1))
        lay_norm_bias = F.ParamAttr(
            initializer=F.initializer.ConstantInitializer(value=0))
        out_feat = L.layer_norm(out_feat,
                                name=name + '_layer_norm',
                                param_attr=lay_norm_attr,
                                bias_attr=lay_norm_bias,
                                scale=False,
                                shift=False)
    if relu:
        out_feat = L.relu(out_feat)

    return out_feat, reduce_attention.alpha, checkpoints
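# A minimal stacking sketch for graph_transformer, assuming a graph wrapper
# `gw` and an input node-feature tensor `feature`; the sizes and two-layer
# depth are illustrative. Note that passing attn_drop=True requires feeding the
# extra 'attn_drop' input declared inside Reduce_attention.
feat, attn, cks = graph_transformer("unimp_0", gw, feature, hidden_size=64,
                                    num_heads=4, concat=True, skip_feat=True)
feat, attn, cks = graph_transformer("unimp_1", gw, feat, hidden_size=64,
                                    num_heads=4, concat=False, relu=False)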