def forward(
    self,
    g: dgl.DGLGraph,
    feats: Dict[str, torch.Tensor],
    norm_atom: torch.Tensor = None,
    norm_bond: torch.Tensor = None,
) -> Dict[str, torch.Tensor]:
    """
    Args:
        g: the graph
        feats: node features. Allowed node types are `atom`, `bond` and `global`.
        norm_atom: values used to normalize atom features as proposed in graph norm.
        norm_bond: values used to normalize bond features as proposed in graph norm.

    Returns:
        updated node features.
    """
    g = g.local_var()

    h = feats["atom"]
    e = feats["bond"]
    u = feats["global"]

    # for residual connection
    h_in = h
    e_in = e
    u_in = u

    g.nodes["atom"].data.update({"Ah": self.A(h), "Dh": self.D(h), "Eh": self.E(h)})
    g.nodes["bond"].data.update({"Be": self.B(e)})
    g.nodes["global"].data.update({"Cu": self.C(u), "Fu": self.F(u)})

    # update bond feature e
    g.multi_update_all(
        {
            "a2b": (fn.copy_u("Ah", "m"), fn.sum("m", "e")),  # A * (h_i + h_j)
            "b2b": (fn.copy_u("Be", "m"), fn.sum("m", "e")),  # B * e_ij
            "g2b": (fn.copy_u("Cu", "m"), fn.sum("m", "e")),  # C * u
        },
        "sum",
    )
    e = g.nodes["bond"].data["e"]

    if self.graph_norm:
        e = e * norm_bond
    if self.batch_norm:
        e = self.bn_node_e(e)
    e = self.activation(e)
    if self.residual:
        e = e_in + e
    g.nodes["bond"].data["e"] = e

    # update atom feature h

    # Copy Eh to bond nodes, without reduction.
    # This is the first arrow in: Eh_j -> bond node -> atom i node
    # The second arrow is done in self.message_fn and self.reduce_fn below
    g.update_all(fn.copy_u("Eh", "Eh_j"), self.reduce_fn_a2b, etype="a2b")

    g.multi_update_all(
        {
            "a2a": (fn.copy_u("Dh", "m"), fn.sum("m", "h")),  # D * h_i
            "b2a": (self.message_fn, self.reduce_fn),  # e_ij [Had] (E * hj)
            "g2a": (fn.copy_u("Fu", "m"), fn.sum("m", "h")),  # F * u
        },
        "sum",
    )
    h = g.nodes["atom"].data["h"]

    if self.graph_norm:
        h = h * norm_atom
    if self.batch_norm:
        h = self.bn_node_h(h)
    h = self.activation(h)
    if self.residual:
        h = h_in + h
    g.nodes["atom"].data["h"] = h

    # update global feature u
    g.nodes["atom"].data.update({"Gh": self.G(h)})
    g.nodes["bond"].data.update({"He": self.H(e)})
    g.nodes["global"].data.update({"Iu": self.I(u)})

    g.multi_update_all(
        {
            "a2g": (fn.copy_u("Gh", "m"), fn.mean("m", "u")),  # G * (mean_i h_i)
            "b2g": (fn.copy_u("He", "m"), fn.mean("m", "u")),  # H * (mean_ij e_ij)
            "g2g": (fn.copy_u("Iu", "m"), fn.sum("m", "u")),  # I * u
        },
        "sum",
    )
    u = g.nodes["global"].data["u"]

    # do not apply batch norm if there is only one graph
    if self.batch_norm and u.shape[0] > 1:
        u = self.bn_node_u(u)
    u = self.activation(u)
    if self.residual:
        u = u_in + u

    # dropout
    h = self.dropout(h)
    e = self.dropout(e)
    u = self.dropout(u)

    feats = {"atom": h, "bond": e, "global": u}

    return feats
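
# The comments in the forward above correspond, up to the optional graph/batch
# norm and residual terms, to the following gated update equations (a summary
# sketch inferred from the code, with \odot the elementwise product and
# \sigma = self.activation):
#
#   :math:`e_{ij}' = \sigma\big( A (h_i + h_j) + B e_{ij} + C u \big)`
#   :math:`h_i'    = \sigma\big( D h_i + \sum_j e_{ij}' \odot E h_j + F u \big)`
#   :math:`u'      = \sigma\big( G \, \mathrm{mean}_i h_i' + H \, \mathrm{mean}_{ij} e_{ij}' + I u \big)`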
        features = bond_features(bond)

        bond_src.append(begin_idx)
        bond_dst.append(end_idx)
        bond_x.append(features)

        # set up the reverse direction
        bond_src.append(end_idx)
        bond_dst.append(begin_idx)
        bond_x.append(features)
    graph.add_edges(bond_src, bond_dst)

    n_edges += n_bonds
    return graph, torch.stack(atom_x), \
        torch.stack(bond_x) if len(bond_x) > 0 else torch.zeros(0)


mpn_loopy_bp_msg = DGLF.copy_src(src='msg', out='msg')
mpn_loopy_bp_reduce = DGLF.sum(msg='msg', out='accum_msg')


class LoopyBPUpdate(nn.Module):
    def __init__(self, hidden_size):
        super(LoopyBPUpdate, self).__init__()
        self.hidden_size = hidden_size

        self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)

    def reset_parameters(self):
        """Reinitialize model parameters."""
        self.W_h.reset_parameters()

    def forward(self, nodes):
        msg_input = nodes.data['msg_input']
        msg_delta = self.W_h(nodes.data['accum_msg'])
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce function as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:

import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import Processtest

gcn_msg = fn.u_mul_e('h', 'w', 'm')
gcn_reduce = fn.sum(msg='m', out='h')

###############################################################################
# We then proceed to define the GCNLayer module. A GCNLayer essentially performs
# message passing on all the nodes then applies a fully-connected layer.


class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, feature):
        # Creating a local scope so that all the stored ndata and edata
        # (such as the `'h'` ndata below) are automatically popped out
        # when the scope exits.
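        # (The method body is cut off above. A minimal sketch of how this
        # tutorial's forward usually continues, assuming the edge weight 'w'
        # consumed by gcn_msg is already stored in g.edata:)
        with g.local_scope():
            g.ndata['h'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            h = g.ndata['h']
            return self.linear(h)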
def propagate(self, g, weight, incidence_in, incidence_out):
    self.aggregate_relation(g, weight, incidence_in, incidence_out)
    g.update_all(self.msg_func, fn.sum(msg='msg', out='h'), self.apply_func)
    return self.weight
def forward(self, graph):
    graph.update_all(message_func=self.message_function,
                     reduce_func=fn.sum(msg='m', out='m_sum'),
                     apply_node_func=self.update_function)
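
# A hypothetical pair of callbacks compatible with the forward above (the
# names and the 'h' feature key are assumptions for illustration only): the
# message function must emit a field 'm', and the node update reads the
# aggregated field 'm_sum' written by the builtin sum reducer.
def message_function(edges):
    # send the source node state along each edge
    return {'m': edges.src['h']}


def update_function(nodes):
    # combine the node's own state with the summed neighbour messages
    return {'h': nodes.data['h'] + nodes.data['m_sum']}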
def run(self, cand_graphs, cand_line_graph, tree_mess_src_edges, tree_mess_tgt_edges, tree_mess_tgt_nodes, mol_tree_batch): n_nodes = cand_graphs.number_of_nodes() cand_graphs.apply_edges(func=lambda edges: {'src_x': edges.src['x']}, ) bond_features = cand_line_graph.ndata['x'] source_features = cand_line_graph.ndata['src_x'] features = torch.cat([source_features, bond_features], 1) msg_input = self.W_i(features) cand_line_graph.ndata.update({ 'msg_input': msg_input, 'msg': torch.relu(msg_input), 'accum_msg': torch.zeros_like(msg_input), }) zero_node_state = bond_features.new(n_nodes, self.hidden_size).zero_() cand_graphs.ndata.update({ 'm': zero_node_state.clone(), 'h': zero_node_state.clone(), }) cand_graphs.edata['alpha'] = \ cuda(torch.zeros(cand_graphs.number_of_edges(), self.hidden_size)) cand_graphs.ndata['alpha'] = zero_node_state if tree_mess_src_edges.shape[0] > 0: if PAPER: src_u, src_v = tree_mess_src_edges.unbind(1) tgt_u, tgt_v = tree_mess_tgt_edges.unbind(1) src_u = src_u.to(mol_tree_batch.device) src_v = src_v.to(mol_tree_batch.device) eid = mol_tree_batch.edge_ids(src_u, src_v) alpha = mol_tree_batch.edata['m'][eid] cand_graphs.edges[tgt_u, tgt_v].data['alpha'] = alpha else: src_u, src_v = tree_mess_src_edges.unbind(1) src_u = src_u.to(mol_tree_batch.device) src_v = src_v.to(mol_tree_batch.device) eid = mol_tree_batch.edge_ids(src_u, src_v) alpha = mol_tree_batch.edata['m'][eid] node_idx = (tree_mess_tgt_nodes.to( device=zero_node_state.device)[:, None].expand_as(alpha)) node_alpha = zero_node_state.clone().scatter_add( 0, node_idx, alpha) cand_graphs.ndata['alpha'] = node_alpha cand_graphs.apply_edges( func=lambda edges: {'alpha': edges.src['alpha']}, ) cand_line_graph.ndata.update(cand_graphs.edata) for i in range(self.depth - 1): cand_line_graph.update_all(DGLF.copy_u('msg', 'msg'), DGLF.sum('msg', 'accum_msg')) cand_line_graph.apply_nodes(self.loopy_bp_updater) cand_graphs.edata.update(cand_line_graph.ndata) cand_graphs.update_all(DGLF.copy_e('msg', 'msg'), DGLF.sum('msg', 'm')) if PAPER: cand_graphs.update_all(DGLF.copy_e('alpha', 'alpha'), DGLF.sum('alpha', 'accum_alpha')) cand_graphs.apply_nodes(self.gather_updater) return cand_graphs
            bond_x,
            'src_x': atom_x.new(len(bond_feature_list), ATOM_FDIM).zero_()
        })
        cand_graphs.append(g)

    return cand_graphs, tree_mess_source_edges, tree_mess_target_edges, \
        tree_mess_target_nodes


# TODO: use SPMV
mpn_loopy_bp_msg = DGLF.copy_src(src='msg', out='msg')
#def mpn_loopy_bp_msg(src, edge):
#    return src['msg']
mpn_loopy_bp_reduce = DGLF.sum(msg='msg', out='accum_msg')
#def mpn_loopy_bp_reduce(node, msgs):
#    return {'accum_msg': torch.sum(msgs, 1)}


class LoopyBPUpdate(nn.Module):
    def __init__(self, hidden_size):
        super(LoopyBPUpdate, self).__init__()
        self.hidden_size = hidden_size

        self.W_h = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, node):
        msg_input = node['msg_input']
        msg_delta = self.W_h(node['accum_msg'] + node['alpha'])
        msg = torch.relu(msg_input + msg_delta)
def reduce_sum(self, msg, out):
    res = fn.sum(msg, out)
    return res
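
# fn.sum(msg, out) only builds a builtin reducer object, so the wrapper above
# simply forwards its arguments. A small standalone sketch of passing such a
# reducer to update_all on a toy graph (dgl/torch imported here for
# self-containment; the graph and feature names are made up):
import torch
import dgl
import dgl.function as fn

g = dgl.graph(([0, 1], [2, 2]))   # two edges, both pointing at node 2
g.ndata['x'] = torch.ones(3, 4)
g.update_all(fn.copy_u('x', 'msg'), fn.sum('msg', 'out'))
print(g.ndata['out'][2])           # node 2 receives the sum of nodes 0 and 1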
def test_copy(): num_layers = 2 g = generate_rand_graph(100) g.ndata['h'] = g.ndata['h1'] nf = create_mini_batch(g, num_layers) nf.copy_from_parent() for i in range(nf.num_layers): assert len(g.ndata.keys()) == len(nf.layers[i].data.keys()) for key in g.ndata.keys(): assert key in nf.layers[i].data.keys() assert F.array_equal(nf.layers[i].data[key], g.ndata[key][nf.layer_parent_nid(i)]) for i in range(nf.num_blocks): assert len(g.edata.keys()) == len(nf.blocks[i].data.keys()) for key in g.edata.keys(): assert key in nf.blocks[i].data.keys() assert F.array_equal(nf.blocks[i].data[key], g.edata[key][nf.block_parent_eid(i)]) nf = create_mini_batch(g, num_layers) node_embed_names = [['h'], ['h1'], ['h']] edge_embed_names = [['h2'], ['h2']] nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=edge_embed_names) for i in range(nf.num_layers): assert len(node_embed_names[i]) == len(nf.layers[i].data.keys()) for key in node_embed_names[i]: assert key in nf.layers[i].data.keys() assert F.array_equal(nf.layers[i].data[key], g.ndata[key][nf.layer_parent_nid(i)]) for i in range(nf.num_blocks): assert len(edge_embed_names[i]) == len(nf.blocks[i].data.keys()) for key in edge_embed_names[i]: assert key in nf.blocks[i].data.keys() assert F.array_equal(nf.blocks[i].data[key], g.edata[key][nf.block_parent_eid(i)]) nf = create_mini_batch(g, num_layers) g.ndata['h0'] = F.clone(g.ndata['h']) node_embed_names = [['h0'], [], []] nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=None) for i in range(num_layers): nf.block_compute(i, fn.copy_src(src='h%d' % i, out='m'), fn.sum(msg='m', out='t'), lambda nodes: {'h%d' % (i + 1): nodes.data['t'] + 1}) g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='t'), lambda nodes: {'h': nodes.data['t'] + 1}) assert F.array_equal(nf.layers[i + 1].data['h%d' % (i + 1)], g.ndata['h'][nf.layer_parent_nid(i + 1)]) nf.copy_to_parent(node_embed_names=[['h0'], ['h1'], ['h2']]) for i in range(num_layers + 1): assert F.array_equal(nf.layers[i].data['h%d' % i], g.ndata['h%d' % i][nf.layer_parent_nid(i)]) nf = create_mini_batch(g, num_layers) g.ndata['h0'] = F.clone(g.ndata['h']) g.ndata['h1'] = F.clone(g.ndata['h']) g.ndata['h2'] = F.clone(g.ndata['h']) node_embed_names = [['h0'], ['h1'], ['h2']] nf.copy_from_parent(node_embed_names=node_embed_names, edge_embed_names=None) def msg_func(edge, ind): assert 'h%d' % ind in edge.src.keys() return {'m': edge.src['h%d' % ind]} def reduce_func(node, ind): assert 'h%d' % (ind + 1) in node.data.keys() return { 'h': F.sum(node.mailbox['m'], 1) + node.data['h%d' % (ind + 1)] } for i in range(num_layers): nf.block_compute(i, partial(msg_func, ind=i), partial(reduce_func, ind=i))
import scipy.sparse as spp
import matplotlib.pyplot as plt
import networkx as nx
import torch
import dgl
import dgl.function as fn

edgelist = [(0, 4), (0, 1), (4, 1), (4, 3), (1, 2), (3, 2), (3, 5)]
g = nx.DiGraph(edgelist)

# add a self-edge for each node
g.remove_edges_from(nx.selfloop_edges(g))
g.add_edges_from(zip(g.nodes(), g.nodes()))
#nx.draw(g, with_labels=True)
#plt.show()

g = dgl.DGLGraph(g)

degs = g.in_degrees().float()
norm = torch.pow(degs, -0.5)
norm[torch.isinf(norm)] = 0
g.ndata['norm'] = norm.unsqueeze(1)

h = torch.ones([6, 2])

# normalization by square root of src degree
h = h * g.ndata['norm']
g.ndata['h'] = h
g.update_all(fn.copy_src(src='h', out='m'),
             fn.sum(msg='m', out='h'))
h = g.ndata.pop('h')

# normalization by square root of dst degree
h = h * g.ndata['norm']
print(h)
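
# A quick dense-matrix check of the symmetric normalization above (a sketch
# reusing the same `edgelist` and self-loops): the message passing computes
# D^{-1/2} A^T D^{-1/2} h, where D is the in-degree diagonal and A[u, v] = 1
# for every directed edge u -> v.
N = 6
A = torch.zeros(N, N)
for u, v in edgelist:
    A[u, v] = 1.0
A += torch.eye(N)                       # the self-loops added above
deg_in = A.sum(dim=0)                   # in-degrees = column sums
D_inv_sqrt = torch.diag(deg_in.pow(-0.5))
h_dense = D_inv_sqrt @ A.t() @ D_inv_sqrt @ torch.ones(N, 2)
print(h_dense)                          # should match the printed `h` above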
def check_partition(g, part_method, reshuffle, num_parts=4, num_trainers_per_machine=1, load_feats=True): g.ndata['labels'] = F.arange(0, g.number_of_nodes()) g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32) g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10), F.float32) g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) num_hops = 2 orig_nids, orig_eids = partition_graph( g, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=reshuffle, return_mapping=True, num_trainers_per_machine=num_trainers_per_machine) part_sizes = [] shuffled_labels = [] shuffled_edata = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition( '/tmp/partition/test.json', i, load_feats=load_feats) if not load_feats: assert not node_feats assert not edge_feats node_feats, edge_feats = load_partition_feats( '/tmp/partition/test.json', i) if num_trainers_per_machine > 1: for ntype in g.ntypes: name = ntype + '/trainer_id' assert name in node_feats part_ids = F.floor_div(node_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) for etype in g.etypes: name = etype + '/trainer_id' assert name in edge_feats part_ids = F.floor_div(edge_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) # Check the metadata assert gpb._num_nodes() == g.number_of_nodes() assert gpb._num_edges() == g.number_of_edges() assert gpb.num_partitions() == num_parts gpb_meta = gpb.metadata() assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]['num_nodes'] assert len(gpb.partid2eids(i)) == gpb_meta[i]['num_edges'] part_sizes.append((gpb_meta[i]['num_nodes'], gpb_meta[i]['num_edges'])) nid = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) local_nid = gpb.nid2localnid(nid, i) assert F.dtype(local_nid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) eid = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) local_eid = gpb.eid2localeid(eid, i) assert F.dtype(local_eid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_eid) == np.arange(0, len(local_eid))) # Check the node map. local_nodes = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) llocal_nodes = F.nonzero_1d(part_g.ndata['inner_node']) local_nodes1 = gpb.partid2nids(i) assert F.dtype(local_nodes1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy( local_nodes1))) assert np.all(F.asnumpy(llocal_nodes) == np.arange(len(llocal_nodes))) # Check the edge map. local_edges = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) llocal_edges = F.nonzero_1d(part_g.edata['inner_edge']) local_edges1 = gpb.partid2eids(i) assert F.dtype(local_edges1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy( local_edges1))) assert np.all(F.asnumpy(llocal_edges) == np.arange(len(llocal_edges))) # Verify the mapping between the reshuffled IDs and the original IDs. 
part_src_ids, part_dst_ids = part_g.edges() part_src_ids = F.gather_row(part_g.ndata[dgl.NID], part_src_ids) part_dst_ids = F.gather_row(part_g.ndata[dgl.NID], part_dst_ids) part_eids = part_g.edata[dgl.EID] orig_src_ids = F.gather_row(orig_nids, part_src_ids) orig_dst_ids = F.gather_row(orig_nids, part_dst_ids) orig_eids1 = F.gather_row(orig_eids, part_eids) orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids) assert F.shape(orig_eids1)[0] == F.shape(orig_eids2)[0] assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) if reshuffle: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata['orig_id']) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata['orig_id']) # when we read node data from the original global graph, we should use orig_id. local_nodes = F.boolean_mask(part_g.ndata['orig_id'], part_g.ndata['inner_node']) local_edges = F.boolean_mask(part_g.edata['orig_id'], part_g.edata['inner_edge']) else: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata[dgl.NID]) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata[dgl.NID]) part_g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) part_g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) assert F.allclose(F.gather_row(g.ndata['h'], local_nodes), F.gather_row(part_g.ndata['h'], llocal_nodes)) assert F.allclose(F.gather_row(g.ndata['eh'], local_nodes), F.gather_row(part_g.ndata['eh'], llocal_nodes)) for name in ['labels', 'feats']: assert '_N/' + name in node_feats assert node_feats['_N/' + name].shape[0] == len(local_nodes) true_feats = F.gather_row(g.ndata[name], local_nodes) ndata = F.gather_row(node_feats['_N/' + name], local_nid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(ndata)) for name in ['feats']: assert '_E/' + name in edge_feats assert edge_feats['_E/' + name].shape[0] == len(local_edges) true_feats = F.gather_row(g.edata[name], local_edges) edata = F.gather_row(edge_feats['_E/' + name], local_eid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(edata)) # This only works if node/edge IDs are shuffled. if reshuffle: shuffled_labels.append(node_feats['_N/labels']) shuffled_edata.append(edge_feats['_E/feats']) # Verify that we can reconstruct node/edge data for original IDs. if reshuffle: shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) orig_labels[F.asnumpy(orig_nids)] = shuffled_labels orig_edata[F.asnumpy(orig_eids)] = shuffled_edata assert np.all(orig_labels == F.asnumpy(g.ndata['labels'])) assert np.all(orig_edata == F.asnumpy(g.edata['feats'])) if reshuffle: node_map = [] edge_map = [] for i, (num_nodes, num_edges) in enumerate(part_sizes): node_map.append(np.ones(num_nodes) * i) edge_map.append(np.ones(num_edges) * i) node_map = np.concatenate(node_map) edge_map = np.concatenate(edge_map) nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) assert F.dtype(nid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(nid2pid) == node_map) eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) assert F.dtype(eid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(eid2pid) == edge_map)
from dgl import batch, unbatch, bfs_edges_generator
import dgl.function as DGLF
from .line_profiler_integration import profile
import numpy as np
import torch
import torch.nn as nn

MAX_NB = 8


def level_order(forest, roots):
    edges = bfs_edges_generator(forest, roots)
    _, leaves = forest.find_edges(edges[-1])
    edges_back = bfs_edges_generator(forest, roots, reverse=True)
    yield from reversed(edges_back)
    yield from edges


enc_tree_msg = [DGLF.copy_src(src='m', out='m'),
                DGLF.copy_src(src='rm', out='rm')]
enc_tree_reduce = [DGLF.sum(msg='m', out='s'),
                   DGLF.sum(msg='rm', out='accum_rm')]
enc_tree_gather_msg = DGLF.copy_edge(edge='m', out='m')
enc_tree_gather_reduce = DGLF.sum(msg='m', out='m')


class EncoderGatherUpdate(nn.Module):
    def __init__(self, hidden_size):
        nn.Module.__init__(self)
        self.hidden_size = hidden_size

        self.W = nn.Linear(2 * hidden_size, hidden_size)

    def forward(self, nodes):
        x = nodes.data['x']
        m = nodes.data['m']
        return {
            'h': torch.relu(self.W(torch.cat([x, m], 1))),
def forward(self, g, feats, norm_atom, norm_bond): g = g.local_var() h = feats["atom"] e = feats["bond"] # u = feats["global"] # for residual connection h_in = h e_in = e # u_in = u g.nodes["atom"].data.update({"Ah": self.A(h), "Dh": self.D(h), "Eh": self.E(h)}) g.nodes["bond"].data.update({"Be": self.B(e)}) # g.nodes["global"].data.update({"Cu": self.C(u), "Fu": self.F(u)}) # update bond feature e g.multi_update_all( { "a2b": (fn.copy_u("Ah", "m"), fn.sum("m", "e")), # A * (h_i + h_j) "b2b": (fn.copy_u("Be", "m"), fn.sum("m", "e")), # B * e_ij # "g2b": (fn.copy_u("Cu", "m"), fn.sum("m", "e")), # C * u }, "sum", ) e = g.nodes["bond"].data["e"] if self.graph_norm: e = e * norm_bond if self.batch_norm: e = self.bn_node_e(e) e = self.activation(e) if self.residual: e = e_in + e g.nodes["bond"].data["e"] = e # update atom feature h # Copy Eh to bond nodes, without reduction. # This is the first arrow in: Eh_j -> bond node -> atom i node # The second arrow is done in self.message_fn and self.reduce_fn below g.update_all(fn.copy_u("Eh", "Eh_j"), self.reduce_fn_a2b, etype="a2b") g.multi_update_all( { "a2a": (fn.copy_u("Dh", "m"), fn.sum("m", "h")), # D * h_i "b2a": (self.message_fn, self.reduce_fn), # e_ij [Had] (E * hj) # "g2a": (fn.copy_u("Fu", "m"), fn.sum("m", "h")), # F * u }, "sum", ) h = g.nodes["atom"].data["h"] if self.graph_norm: h = h * norm_atom if self.batch_norm: h = self.bn_node_h(h) h = self.activation(h) if self.residual: h = h_in + h g.nodes["atom"].data["h"] = h # # update global feature u # g.nodes["atom"].data.update({"Gh": self.G(h)}) # g.nodes["bond"].data.update({"He": self.H(e)}) # g.nodes["global"].data.update({"Iu": self.I(u)}) # g.multi_update_all( # { # "a2g": (fn.copy_u("Gh", "m"), fn.mean("m", "u")), # G * (mean_i h_i) # "b2g": (fn.copy_u("He", "m"), fn.mean("m", "u")), # H * (mean_ij e_ij) # "g2g": (fn.copy_u("Iu", "m"), fn.sum("m", "u")), # I * u # }, # "sum", # ) # u = g.nodes["global"].data["u"] # if self.batch_norm: # u = self.bn_node_u(u) # u = self.activation(u) # if self.residual: # u = u_in + u # dropout h = self.dropout(h) e = self.dropout(e) # u = self.dropout(u) # feats = {"atom": h, "bond": e, "global": u} feats = {"atom": h, "bond": e} return feats
def forward(self, g, feats, norm_atom, norm_bond): g = g.local_var() h = feats["atom"] e = feats["bond"] u = feats["global"] # for residual connection h_in = h e_in = e u_in = u g.nodes["atom"].data.update({"Ah": self.A(h), "Dh": self.D(h), "Eh": self.E(h)}) g.nodes["bond"].data.update({"Be": self.B(e)}) g.nodes["global"].data.update({"Cu": self.C(u), "Fu": self.F(u)}) # update bond feature e g.multi_update_all( { "a2b": (fn.copy_u("Ah", "m"), fn.sum("m", "e")), # A * (h_i + h_j) "b2b": (fn.copy_u("Be", "m"), fn.sum("m", "e")), # B * e_ij "g2b": (fn.copy_u("Cu", "m"), fn.sum("m", "e")), # C * u }, "sum", ) e = g.nodes["bond"].data["e"] if self.graph_norm: e = e * norm_bond if self.batch_norm: e = self.bn_node_e(e) e = self.activation(e) if self.residual: e = e_in + e g.nodes["bond"].data["e"] = e # update atom feature h # Copy Eh to bond nodes, without reduction. # This is the first arrow in: Eh_j -> bond node -> atom i node # The second arrow is done in self.message_fn and self.reduce_fn below g.update_all(fn.copy_u("Eh", "Eh_j"), self.reduce_fn_a2b, etype="a2b") g.multi_update_all( { "a2a": (fn.copy_u("Dh", "m"), fn.sum("m", "h")), # D * h_i "b2a": (self.message_fn, self.reduce_fn), # e_ij [Had] (E * hj) "g2a": (fn.copy_u("Fu", "m"), fn.sum("m", "h")), # F * u }, "sum", ) h = g.nodes["atom"].data["h"] if self.graph_norm: h = h * norm_atom if self.batch_norm: h = self.bn_node_h(h) h = self.activation(h) if self.residual: h = h_in + h g.nodes["atom"].data["h"] = h u = self.node_attn_layer(g, u, [h, e, u]).flatten(start_dim=1) if self.batch_norm: u = self.bn_node_u(u) u = self.activation(u) if self.residual: u = u_in + u # dropout h = self.dropout(h) e = self.dropout(e) u = self.dropout(u) feats = {"atom": h, "bond": e, "global": u} return feats
def propagate(self, g):
    g.update_all(self.msg_func, fn.sum(msg='msg', out='h'), self.apply_func)
def forward(self, graph, feat): r"""Compute graph attention network layer. Parameters ---------- graph : DGLGraph The graph. feat : torch.Tensor or pair of torch.Tensor If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes. If a pair of torch.Tensor is given, the pair must contain two tensors of shape :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`. Returns ------- torch.Tensor The output feature of shape :math:`(N, H, D_{out})` where :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ graph = graph.local_var() if isinstance(feat, tuple): h_src = self.feat_drop(feat[0]) h_dst = self.feat_drop(feat[1]) feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats) feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats) else: h_src = h_dst = self.feat_drop(feat) feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats) if self.opt['att_type'] == "GAT": # NOTE: GAT paper uses "first concatenation then linear projection" # to compute attention scores, while ours is "first projection then # addition", the two approaches are mathematically equivalent: # We decompose the weight vector a mentioned in the paper into # [a_l || a_r], then # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j # Our implementation is much efficient because we do not need to # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, # addition could be optimized with DGL's built-in function u_add_v, # which further speeds up computation and saves memory footprint. el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1) er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1) graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively. 
graph.apply_edges(fn.u_add_v('el', 'er', 'e')) e = self.leaky_relu(graph.edata.pop('e')) elif self.opt['att_type'] == "cosine": el = feat_src * self.attn_l er = feat_dst * self.attn_r graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) graph.srcdata['norm_h'] = F.normalize(el, p=2, dim=-1) graph.dstdata['norm_h'] = F.normalize(er, p=2, dim=-1) # compute cosine distance graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos')) e = graph.edata.pop('cos') elif self.opt['att_type'] == "scaled_dot": el = feat_src * self.attn_l er = feat_dst * self.attn_r / th.sqrt( th.tensor(self.opt['num_hidden'] / self.opt['num_heads'])) graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) # compute dot graph.apply_edges(fn.u_dot_v('el', 'er', 'dot')) e = graph.edata.pop('dot') elif self.opt['att_type'] == "pearson": el = feat_src * self.attn_l er = feat_dst * self.attn_r graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) src_mu = th.mean(el, dim=1, keepdim=True) graph.srcdata['norm_h'] = F.normalize(el - src_mu, p=2, dim=-1) dst_mu = th.mean(er, dim=1, keepdim=True) graph.dstdata['norm_h'] = F.normalize(er - dst_mu, p=2, dim=-1) # compute cosine distance graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos')) e = graph.edata.pop('cos') elif self.opt['att_type'] == "spearman": #todo check all these operations el = feat_src * self.attn_l er = feat_dst * self.attn_r graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) el = el.view(-1, self._out_feats) er = er.view(-1, self._out_feats) el = soft_rank(el, regularization_strength=1.0) er = soft_rank(er, regularization_strength=1.0) ranked_src = soft_rank( 1000 * F.normalize(el, p=2, dim=-1)) #, regularization_strength=0.1) ranked_dst = soft_rank(1000 * F.normalize(er, p=2, dim=-1), regularization_strength=0.1) src_mu = th.mean(ranked_src, dim=1, keepdim=True) dst_mu = th.mean(ranked_dst, dim=1, keepdim=True) el = F.normalize(ranked_src - src_mu, p=2, dim=-1) er = F.normalize(ranked_dst - dst_mu, p=2, dim=-1) el = el.view(-1, self._num_heads, self._out_feats) er = er.view(-1, self._num_heads, self._out_feats) graph.srcdata['norm_h'] = F.normalize(el, p=2, dim=-1) graph.dstdata['norm_h'] = F.normalize(er, p=2, dim=-1) # compute cosine distance graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos')) e = graph.edata.pop('cos') # compute softmax graph.edata['a'] = self.attn_drop(edge_softmax(graph, e)) # message passing graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft')) rst = graph.dstdata['ft'] # residual if self.res_fc is not None: resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats) rst = rst + resval # activation if self.activation: rst = self.activation(rst) return rst
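
# A small numeric check of the decomposition described in the comments above
# (a standalone sketch with made-up shapes): for a weight vector a split into
# [a_l || a_r], the score a^T [Wh_i || Wh_j] equals a_l Wh_i + a_r Wh_j.
import torch

D = 8
a_l, a_r = torch.randn(D), torch.randn(D)
wh_i, wh_j = torch.randn(D), torch.randn(D)
concat_score = torch.cat([a_l, a_r]) @ torch.cat([wh_i, wh_j])
split_score = a_l @ wh_i + a_r @ wh_j
assert torch.allclose(concat_score, split_score)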
        msg_delta = self.W_h(node.data['accum_msg'] + node.data['alpha'])
        msg = torch.relu(msg_input + msg_delta)
        return {'msg': msg}


if PAPER:
    mpn_gather_msg = [
        DGLF.copy_edge(edge='msg', out='msg'),
        DGLF.copy_edge(edge='alpha', out='alpha')
    ]
else:
    mpn_gather_msg = DGLF.copy_edge(edge='msg', out='msg')

if PAPER:
    mpn_gather_reduce = [
        DGLF.sum(msg='msg', out='m'),
        DGLF.sum(msg='alpha', out='accum_alpha'),
    ]
else:
    mpn_gather_reduce = DGLF.sum(msg='msg', out='m')


class GatherUpdate(nn.Module):
    def __init__(self, hidden_size):
        super(GatherUpdate, self).__init__()
        self.hidden_size = hidden_size

        self.W_o = nn.Linear(ATOM_FDIM + hidden_size, hidden_size)

    def forward(self, node):
        if PAPER:
def forward(self, graph, n_feat):
    graph = graph.local_var()
    graph.ndata['h'] = n_feat
    graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
    n_feat += graph.ndata['h']
    return n_feat.view(graph.number_of_nodes() // 2, 2, -1).sum(1)
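
# A toy sketch of the same aggregation-plus-pairwise-readout pattern as the
# forward above (the 4-node graph and feature size are made up): four nodes
# form two pairs, so the readout has shape (2, feature_dim).
import torch
import dgl
import dgl.function as fn

g = dgl.graph(([0, 1, 2, 3], [1, 0, 3, 2]))          # two disconnected pairs
x = torch.ones(4, 5)
g.ndata['h'] = x
g.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))   # neighbour sums
x = x + g.ndata['h']
out = x.view(g.number_of_nodes() // 2, 2, -1).sum(1)  # pairwise readout
print(out.shape)                                      # torch.Size([2, 5])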
import dgl.function as fn
import torch.nn as nn

from Constants import EDGE_FEATURE_NAME

in_out_key = 'h'
edge_layer_msg = fn.copy_edge(edge=in_out_key, out='m')
edge_layer_reduce = fn.sum(msg='m', out=in_out_key)


class EdgeLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(EdgeLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g_and_features):
        if type(g_and_features) is tuple:
            g, _ = g_and_features
        else:
            g = g_and_features
        features = g.edata[EDGE_FEATURE_NAME]
        # Creating a local scope so that all the stored ndata and edata
        # (such as the `'h'` ndata below) are automatically popped out
        # when the scope exits.
        with g.local_scope():
            g.edata[in_out_key] = features
            g.update_all(edge_layer_msg, edge_layer_reduce)
            h = g.ndata[in_out_key]
            return self.linear(h)
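
# A minimal usage sketch for EdgeLayer (a hypothetical example; whatever
# string EDGE_FEATURE_NAME holds, the layer reads edge features under that
# key): every node receives the sum of its incoming edge features, projected
# by the linear layer.
import torch
import dgl

g = dgl.graph(([0, 1, 2], [1, 2, 0]))           # a 3-node directed cycle
g.edata[EDGE_FEATURE_NAME] = torch.randn(3, 4)  # one 4-dim feature per edge
layer = EdgeLayer(in_feats=4, out_feats=2)
out = layer(g)                                  # shape: (3, 2)
print(out.shape)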
    return g, features, labels, train_mask, test_mask


def evaluate(model, g, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)


if __name__ == "__main__":
    # Since the aggregation on a node u only involves summing the neighbors' representations h
    gcn_msg = fn.copy_src(src='h', out='m')
    gcn_reduce = fn.sum(msg="m", out="h")

    net = Net()
    g, features, labels, train_mask, test_mask = load_cora_data()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

    for epoch in range(50):
        net.train()
        logits = net(g, features)
        loss = nn.CrossEntropyLoss()
        output = loss(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        output.backward()
        optimizer.step()

        acc = evaluate(net, g, features, labels, test_mask)
        print("accuracy:", acc)
def propagate(self, g, weight, incidence_in, incidence_out):
    g.update_all(self.msg_func, fn.sum(msg='msg', out='h'), self.apply_func)
    return weight
def forward(self, g):
    g.apply_edges(self.update_edge)
    g.update_all(message_func=fn.u_mul_e('new_node', 'h', 'neighbor_info'),
                 reduce_func=fn.sum('neighbor_info', 'new_node'))
    return g.ndata["new_node"]
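
# The forward above expects `self.update_edge` to leave a per-edge tensor
# under edata['h'] that scales the source node's 'new_node' feature. A
# hypothetical method sketch of such an edge function (the gate below is an
# illustration only, not the original implementation; `torch` is assumed to
# be imported as elsewhere in this file):
def update_edge(self, edges):
    # e.g. a scalar gate per edge computed from the incident node states
    gate = torch.sigmoid(
        (edges.src['new_node'] * edges.dst['new_node']).sum(-1, keepdim=True))
    return {'h': gate}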
def forward(self, graph, feat): r"""Compute graph attention network layer. Parameters ---------- graph : DGLGraph The graph. feat : torch.Tensor or pair of torch.Tensor If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes. If a pair of torch.Tensor is given, the pair must contain two tensors of shape :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`. Returns ------- torch.Tensor The output feature of shape :math:`(N, H, D_{out})` where :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ graph = graph.local_var() if isinstance(feat, tuple): h_src = self.feat_drop(feat[0]) h_dst = self.feat_drop(feat[1]) feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats) feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats) else: h_src = h_dst = self.feat_drop(feat) feat_src = feat_dst = self.fc(h_src).view( -1, self._num_heads, self._out_feats) # NOTE: GAT paper uses "first concatenation then linear projection" # to compute attention scores, while ours is "first projection then # addition", the two approaches are mathematically equivalent: # We decompose the weight vector a mentioned in the paper into # [a_l || a_r], then # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j # Our implementation is much efficient because we do not need to # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, # addition could be optimized with DGL's built-in function u_add_v, # which further speeds up computation and saves memory footprint. el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1) er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1) graph.srcdata.update({'ft': feat_src, 'el': el}) graph.dstdata.update({'er': er}) # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively. graph.apply_edges(fn.u_add_v('el', 'er', 'e')) e = self.leaky_relu(graph.edata.pop('e')) e_soft = edge_softmax(graph, e) graph.edata['a'] = self.attn_drop(e_soft) # compute softmax # graph.edata['a'] = self.attn_drop(edge_softmax(graph, e)) # message passing graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft')) rst = graph.dstdata['ft'] # residual if self.res_fc is not None: resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats) rst = rst + resval # activation if self.activation: rst = self.activation(rst) return rst, e_soft
def test_pickling_graph(): # graph structures and frames are pickled g = dgl.DGLGraph() g.add_nodes(3) src = F.tensor([0, 0]) dst = F.tensor([1, 2]) g.add_edges(src, dst) x = F.randn((3, 7)) y = F.randn((3, 5)) a = F.randn((2, 6)) b = F.randn((2, 4)) g.ndata['x'] = x g.ndata['y'] = y g.edata['a'] = a g.edata['b'] = b # registered functions are pickled g.register_message_func(_global_message_func) reduce_func = fn.sum('x', 'x') g.register_reduce_func(reduce_func) # custom attributes should be pickled g.foo = 2 new_g = _reconstruct_pickle(g) _assert_is_identical(g, new_g) assert new_g.foo == 2 assert new_g._message_func == _global_message_func assert isinstance(new_g._reduce_func, type(reduce_func)) assert new_g._reduce_func._name == 'sum' assert new_g._reduce_func.msg_field == 'x' assert new_g._reduce_func.out_field == 'x' # test batched graph with partial set case g2 = dgl.DGLGraph() g2.add_nodes(4) src2 = F.tensor([0, 1]) dst2 = F.tensor([2, 3]) g2.add_edges(src2, dst2) x2 = F.randn((4, 7)) y2 = F.randn((3, 5)) a2 = F.randn((2, 6)) b2 = F.randn((2, 4)) g2.ndata['x'] = x2 g2.nodes[[0, 1, 3]].data['y'] = y2 g2.edata['a'] = a2 g2.edata['b'] = b2 bg = dgl.batch([g, g2]) bg2 = _reconstruct_pickle(bg) _assert_is_identical(bg, bg2) new_g, new_g2 = dgl.unbatch(bg2) _assert_is_identical(g, new_g) _assert_is_identical(g2, new_g2) # readonly graph g = dgl.DGLGraph([(0, 1), (1, 2)], readonly=True) new_g = _reconstruct_pickle(g) _assert_is_identical(g, new_g) # multigraph g = dgl.DGLGraph([(0, 1), (0, 1), (1, 2)]) new_g = _reconstruct_pickle(g) _assert_is_identical(g, new_g) # readonly multigraph g = dgl.DGLGraph([(0, 1), (0, 1), (1, 2)], readonly=True) new_g = _reconstruct_pickle(g) _assert_is_identical(g, new_g)
def graphsage_cv_train(g, ctx, args, n_classes, train_nid, test_nid, n_test_samples): features = g.ndata['features'] labels = g.ndata['labels'] in_feats = g.ndata['features'].shape[1] norm = mx.nd.expand_dims(1./g.in_degrees().astype('float32'), 1) g.ndata['norm'] = norm.as_in_context(ctx) degs = g.in_degrees().astype('float32').asnumpy() degs[degs > args.num_neighbors] = args.num_neighbors g.ndata['subg_norm'] = mx.nd.expand_dims(mx.nd.array(1./degs, ctx=ctx), 1) g.update_all(fn.copy_src(src='features', out='m'), fn.sum(msg='m', out='preprocess'), lambda node : {'preprocess': node.data['preprocess'] * node.data['norm']}) n_layers = args.n_layers for i in range(n_layers): g.ndata['h_{}'.format(i)] = mx.nd.zeros((features.shape[0], args.n_hidden), ctx=ctx) model = GraphSAGETrain(in_feats, args.n_hidden, n_classes, n_layers, args.dropout, prefix='GraphSAGE') model.initialize(ctx=ctx) loss_fcn = gluon.loss.SoftmaxCELoss() infer_model = GraphSAGEInfer(in_feats, args.n_hidden, n_classes, n_layers, prefix='GraphSAGE') infer_model.initialize(ctx=ctx) # use optimizer print(model.collect_params()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': args.lr, 'wd': args.weight_decay}, kvstore=mx.kv.create('local')) # initialize graph dur = [] for epoch in range(args.n_epochs): for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size, args.num_neighbors, neighbor_type='in', shuffle=True, num_workers=32, num_hops=n_layers, add_self_loop=True, seed_nodes=train_nid): for i in range(n_layers): agg_history_str = 'agg_h_{}'.format(i) g.pull(nf.layer_parent_nid(i+1), fn.copy_src(src='h_{}'.format(i), out='m'), fn.sum(msg='m', out=agg_history_str)) node_embed_names = [['preprocess', 'features', 'h_0']] for i in range(1, n_layers): node_embed_names.append(['h_{}'.format(i), 'agg_h_{}'.format(i-1), 'subg_norm', 'norm']) node_embed_names.append(['agg_h_{}'.format(n_layers-1), 'subg_norm', 'norm']) nf.copy_from_parent(node_embed_names=node_embed_names) # forward with mx.autograd.record(): pred = model(nf) batch_nids = nf.layer_parent_nid(-1).as_in_context(ctx) batch_labels = labels[batch_nids] loss = loss_fcn(pred, batch_labels) loss = loss.sum() / len(batch_nids) loss.backward() trainer.step(batch_size=1) node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)] node_embed_names.append([]) nf.copy_to_parent(node_embed_names=node_embed_names) infer_params = infer_model.collect_params() for key in infer_params: idx = trainer._param2idx[key] trainer._kvstore.pull(idx, out=infer_params[key].data()) num_acc = 0. num_tests = 0 for nf in dgl.contrib.sampling.NeighborSampler(g, args.test_batch_size, g.number_of_nodes(), neighbor_type='in', num_hops=n_layers, seed_nodes=test_nid, add_self_loop=True): node_embed_names = [['preprocess', 'features']] for i in range(n_layers): node_embed_names.append(['norm', 'subg_norm']) nf.copy_from_parent(node_embed_names=node_embed_names) pred = infer_model(nf) batch_nids = nf.layer_parent_nid(-1).as_in_context(ctx) batch_labels = labels[batch_nids] num_acc += (pred.argmax(axis=1) == batch_labels).sum().asscalar() num_tests += nf.layer_size(-1) break print("Test Accuracy {:.4f}". format(num_acc/num_tests))
def forward(self, g):
    # g: graph
    # inputs: node_num * emb_size
    # g is the graph and the inputs are the input node features
    # first set the node features
    g.update_all(fn.copy_src(src='h', out='m'),
                 fn.sum(msg='m', out='h'),
                 self.apply_func)
def forward(self, graph, feat):
    r"""

    Description
    -----------
    Compute GraphSAGE layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor or pair of torch.Tensor
        If a torch.Tensor is given, it represents the input feature of shape
        :math:`(N, D_{in})`
        where :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
        If a pair of torch.Tensor is given, the pair must contain two tensors of shape
        :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.
    """
    with graph.local_scope():
        if isinstance(feat, tuple):
            feat_src = self.feat_drop(feat[0])
            feat_dst = self.feat_drop(feat[1])
        else:
            feat_src = feat_dst = self.feat_drop(feat)
            if graph.is_block:
                feat_dst = feat_src[:graph.number_of_dst_nodes()]

        h_self = feat_dst

        # Handle the case of graphs without edges
        if graph.number_of_edges() == 0:
            graph.dstdata['neigh'] = torch.zeros(
                feat_dst.shape[0], self._in_src_feats).to(feat_dst)

        if self._aggre_type == 'mean':
            graph.srcdata['h'] = feat_src
            graph.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'neigh'))
            h_neigh = graph.dstdata['neigh']
        elif self._aggre_type == 'gcn':
            check_eq_shape(feat)
            graph.srcdata['h'] = feat_src
            graph.dstdata['h'] = feat_dst  # same as above if homogeneous
            graph.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'neigh'))
            # divide by in_degrees
            degs = graph.in_degrees().to(feat_dst)
            h_neigh = (graph.dstdata['neigh'] + graph.dstdata['h']) / (degs.unsqueeze(-1) + 1)
        elif self._aggre_type == 'pool':
            graph.srcdata['h'] = F.relu(self.fc_pool(feat_src))
            graph.update_all(fn.copy_src('h', 'm'), fn.max('m', 'neigh'))
            h_neigh = graph.dstdata['neigh']
        elif self._aggre_type == 'lstm':
            graph.srcdata['h'] = feat_src
            graph.update_all(fn.copy_src('h', 'm'), self._lstm_reducer)
            h_neigh = graph.dstdata['neigh']
        elif self._aggre_type == 'ginmean':
            graph.srcdata['h'] = feat_src
            graph.update_all(fn.copy_src('h', 'm'), self._gin_reducer('m', 'neigh'))
            h_neigh = graph.dstdata['neigh']
        elif self._aggre_type == 'cheb':
            def unnLaplacian(feat, D_invsqrt_left, D_invsqrt_right, graph):
                """Operation Feat * D^-1/2 A D^-1/2; written as a matrix
                product this is D^-1/2 A D^-1/2 Feat."""
                #tmp = torch.zeros((D_invsqrt.shape[0], D_invsqrt.shape[0])).to(graph.device)
                # sparse tensors have no broadcasting here; this also relies on the
                # src nodes being laid out contiguously from 0 in `feat`
                #print("adj : ", graph.adj(transpose=False, ctx=graph.device).shape)
                #graph.srcdata['h'] = (torch.mm((graph.adj(transpose=False, ctx=graph.device)), (feat * D_invsqrt))) * D_invsqrt[::graph.number_of_dst_nodes()]
                #graph.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
                #return graph.srcdata['h']
                graph.srcdata['h'] = feat * D_invsqrt_right  # feat is srcfeat
                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
                return graph.dstdata.pop('h') * D_invsqrt_left

            D_invsqrt_right = torch.pow(
                graph.out_degrees().float().clamp(min=1), -0.5).unsqueeze(-1)
            D_invsqrt_left = torch.pow(
                graph.in_degrees().float().clamp(min=1), -0.5).unsqueeze(-1)
            #print("D_invsqrt shape: ", D_invsqrt.shape)
            #print(graph.__dict__)
            #print(dir(graph))
            #graph.srcdata['h'] = feat_src
            #graph.dstdata['h'] = feat_dst
            #g = dgl.to_homogeneous(graph, ndata=['h'])
            #dgl._ffi.base.DGLError: Expect number of features to match number of nodes (len(u)). Got 70 and 76 instead.
            #print(g)
            # since the block is different every time, it is safer to call dgl's
            # method each time instead of precomputing lambda_max
            try:
                lambda_max = laplacian_lambda_max(graph)
            except BaseException:
                # if the largest eigenvalue is not found
                dgl_warning(
                    "Largest eigenvalue not found, using default value 2 for lambda_max",
                    RuntimeWarning)
                lambda_max = torch.tensor(2)  # .to(feat.device)
            if isinstance(lambda_max, list):
                lambda_max = torch.tensor(lambda_max)  # .to(feat.device)
            if lambda_max.dim() == 1:
                lambda_max = lambda_max.unsqueeze(-1)  # (B,) to (B, 1)
            # broadcast from (B, 1) to (N, 1)
            # lambda_max = lambda_max * torch.ones((feat.shape[0], 1))
            #re_norm = (2 / lambda_max) * torch.ones((graph.number_of_dst_nodes(), 1)).to(graph.device)
            re_norm = (2 / lambda_max.to(graph.device)) * torch.ones(
                (graph.number_of_dst_nodes(), 1), device=graph.device)

            self._cheb_Xt = X_0 = feat_dst

            graph.srcdata['h'] = feat_src * D_invsqrt_right  # feat is srcfeat
            graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
            X_1 = -re_norm * graph.dstdata['h'] * D_invsqrt_left + X_0 * (re_norm - 1)
            self._cheb_Xt = torch.cat((self._cheb_Xt, X_1.float()), 1)
        else:
            raise KeyError('Aggregator type {} not recognized.'.format(
                self._aggre_type))

        # GraphSAGE GCN does not require fc_self.
        if self._aggre_type == 'gcn':
            rst = self.fc_neigh(h_neigh)
        elif self._aggre_type == 'ginmean':
            rst = (1 + self.eps) * h_self + h_neigh
            rst = self.fc_gin(rst)
            if self.norm is not None:
                rst = self.norm(rst)
            return rst
        elif self._aggre_type == 'cheb':
            rst = self._cheb_linear(self._cheb_Xt)
        else:
            rst = self.fc_self(h_self) + self.fc_neigh(h_neigh)
        # activation
        if self.activation is not None:
            rst = self.activation(rst)
        # normalization
        if self.norm is not None:
            rst = self.norm(rst)
        return rst
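
# A reading of the 'cheb' branch above (a sketch of the math inferred from the
# code, with in-degrees on the left and out-degrees on the right of the
# normalization): with :math:`\hat{A} = D^{-1/2} A D^{-1/2}` and the rescaled
# Laplacian :math:`\tilde{L} = \frac{2}{\lambda_{max}} (I - \hat{A}) - I`,
# the branch stacks the first two Chebyshev terms
# :math:`X_0 = X` and :math:`X_1 = \tilde{L} X_0`
# and feeds the concatenation :math:`[X_0 \,\|\, X_1]` to `self._cheb_linear`.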
def main(args): # load and preprocess dataset data = load_data(args) if args.self_loop and not args.dataset.startswith('reddit'): data.graph.add_edges_from([(i, i) for i in range(len(data.graph))]) train_nid = np.nonzero(data.train_mask)[0].astype(np.int64) test_nid = np.nonzero(data.test_mask)[0].astype(np.int64) features = torch.FloatTensor(data.features) labels = torch.LongTensor(data.labels) if hasattr(torch, 'BoolTensor'): train_mask = torch.BoolTensor(data.train_mask) val_mask = torch.BoolTensor(data.val_mask) test_mask = torch.BoolTensor(data.test_mask) else: train_mask = torch.ByteTensor(data.train_mask) val_mask = torch.ByteTensor(data.val_mask) test_mask = torch.ByteTensor(data.test_mask) in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() n_train_samples = train_mask.sum().item() n_val_samples = val_mask.sum().item() n_test_samples = test_mask.sum().item() print("""----Data statistics------' #Edges %d #Classes %d #Train samples %d #Val samples %d #Test samples %d""" % (n_edges, n_classes, n_train_samples, n_val_samples, n_test_samples)) # create GCN model g = DGLGraph(data.graph, readonly=True) norm = 1. / g.in_degrees().float().unsqueeze(1) if args.gpu < 0: cuda = False else: cuda = True torch.cuda.set_device(args.gpu) features = features.cuda() labels = labels.cuda() train_mask = train_mask.cuda() val_mask = val_mask.cuda() test_mask = test_mask.cuda() norm = norm.cuda() g.ndata['features'] = features num_neighbors = args.num_neighbors n_layers = args.n_layers g.ndata['norm'] = norm g.update_all( fn.copy_src(src='features', out='m'), fn.sum(msg='m', out='preprocess'), lambda node: {'preprocess': node.data['preprocess'] * node.data['norm']}) for i in range(n_layers): g.ndata['h_{}'.format(i)] = torch.zeros( features.shape[0], args.n_hidden).to(device=features.device) g.ndata['h_{}'.format(n_layers - 1)] = torch.zeros( features.shape[0], 2 * args.n_hidden).to(device=features.device) model = GCNSampling(in_feats, args.n_hidden, n_classes, n_layers, F.relu, args.dropout) loss_fcn = nn.CrossEntropyLoss() infer_model = GCNInfer(in_feats, args.n_hidden, n_classes, n_layers, F.relu) if cuda: model.cuda() infer_model.cuda() # use optimizer optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) for epoch in range(args.n_epochs): for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size, num_neighbors, neighbor_type='in', shuffle=True, num_workers=32, num_hops=n_layers, seed_nodes=train_nid): for i in range(n_layers): agg_history_str = 'agg_h_{}'.format(i) g.pull( nf.layer_parent_nid(i + 1).long(), fn.copy_src(src='h_{}'.format(i), out='m'), fn.sum(msg='m', out=agg_history_str), lambda node: { agg_history_str: node.data[agg_history_str] * node.data['norm'] }) node_embed_names = [['preprocess', 'h_0']] for i in range(1, n_layers): node_embed_names.append( ['h_{}'.format(i), 'agg_h_{}'.format(i - 1)]) node_embed_names.append(['agg_h_{}'.format(n_layers - 1)]) nf.copy_from_parent(node_embed_names=node_embed_names) model.train() # forward pred = model(nf) batch_nids = nf.layer_parent_nid(-1).to(device=pred.device).long() batch_labels = labels[batch_nids] loss = loss_fcn(pred, batch_labels) optimizer.zero_grad() loss.backward() optimizer.step() node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)] node_embed_names.append([]) nf.copy_to_parent(node_embed_names=node_embed_names) for infer_param, param in zip(infer_model.parameters(), model.parameters()): infer_param.data.copy_(param.data) 
num_acc = 0. for nf in dgl.contrib.sampling.NeighborSampler(g, args.test_batch_size, g.number_of_nodes(), neighbor_type='in', num_workers=32, num_hops=n_layers, seed_nodes=test_nid): node_embed_names = [['preprocess']] for i in range(n_layers): node_embed_names.append(['norm']) nf.copy_from_parent(node_embed_names=node_embed_names) infer_model.eval() with torch.no_grad(): pred = infer_model(nf) batch_nids = nf.layer_parent_nid(-1).to( device=pred.device).long() batch_labels = labels[batch_nids] num_acc += (pred.argmax( dim=1) == batch_labels).sum().cpu().item() print("Test Accuracy {:.4f}".format(num_acc / n_test_samples))
def graphsage_cv_train(g, ctx, args, n_classes, train_nid, test_nid, n_test_samples, distributed): features = g.ndata['features'] labels = g.ndata['labels'] in_feats = g.ndata['features'].shape[1] g_ctx = features.context norm = mx.nd.expand_dims(1. / g.in_degrees().astype('float32'), 1) g.ndata['norm'] = norm.as_in_context(g_ctx) degs = g.in_degrees().astype('float32').asnumpy() degs[degs > args.num_neighbors] = args.num_neighbors g.ndata['subg_norm'] = mx.nd.expand_dims(mx.nd.array(1. / degs, ctx=g_ctx), 1) n_layers = args.n_layers if distributed: g.dist_update_all( fn.copy_src(src='features', out='m'), fn.sum(msg='m', out='preprocess'), lambda node: {'preprocess': node.data['preprocess'] * node.data['norm']}) for i in range(n_layers): g.init_ndata('h_{}'.format(i), (features.shape[0], args.n_hidden), 'float32') g.init_ndata('agg_h_{}'.format(i), (features.shape[0], args.n_hidden), 'float32') else: g.update_all( fn.copy_src(src='features', out='m'), fn.sum(msg='m', out='preprocess'), lambda node: {'preprocess': node.data['preprocess'] * node.data['norm']}) for i in range(n_layers): g.ndata['h_{}'.format(i)] = mx.nd.zeros( (features.shape[0], args.n_hidden), ctx=g_ctx) g.ndata['agg_h_{}'.format(i)] = mx.nd.zeros( (features.shape[0], args.n_hidden), ctx=g_ctx) model = GraphSAGETrain(in_feats, args.n_hidden, n_classes, n_layers, args.dropout, prefix='GraphSAGE') model.initialize(ctx=ctx) loss_fcn = gluon.loss.SoftmaxCELoss() infer_model = GraphSAGEInfer(in_feats, args.n_hidden, n_classes, n_layers, prefix='GraphSAGE') infer_model.initialize(ctx=ctx) # use optimizer print(model.collect_params()) kv_type = 'dist_sync' if distributed else 'local' trainer = gluon.Trainer(model.collect_params(), 'adam', { 'learning_rate': args.lr, 'wd': args.weight_decay }, kvstore=mx.kv.create(kv_type)) # initialize graph dur = [] adj = g.adjacency_matrix().as_in_context(g_ctx) for epoch in range(args.n_epochs): start = time.time() if distributed: msg_head = "Worker {:d}, epoch {:d}".format(g.worker_id, epoch) else: msg_head = "epoch {:d}".format(epoch) for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size, args.num_neighbors, neighbor_type='in', shuffle=True, num_workers=32, num_hops=n_layers, add_self_loop=True, seed_nodes=train_nid): for i in range(n_layers): agg_history_str = 'agg_h_{}'.format(i) dests = nf.layer_parent_nid(i + 1).as_in_context(g_ctx) # TODO we could use DGLGraph.pull to implement this, but the current # implementation of pull is very slow. Let's manually do it for now. 
g.ndata[agg_history_str][dests] = mx.nd.dot( mx.nd.take(adj, dests), g.ndata['h_{}'.format(i)]) node_embed_names = [['preprocess', 'features', 'h_0']] for i in range(1, n_layers): node_embed_names.append([ 'h_{}'.format(i), 'agg_h_{}'.format(i - 1), 'subg_norm', 'norm' ]) node_embed_names.append( ['agg_h_{}'.format(n_layers - 1), 'subg_norm', 'norm']) nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx) # forward with mx.autograd.record(): pred = model(nf) batch_nids = nf.layer_parent_nid(-1) batch_labels = labels[batch_nids].as_in_context(ctx) loss = loss_fcn(pred, batch_labels) if distributed: loss = loss.sum() / (len(batch_nids) * g.num_workers) else: loss = loss.sum() / (len(batch_nids)) loss.backward() trainer.step(batch_size=1) node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)] node_embed_names.append([]) nf.copy_to_parent(node_embed_names=node_embed_names) print(msg_head + ': training takes ' + str(time.time() - start)) infer_params = infer_model.collect_params() for key in infer_params: idx = trainer._param2idx[key] trainer._kvstore.pull(idx, out=infer_params[key].data()) num_acc = 0. num_tests = 0 if not distributed or g.worker_id == 0: start = time.time() for nf in dgl.contrib.sampling.NeighborSampler( g, args.test_batch_size, g.number_of_nodes(), neighbor_type='in', num_hops=n_layers, seed_nodes=test_nid, add_self_loop=True): node_embed_names = [['preprocess', 'features']] for i in range(n_layers): node_embed_names.append(['norm', 'subg_norm']) nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx) pred = infer_model(nf) batch_nids = nf.layer_parent_nid(-1) batch_labels = labels[batch_nids].as_in_context(ctx) num_acc += (pred.argmax( axis=1) == batch_labels).sum().asscalar() num_tests += nf.layer_size(-1) if distributed: g._sync_barrier() print(msg_head + ": Test Accuracy {:.4f}".format(num_acc / num_tests)) break elif distributed: g._sync_barrier()
def check_partition(g, part_method, reshuffle): g.ndata['labels'] = F.arange(0, g.number_of_nodes()) g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32) g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10), F.float32) g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) num_parts = 4 num_hops = 2 partition_graph(g, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=reshuffle) part_sizes = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _ = load_partition('/tmp/partition/test.json', i) # Check the metadata assert gpb._num_nodes() == g.number_of_nodes() assert gpb._num_edges() == g.number_of_edges() assert gpb.num_partitions() == num_parts gpb_meta = gpb.metadata() assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]['num_nodes'] assert len(gpb.partid2eids(i)) == gpb_meta[i]['num_edges'] part_sizes.append((gpb_meta[i]['num_nodes'], gpb_meta[i]['num_edges'])) local_nid = gpb.nid2localnid(F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']), i) assert F.dtype(local_nid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) local_eid = gpb.eid2localeid(F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']), i) assert F.dtype(local_eid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_eid) == np.arange(0, len(local_eid))) # Check the node map. local_nodes = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) llocal_nodes = F.nonzero_1d(part_g.ndata['inner_node']) local_nodes1 = gpb.partid2nids(i) assert F.dtype(local_nodes1) in (F.int32, F.int64) assert np.all(np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy(local_nodes1))) # Check the edge map. local_edges = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) local_edges1 = gpb.partid2eids(i) assert F.dtype(local_edges1) in (F.int32, F.int64) assert np.all(np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy(local_edges1))) if reshuffle: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata['orig_id']) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata['orig_id']) # when we read node data from the original global graph, we should use orig_id. 
local_nodes = F.boolean_mask(part_g.ndata['orig_id'], part_g.ndata['inner_node']) local_edges = F.boolean_mask(part_g.edata['orig_id'], part_g.edata['inner_edge']) else: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata[dgl.NID]) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata[dgl.NID]) part_g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) part_g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) assert F.allclose(F.gather_row(g.ndata['h'], local_nodes), F.gather_row(part_g.ndata['h'], llocal_nodes)) assert F.allclose(F.gather_row(g.ndata['eh'], local_nodes), F.gather_row(part_g.ndata['eh'], llocal_nodes)) for name in ['labels', 'feats']: assert name in node_feats assert node_feats[name].shape[0] == len(local_nodes) assert np.all(F.asnumpy(g.ndata[name])[F.asnumpy(local_nodes)] == F.asnumpy(node_feats[name])) for name in ['feats']: assert name in edge_feats assert edge_feats[name].shape[0] == len(local_edges) assert np.all(F.asnumpy(g.edata[name])[F.asnumpy(local_edges)] == F.asnumpy(edge_feats[name])) if reshuffle: node_map = [] edge_map = [] for i, (num_nodes, num_edges) in enumerate(part_sizes): node_map.append(np.ones(num_nodes) * i) edge_map.append(np.ones(num_edges) * i) node_map = np.concatenate(node_map) edge_map = np.concatenate(edge_map) nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) assert F.dtype(nid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(nid2pid) == node_map) eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) assert F.dtype(eid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(eid2pid) == edge_map)