def check_partition(g, part_method, reshuffle, num_parts=4, num_trainers_per_machine=1, load_feats=True): g.ndata['labels'] = F.arange(0, g.number_of_nodes()) g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10), F.float32) g.edata['feats'] = F.tensor(np.random.randn(g.number_of_edges(), 10), F.float32) g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) num_hops = 2 orig_nids, orig_eids = partition_graph( g, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=reshuffle, return_mapping=True, num_trainers_per_machine=num_trainers_per_machine) part_sizes = [] shuffled_labels = [] shuffled_edata = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition( '/tmp/partition/test.json', i, load_feats=load_feats) if not load_feats: assert not node_feats assert not edge_feats node_feats, edge_feats = load_partition_feats( '/tmp/partition/test.json', i) if num_trainers_per_machine > 1: for ntype in g.ntypes: name = ntype + '/trainer_id' assert name in node_feats part_ids = F.floor_div(node_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) for etype in g.etypes: name = etype + '/trainer_id' assert name in edge_feats part_ids = F.floor_div(edge_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) # Check the metadata assert gpb._num_nodes() == g.number_of_nodes() assert gpb._num_edges() == g.number_of_edges() assert gpb.num_partitions() == num_parts gpb_meta = gpb.metadata() assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]['num_nodes'] assert len(gpb.partid2eids(i)) == gpb_meta[i]['num_edges'] part_sizes.append((gpb_meta[i]['num_nodes'], gpb_meta[i]['num_edges'])) nid = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) local_nid = gpb.nid2localnid(nid, i) assert F.dtype(local_nid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) eid = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) local_eid = gpb.eid2localeid(eid, i) assert F.dtype(local_eid) in (F.int64, F.int32) assert np.all(F.asnumpy(local_eid) == np.arange(0, len(local_eid))) # Check the node map. local_nodes = F.boolean_mask(part_g.ndata[dgl.NID], part_g.ndata['inner_node']) llocal_nodes = F.nonzero_1d(part_g.ndata['inner_node']) local_nodes1 = gpb.partid2nids(i) assert F.dtype(local_nodes1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy( local_nodes1))) assert np.all(F.asnumpy(llocal_nodes) == np.arange(len(llocal_nodes))) # Check the edge map. local_edges = F.boolean_mask(part_g.edata[dgl.EID], part_g.edata['inner_edge']) llocal_edges = F.nonzero_1d(part_g.edata['inner_edge']) local_edges1 = gpb.partid2eids(i) assert F.dtype(local_edges1) in (F.int32, F.int64) assert np.all( np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy( local_edges1))) assert np.all(F.asnumpy(llocal_edges) == np.arange(len(llocal_edges))) # Verify the mapping between the reshuffled IDs and the original IDs. part_src_ids, part_dst_ids = part_g.edges() part_src_ids = F.gather_row(part_g.ndata[dgl.NID], part_src_ids) part_dst_ids = F.gather_row(part_g.ndata[dgl.NID], part_dst_ids) part_eids = part_g.edata[dgl.EID] orig_src_ids = F.gather_row(orig_nids, part_src_ids) orig_dst_ids = F.gather_row(orig_nids, part_dst_ids) orig_eids1 = F.gather_row(orig_eids, part_eids) orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids) assert F.shape(orig_eids1)[0] == F.shape(orig_eids2)[0] assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) if reshuffle: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata['orig_id']) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata['orig_id']) # when we read node data from the original global graph, we should use orig_id. local_nodes = F.boolean_mask(part_g.ndata['orig_id'], part_g.ndata['inner_node']) local_edges = F.boolean_mask(part_g.edata['orig_id'], part_g.edata['inner_edge']) else: part_g.ndata['feats'] = F.gather_row(g.ndata['feats'], part_g.ndata[dgl.NID]) part_g.edata['feats'] = F.gather_row(g.edata['feats'], part_g.edata[dgl.NID]) part_g.update_all(fn.copy_src('feats', 'msg'), fn.sum('msg', 'h')) part_g.update_all(fn.copy_edge('feats', 'msg'), fn.sum('msg', 'eh')) assert F.allclose(F.gather_row(g.ndata['h'], local_nodes), F.gather_row(part_g.ndata['h'], llocal_nodes)) assert F.allclose(F.gather_row(g.ndata['eh'], local_nodes), F.gather_row(part_g.ndata['eh'], llocal_nodes)) for name in ['labels', 'feats']: assert '_N/' + name in node_feats assert node_feats['_N/' + name].shape[0] == len(local_nodes) true_feats = F.gather_row(g.ndata[name], local_nodes) ndata = F.gather_row(node_feats['_N/' + name], local_nid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(ndata)) for name in ['feats']: assert '_E/' + name in edge_feats assert edge_feats['_E/' + name].shape[0] == len(local_edges) true_feats = F.gather_row(g.edata[name], local_edges) edata = F.gather_row(edge_feats['_E/' + name], local_eid) assert np.all(F.asnumpy(true_feats) == F.asnumpy(edata)) # This only works if node/edge IDs are shuffled. if reshuffle: shuffled_labels.append(node_feats['_N/labels']) shuffled_edata.append(edge_feats['_E/feats']) # Verify that we can reconstruct node/edge data for original IDs. if reshuffle: shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) orig_labels[F.asnumpy(orig_nids)] = shuffled_labels orig_edata[F.asnumpy(orig_eids)] = shuffled_edata assert np.all(orig_labels == F.asnumpy(g.ndata['labels'])) assert np.all(orig_edata == F.asnumpy(g.edata['feats'])) if reshuffle: node_map = [] edge_map = [] for i, (num_nodes, num_edges) in enumerate(part_sizes): node_map.append(np.ones(num_nodes) * i) edge_map.append(np.ones(num_edges) * i) node_map = np.concatenate(node_map) edge_map = np.concatenate(edge_map) nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) assert F.dtype(nid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(nid2pid) == node_map) eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) assert F.dtype(eid2pid) in (F.int32, F.int64) assert np.all(F.asnumpy(eid2pid) == edge_map)
def test_split(): #prepare_dist() g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] # The code now collects the roles of all client processes and use the information # to determine how to split the workloads. Here is to simulate the multi-client # use case. def set_roles(num_clients): dgl.distributed.role.CUR_ROLE = 'default' dgl.distributed.role.GLOBAL_RANK = {i: i for i in range(num_clients)} dgl.distributed.role.PER_ROLE_RANK['default'] = { i: i for i in range(num_clients) } for i in range(num_parts): set_roles(num_parts) part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( '/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes1 = np.intersect1d(selected_nodes, F.asnumpy(local_nids)) nodes2 = node_split(node_mask, gpb, rank=i, force_even=False) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2))) local_nids = F.asnumpy(local_nids) for n in nodes1: assert n in local_nids set_roles(num_parts * 2) nodes3 = node_split(node_mask, gpb, rank=i * 2, force_even=False) nodes4 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=False) nodes5 = F.cat([nodes3, nodes4], 0) assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5))) set_roles(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges1 = np.intersect1d(selected_edges, F.asnumpy(local_eids)) edges2 = edge_split(edge_mask, gpb, rank=i, force_even=False) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2))) local_eids = F.asnumpy(local_eids) for e in edges1: assert e in local_eids set_roles(num_parts * 2) edges3 = edge_split(edge_mask, gpb, rank=i * 2, force_even=False) edges4 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=False) edges5 = F.cat([edges3, edges4], 0) assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))
def _test_layer_sampler(prefetch=False): g = generate_rand_graph(100) nid = g.nodes() src, dst, eid = g.all_edges(form='all', order='eid') n_batches = 5 batch_size = 50 seed_batches = [ np.sort(np.random.choice(F.asnumpy(nid), batch_size, replace=False)) for i in range(n_batches) ] seed_nodes = np.hstack(seed_batches) layer_sizes = [50] * 3 LayerSampler = getattr(dgl.contrib.sampling, 'LayerSampler') sampler = LayerSampler(g, batch_size, layer_sizes, 'in', seed_nodes=seed_nodes, num_workers=4, prefetch=prefetch) for sub_g in sampler: assert all( sub_g.layer_size(i) < size for i, size in enumerate(layer_sizes)) sub_nid = F.arange(0, sub_g.number_of_nodes()) assert all( np.all(np.isin(F.asnumpy(sub_g.layer_nid(i)), F.asnumpy(sub_nid))) for i in range(sub_g.num_layers)) assert np.all( np.isin(F.asnumpy(sub_g.map_to_parent_nid(sub_nid)), F.asnumpy(nid))) sub_eid = F.arange(0, sub_g.number_of_edges()) assert np.all( np.isin(F.asnumpy(sub_g.map_to_parent_eid(sub_eid)), F.asnumpy(eid))) assert any( np.all( np.sort(F.asnumpy(sub_g.layer_parent_nid(-1))) == seed_batch) for seed_batch in seed_batches) sub_src, sub_dst = sub_g.all_edges(order='eid') for i in range(sub_g.num_blocks): block_eid = sub_g.block_eid(i) block_src = sub_g.map_to_parent_nid( F.gather_row(sub_src, block_eid)) block_dst = sub_g.map_to_parent_nid( F.gather_row(sub_dst, block_eid)) block_parent_eid = sub_g.block_parent_eid(i) block_parent_src = F.gather_row(src, block_parent_eid) block_parent_dst = F.gather_row(dst, block_parent_eid) assert np.all(F.asnumpy(block_src == block_parent_src)) n_layers = sub_g.num_layers sub_n = sub_g.number_of_nodes() assert sum(F.shape(sub_g.layer_nid(i))[0] for i in range(n_layers)) == sub_n n_blocks = sub_g.num_blocks sub_m = sub_g.number_of_edges() assert sum(F.shape(sub_g.block_eid(i))[0] for i in range(n_blocks)) == sub_m
def test_convert(): hg = create_test_heterograph() hs = [] for ntype in hg.ntypes: h = F.randn((hg.number_of_nodes(ntype), 5)) hg.nodes[ntype].data['h'] = h hs.append(h) hg.nodes['user'].data['x'] = F.randn((3, 3)) ws = [] for etype in hg.canonical_etypes: w = F.randn((hg.number_of_edges(etype), 5)) hg.edges[etype].data['w'] = w ws.append(w) hg.edges['plays'].data['x'] = F.randn((4, 3)) g = dgl.to_homo(hg) assert F.array_equal(F.cat(hs, dim=0), g.ndata['h']) assert 'x' not in g.ndata assert F.array_equal(F.cat(ws, dim=0), g.edata['w']) assert 'x' not in g.edata src, dst = g.all_edges(order='eid') src = F.asnumpy(src) dst = F.asnumpy(dst) etype_id, eid = F.asnumpy(g.edata[dgl.ETYPE]), F.asnumpy(g.edata[dgl.EID]) ntype_id, nid = F.asnumpy(g.ndata[dgl.NTYPE]), F.asnumpy(g.ndata[dgl.NID]) for i in range(g.number_of_edges()): srctype = hg.ntypes[ntype_id[src[i]]] dsttype = hg.ntypes[ntype_id[dst[i]]] etype = hg.etypes[etype_id[i]] src_i, dst_i = hg.find_edges([eid[i]], (srctype, etype, dsttype)) assert np.asscalar(F.asnumpy(src_i)) == nid[src[i]] assert np.asscalar(F.asnumpy(dst_i)) == nid[dst[i]] mg = nx.MultiDiGraph([('user', 'user', 'follows'), ('user', 'game', 'plays'), ('user', 'game', 'wishes'), ('developer', 'game', 'develops')]) for _mg in [None, mg]: hg2 = dgl.to_hetero(g, ['user', 'game', 'developer'], ['follows', 'plays', 'wishes', 'develops'], ntype_field=dgl.NTYPE, etype_field=dgl.ETYPE, metagraph=_mg) assert set(hg.ntypes) == set(hg2.ntypes) assert set(hg.canonical_etypes) == set(hg2.canonical_etypes) for ntype in hg.ntypes: assert hg.number_of_nodes(ntype) == hg2.number_of_nodes(ntype) assert F.array_equal(hg.nodes[ntype].data['h'], hg2.nodes[ntype].data['h']) for canonical_etype in hg.canonical_etypes: src, dst = hg.all_edges(etype=canonical_etype, order='eid') src2, dst2 = hg2.all_edges(etype=canonical_etype, order='eid') assert F.array_equal(src, src2) assert F.array_equal(dst, dst2) assert F.array_equal(hg.edges[canonical_etype].data['w'], hg2.edges[canonical_etype].data['w']) # hetero_from_homo test case 2 g = dgl.graph([(0, 2), (1, 2), (2, 3), (0, 3)]) g.ndata[dgl.NTYPE] = F.tensor([0, 0, 1, 2]) g.edata[dgl.ETYPE] = F.tensor([0, 0, 1, 2]) hg = dgl.to_hetero(g, ['l0', 'l1', 'l2'], ['e0', 'e1', 'e2']) assert set(hg.canonical_etypes) == set([('l0', 'e0', 'l1'), ('l1', 'e1', 'l2'), ('l0', 'e2', 'l2')]) assert hg.number_of_nodes('l0') == 2 assert hg.number_of_nodes('l1') == 1 assert hg.number_of_nodes('l2') == 1 assert hg.number_of_edges('e0') == 2 assert hg.number_of_edges('e1') == 1 assert hg.number_of_edges('e2') == 1 # hetero_from_homo test case 3 mg = nx.MultiDiGraph([('user', 'movie', 'watches'), ('user', 'TV', 'watches')]) g = dgl.graph([(0, 1), (0, 2)]) g.ndata[dgl.NTYPE] = F.tensor([0, 1, 2]) g.edata[dgl.ETYPE] = F.tensor([0, 0]) for _mg in [None, mg]: hg = dgl.to_hetero(g, ['user', 'TV', 'movie'], ['watches'], metagraph=_mg) assert set(hg.canonical_etypes) == set([('user', 'watches', 'movie'), ('user', 'watches', 'TV')]) assert hg.number_of_nodes('user') == 1 assert hg.number_of_nodes('TV') == 1 assert hg.number_of_nodes('movie') == 1 assert hg.number_of_edges(('user', 'watches', 'TV')) == 1 assert hg.number_of_edges(('user', 'watches', 'movie')) == 1 assert len(hg.etypes) == 2 # hetero_to_homo test case 2 hg = dgl.bipartite([(0, 0), (1, 1)], card=(2, 3)) g = dgl.to_homo(hg) assert g.number_of_nodes() == 5
def check_dist_emb(g, num_clients, num_nodes, num_edges): from dgl.distributed.optim import SparseAdagrad from dgl.distributed.nn import NodeEmbedding # Test sparse emb try: emb = NodeEmbedding(g.number_of_nodes(), 1, 'emb1', emb_init) lr = 0.001 optimizer = SparseAdagrad([emb], lr=lr) with F.record_grad(): feats = emb(nids) assert np.all(F.asnumpy(feats) == np.zeros((len(nids), 1))) loss = F.sum(feats + 1, 0) loss.backward() optimizer.step() feats = emb(nids) if num_clients == 1: assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * -lr) rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids)) feats1 = emb(rest) assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1))) policy = dgl.distributed.PartitionPolicy('node', g.get_partition_book()) grad_sum = dgl.distributed.DistTensor((g.number_of_nodes(), ), F.float32, 'emb1_sum', policy) if num_clients == 1: assert np.all( F.asnumpy(grad_sum[nids]) == np.ones((len(nids), 1)) * num_clients) assert np.all(F.asnumpy(grad_sum[rest]) == np.zeros((len(rest), 1))) emb = NodeEmbedding(g.number_of_nodes(), 1, 'emb2', emb_init) with F.no_grad(): feats1 = emb(nids) assert np.all(F.asnumpy(feats1) == 0) optimizer = SparseAdagrad([emb], lr=lr) with F.record_grad(): feats1 = emb(nids) feats2 = emb(nids) feats = F.cat([feats1, feats2], 0) assert np.all(F.asnumpy(feats) == np.zeros((len(nids) * 2, 1))) loss = F.sum(feats + 1, 0) loss.backward() optimizer.step() with F.no_grad(): feats = emb(nids) if num_clients == 1: assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * math.sqrt(2) * -lr) rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids)) feats1 = emb(rest) assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1))) except NotImplementedError as e: pass
def test_to_simple(index_dtype): # homogeneous graph g = dgl.graph((F.tensor([0, 1, 2, 1]), F.tensor([1, 2, 0, 2]))) g.ndata['h'] = F.tensor([[0.], [1.], [2.]]) g.edata['h'] = F.tensor([[3.], [4.], [5.], [6.]]) sg, wb = dgl.to_simple(g, writeback_mapping=True) u, v = g.all_edges(form='uv', order='eid') u = F.asnumpy(u).tolist() v = F.asnumpy(v).tolist() uv = list(zip(u, v)) eid_map = F.asnumpy(wb) su, sv = sg.all_edges(form='uv', order='eid') su = F.asnumpy(su).tolist() sv = F.asnumpy(sv).tolist() suv = list(zip(su, sv)) sc = F.asnumpy(sg.edata['count']) assert set(uv) == set(suv) for i, e in enumerate(suv): assert sc[i] == sum(e == _e for _e in uv) for i, e in enumerate(uv): assert eid_map[i] == suv.index(e) # shared ndata assert F.array_equal(sg.ndata['h'], g.ndata['h']) assert 'h' not in sg.edata # new ndata to sg sg.ndata['hh'] = F.tensor([[0.], [1.], [2.]]) assert 'hh' not in g.ndata sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False) assert 'h' not in sg.ndata assert 'h' not in sg.edata # heterogeneous graph g = dgl.heterograph( { ('user', 'follow', 'user'): ([0, 1, 2, 1, 1, 1], [1, 3, 2, 3, 4, 4]), ('user', 'plays', 'game'): ([3, 2, 1, 1, 3, 2, 2], [5, 3, 4, 4, 5, 3, 3]) }, index_dtype=index_dtype) g.nodes['user'].data['h'] = F.tensor([0, 1, 2, 3, 4]) g.nodes['user'].data['hh'] = F.tensor([0, 1, 2, 3, 4]) g.edges['follow'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5]) sg, wb = dgl.to_simple(g, return_counts='weights', writeback_mapping=True, copy_edata=True) g.nodes['game'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5]) for etype in g.canonical_etypes: u, v = g.all_edges(form='uv', order='eid', etype=etype) u = F.asnumpy(u).tolist() v = F.asnumpy(v).tolist() uv = list(zip(u, v)) eid_map = F.asnumpy(wb[etype]) su, sv = sg.all_edges(form='uv', order='eid', etype=etype) su = F.asnumpy(su).tolist() sv = F.asnumpy(sv).tolist() suv = list(zip(su, sv)) sw = F.asnumpy(sg.edges[etype].data['weights']) assert set(uv) == set(suv) for i, e in enumerate(suv): assert sw[i] == sum(e == _e for _e in uv) for i, e in enumerate(uv): assert eid_map[i] == suv.index(e) # shared ndata assert F.array_equal(sg.nodes['user'].data['h'], g.nodes['user'].data['h']) assert F.array_equal(sg.nodes['user'].data['hh'], g.nodes['user'].data['hh']) assert 'h' not in sg.nodes['game'].data # new ndata to sg sg.nodes['user'].data['hhh'] = F.tensor([0, 1, 2, 3, 4]) assert 'hhh' not in g.nodes['user'].data # share edata feat_idx = F.asnumpy(wb[('user', 'follow', 'user')]) _, indices = np.unique(feat_idx, return_index=True) assert np.array_equal(F.asnumpy(sg.edges['follow'].data['h']), F.asnumpy(g.edges['follow'].data['h'])[indices]) sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False) for ntype in g.ntypes: assert g.number_of_nodes(ntype) == sg.number_of_nodes(ntype) assert 'h' not in sg.nodes['user'].data assert 'hh' not in sg.nodes['user'].data
def _test_query(): for etype in etypes: utype, _, vtype = HG.to_canonical_etype(etype) g = HG[etype] srcs, dsts = edges[etype] for src, dst in zip(srcs, dsts): assert g.has_edge_between(src, dst) assert F.asnumpy(g.has_edges_between(srcs, dsts)).all() srcs, dsts = negative_edges[etype] for src, dst in zip(srcs, dsts): assert not g.has_edge_between(src, dst) assert not F.asnumpy(g.has_edges_between(srcs, dsts)).any() srcs, dsts = edges[etype] n_edges = len(srcs) # predecessors & in_edges & in_degree pred = [s for s, d in zip(srcs, dsts) if d == 0] assert set(F.asnumpy(g.predecessors(0)).tolist()) == set(pred) u, v = g.in_edges([0]) assert F.asnumpy(v).tolist() == [0] * len(pred) assert set(F.asnumpy(u).tolist()) == set(pred) assert g.in_degree(0) == len(pred) # successors & out_edges & out_degree succ = [d for s, d in zip(srcs, dsts) if s == 0] assert set(F.asnumpy(g.successors(0)).tolist()) == set(succ) u, v = g.out_edges([0]) assert F.asnumpy(u).tolist() == [0] * len(succ) assert set(F.asnumpy(v).tolist()) == set(succ) assert g.out_degree(0) == len(succ) # edge_id & edge_ids for i, (src, dst) in enumerate(zip(srcs, dsts)): assert g.edge_id(src, dst) == i assert F.asnumpy(g.edge_id(src, dst, force_multi=True)).tolist() == [i] assert F.asnumpy(g.edge_ids(srcs, dsts)).tolist() == list(range(n_edges)) u, v, e = g.edge_ids(srcs, dsts, force_multi=True) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts assert F.asnumpy(e).tolist() == list(range(n_edges)) # find_edges u, v = g.find_edges(list(range(n_edges))) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts # all_edges. for order in ['eid']: u, v, e = g.all_edges(form='all', order=order) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts assert F.asnumpy(e).tolist() == list(range(n_edges)) # in_degrees & out_degrees in_degrees = F.asnumpy(g.in_degrees()) out_degrees = F.asnumpy(g.out_degrees()) src_count = Counter(srcs) dst_count = Counter(dsts) for i in range(g.number_of_nodes(utype)): assert out_degrees[i] == src_count[i] for i in range(g.number_of_nodes(vtype)): assert in_degrees[i] == dst_count[i]
def _test_DGLCSVDataset_customized_data_parser(): with tempfile.TemporaryDirectory() as test_dir: # generate YAML/CSVs meta_yaml_path = os.path.join(test_dir, "meta.yaml") edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv") edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv") nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv") nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv") graph_csv_path = os.path.join(test_dir, "test_graph.csv") meta_yaml_data = {'dataset_name': 'default_name', 'node_data': [{'file_name': os.path.basename(nodes_csv_path_0), 'ntype': 'user', }, {'file_name': os.path.basename(nodes_csv_path_1), 'ntype': 'item', }], 'edge_data': [{'file_name': os.path.basename(edges_csv_path_0), 'etype': ['user', 'follow', 'user'], }, {'file_name': os.path.basename(edges_csv_path_1), 'etype': ['user', 'like', 'item'], }], 'graph_data': {'file_name': os.path.basename(graph_csv_path)} } with open(meta_yaml_path, 'w') as f: yaml.dump(meta_yaml_data, f, sort_keys=False) num_nodes = 100 num_edges = 500 num_graphs = 10 label_ndata = np.random.randint(2, size=num_nodes*num_graphs) df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]), 'label': label_ndata, 'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)]) }) df.to_csv(nodes_csv_path_0, index=False) df.to_csv(nodes_csv_path_1, index=False) label_edata = np.random.randint(2, size=num_edges*num_graphs) df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]), 'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]), 'label': label_edata, 'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)]) }) df.to_csv(edges_csv_path_0, index=False) df.to_csv(edges_csv_path_1, index=False) label_gdata = np.random.randint(2, size=num_graphs) df = pd.DataFrame({'label': label_gdata, 'graph_id': np.arange(num_graphs) }) df.to_csv(graph_csv_path, index=False) class CustDataParser: def __call__(self, df): data = {} for header in df: dt = df[header].to_numpy().squeeze() if header == 'label': dt += 2 data[header] = dt return data # load CSVDataset with customized node/edge/graph_data_parser csv_dataset = data.DGLCSVDataset( test_dir, node_data_parser={'user': CustDataParser()}, edge_data_parser={('user', 'like', 'item'): CustDataParser()}, graph_data_parser=CustDataParser()) assert len(csv_dataset) == num_graphs assert len(csv_dataset.data) == 1 assert 'label' in csv_dataset.data for i, (g, label) in enumerate(csv_dataset): assert not g.is_homogeneous assert F.asnumpy(label) == label_gdata[i] + 2 for ntype in g.ntypes: assert g.num_nodes(ntype) == num_nodes offset = 2 if ntype == 'user' else 0 assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes]+offset, F.asnumpy(g.nodes[ntype].data['label'])) for etype in g.etypes: assert g.num_edges(etype) == num_edges offset = 2 if etype == 'like' else 0 assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges]+offset, F.asnumpy(g.edges[etype].data['label']))
def check_dist_graph(g, num_clients, num_nodes, num_edges): # Test API assert g.number_of_nodes() == num_nodes assert g.number_of_edges() == num_edges # Test reading node data nids = F.arange(0, int(g.number_of_nodes() / 2)) feats1 = g.ndata['features'][nids] feats = F.squeeze(feats1, 1) assert np.all(F.asnumpy(feats == nids)) # Test reading edge data eids = F.arange(0, int(g.number_of_edges() / 2)) feats1 = g.edata['features'][eids] feats = F.squeeze(feats1, 1) assert np.all(F.asnumpy(feats == eids)) # Test init node data new_shape = (g.number_of_nodes(), 2) g.ndata['test1'] = dgl.distributed.DistTensor(new_shape, F.int32) feats = g.ndata['test1'][nids] assert np.all(F.asnumpy(feats) == 0) # reference to a one that exists test2 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2', init_func=rand_init) test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2') assert np.all(F.asnumpy(test2[nids]) == F.asnumpy(test3[nids])) # create a tensor and destroy a tensor and create it again. test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init) del test3 test3 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test3') del test3 # add tests for anonymous distributed tensor. test3 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) data = test3[0:10] test4 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) del test3 test5 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) assert np.sum(F.asnumpy(test5[0:10] != data)) > 0 # test a persistent tesnor test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init, persistent=True) del test4 try: test4 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test4') raise Exception('') except: pass # Test sparse emb try: emb = DistEmbedding(g.number_of_nodes(), 1, 'emb1', emb_init) lr = 0.001 optimizer = SparseAdagrad([emb], lr=lr) with F.record_grad(): feats = emb(nids) assert np.all(F.asnumpy(feats) == np.zeros((len(nids), 1))) loss = F.sum(feats + 1, 0) loss.backward() optimizer.step() feats = emb(nids) if num_clients == 1: assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * -lr) rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids)) feats1 = emb(rest) assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1))) policy = dgl.distributed.PartitionPolicy('node', g.get_partition_book()) grad_sum = dgl.distributed.DistTensor((g.number_of_nodes(), ), F.float32, 'emb1_sum', policy) if num_clients == 1: assert np.all( F.asnumpy(grad_sum[nids]) == np.ones((len(nids), 1)) * num_clients) assert np.all(F.asnumpy(grad_sum[rest]) == np.zeros((len(rest), 1))) emb = DistEmbedding(g.number_of_nodes(), 1, 'emb2', emb_init) with F.no_grad(): feats1 = emb(nids) assert np.all(F.asnumpy(feats1) == 0) optimizer = SparseAdagrad([emb], lr=lr) with F.record_grad(): feats1 = emb(nids) feats2 = emb(nids) feats = F.cat([feats1, feats2], 0) assert np.all(F.asnumpy(feats) == np.zeros((len(nids) * 2, 1))) loss = F.sum(feats + 1, 0) loss.backward() optimizer.step() with F.no_grad(): feats = emb(nids) if num_clients == 1: assert_almost_equal(F.asnumpy(feats), np.ones((len(nids), 1)) * math.sqrt(2) * -lr) rest = np.setdiff1d(np.arange(g.number_of_nodes()), F.asnumpy(nids)) feats1 = emb(rest) assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1))) except NotImplementedError as e: pass # Test write data new_feats = F.ones((len(nids), 2), F.int32, F.cpu()) g.ndata['test1'][nids] = new_feats feats = g.ndata['test1'][nids] assert np.all(F.asnumpy(feats) == 1) # Test metadata operations. assert len(g.ndata['features']) == g.number_of_nodes() assert g.ndata['features'].shape == (g.number_of_nodes(), 1) assert g.ndata['features'].dtype == F.int64 assert g.node_attr_schemes()['features'].dtype == F.int64 assert g.node_attr_schemes()['test1'].dtype == F.int32 assert g.node_attr_schemes()['features'].shape == (1, ) selected_nodes = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 # Test node split nodes = node_split(selected_nodes, g.get_partition_book()) nodes = F.asnumpy(nodes) # We only have one partition, so the local nodes are basically all nodes in the graph. local_nids = np.arange(g.number_of_nodes()) for n in nodes: assert n in local_nids print('end')
def test_multi_recv(): # basic recv test g = generate_graph() h = g.ndata['h'] g.register_message_func(message_func) g.register_reduce_func(reduce_func) g.register_apply_node_func(apply_node_func) expected = F.copy_to(F.zeros((g.number_of_edges(), ), dtype=F.int64), F.cpu()) # two separate round of send and recv u = [4, 5, 6] v = [9] g.send((u, v)) eid = g.edge_ids(u, v) expected = F.asnumpy(expected) eid = F.asnumpy(eid) expected[eid] = 1 assert np.array_equal(g._get_msg_index().tonumpy(), expected) g.recv(v) expected[eid] = 0 assert np.array_equal(g._get_msg_index().tonumpy(), expected) u = [0] v = [1, 2, 3] g.send((u, v)) eid = g.edge_ids(u, v) eid = F.asnumpy(eid) expected[eid] = 1 assert np.array_equal(g._get_msg_index().tonumpy(), expected) g.recv(v) expected[eid] = 0 assert np.array_equal(g._get_msg_index().tonumpy(), expected) h1 = g.ndata['h'] # one send, two recv g.ndata['h'] = h u = F.tensor([0, 0, 0, 4, 5, 6]) v = F.tensor([1, 2, 3, 9, 9, 9]) g.send((u, v)) eid = g.edge_ids(u, v) eid = F.asnumpy(eid) expected[eid] = 1 assert np.array_equal(g._get_msg_index().tonumpy(), expected) u = [4, 5, 6] v = [9] g.recv(v) eid = g.edge_ids(u, v) eid = F.asnumpy(eid) expected[eid] = 0 assert np.array_equal(g._get_msg_index().tonumpy(), expected) u = [0] v = [1, 2, 3] g.recv(v) eid = g.edge_ids(u, v) eid = F.asnumpy(eid) expected[eid] = 0 assert np.array_equal(g._get_msg_index().tonumpy(), expected) h2 = g.ndata['h'] assert F.allclose(h1, h2)
def _test_DGLCSVDataset_multiple(): with tempfile.TemporaryDirectory() as test_dir: # generate YAML/CSVs meta_yaml_path = os.path.join(test_dir, "meta.yaml") edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv") edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv") nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv") nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv") graph_csv_path = os.path.join(test_dir, "test_graph.csv") meta_yaml_data = {'version': '1.0.0', 'dataset_name': 'default_name', 'node_data': [{'file_name': os.path.basename(nodes_csv_path_0), 'ntype': 'user', }, {'file_name': os.path.basename(nodes_csv_path_1), 'ntype': 'item', }], 'edge_data': [{'file_name': os.path.basename(edges_csv_path_0), 'etype': ['user', 'follow', 'user'], }, {'file_name': os.path.basename(edges_csv_path_1), 'etype': ['user', 'like', 'item'], }], 'graph_data': {'file_name': os.path.basename(graph_csv_path)} } with open(meta_yaml_path, 'w') as f: yaml.dump(meta_yaml_data, f, sort_keys=False) num_nodes = 100 num_edges = 500 num_graphs = 10 num_dims = 3 feat_ndata = np.random.rand(num_nodes*num_graphs, num_dims) label_ndata = np.random.randint(2, size=num_nodes*num_graphs) df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]), 'label': label_ndata, 'feat': [line.tolist() for line in feat_ndata], 'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)]) }) df.to_csv(nodes_csv_path_0, index=False) df.to_csv(nodes_csv_path_1, index=False) feat_edata = np.random.rand(num_edges*num_graphs, num_dims) label_edata = np.random.randint(2, size=num_edges*num_graphs) df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]), 'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]), 'label': label_edata, 'feat': [line.tolist() for line in feat_edata], 'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)]) }) df.to_csv(edges_csv_path_0, index=False) df.to_csv(edges_csv_path_1, index=False) feat_gdata = np.random.rand(num_graphs, num_dims) label_gdata = np.random.randint(2, size=num_graphs) df = pd.DataFrame({'label': label_gdata, 'feat': [line.tolist() for line in feat_gdata], 'graph_id': np.arange(num_graphs) }) df.to_csv(graph_csv_path, index=False) # load CSVDataset with default node/edge/graph_data_parser for force_reload in [True, False]: if not force_reload: # remove original node data file to verify reload from cached files os.remove(nodes_csv_path_0) assert not os.path.exists(nodes_csv_path_0) csv_dataset = data.DGLCSVDataset( test_dir, force_reload=force_reload) assert len(csv_dataset) == num_graphs assert csv_dataset.has_cache() assert len(csv_dataset.data) == 2 assert 'feat' in csv_dataset.data assert 'label' in csv_dataset.data assert F.array_equal(F.tensor(feat_gdata), csv_dataset.data['feat']) for i, (g, label) in enumerate(csv_dataset): assert not g.is_homogeneous assert F.asnumpy(label) == label_gdata[i] for ntype in g.ntypes: assert g.num_nodes(ntype) == num_nodes assert F.array_equal(F.tensor(feat_ndata[i*num_nodes:(i+1)*num_nodes]), g.nodes[ntype].data['feat']) assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes], F.asnumpy(g.nodes[ntype].data['label'])) for etype in g.etypes: assert g.num_edges(etype) == num_edges assert F.array_equal(F.tensor(feat_edata[i*num_edges:(i+1)*num_edges]), g.edges[etype].data['feat']) assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges], F.asnumpy(g.edges[etype].data['label']))
def test_random_walk(): g1 = dgl.heterograph({ ('user', 'follow', 'user'): ([0, 1, 2], [1, 2, 0]) }) g2 = dgl.heterograph({ ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0]) }) g3 = dgl.heterograph({ ('user', 'follow', 'user'): ([0, 1, 2], [1, 2, 0]), ('user', 'view', 'item'): ([0, 1, 2], [0, 1, 2]), ('item', 'viewed-by', 'user'): ([0, 1, 2], [0, 1, 2])}) g4 = dgl.heterograph({ ('user', 'follow', 'user'): ([0, 1, 1, 2, 3], [1, 2, 3, 0, 0]), ('user', 'view', 'item'): ([0, 0, 1, 2, 3, 3], [0, 1, 1, 2, 2, 1]), ('item', 'viewed-by', 'user'): ([0, 1, 1, 2, 2, 1], [0, 0, 1, 2, 3, 3])}) g2.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32) g2.edata['p2'] = F.tensor([[3], [0], [3], [3], [3]], dtype=F.float32) g4.edges['follow'].data['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32) g4.edges['viewed-by'].data['p'] = F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32) traces, eids, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4, return_eids=True) check_random_walk(g1, ['follow'] * 4, traces, ntypes, trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=0., return_eids=True) check_random_walk(g1, ['follow'] * 4, traces, ntypes, trace_eids=eids) traces, ntypes = dgl.sampling.random_walk( g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=F.zeros((4,), F.float32, F.cpu())) check_random_walk(g1, ['follow'] * 4, traces, ntypes) traces, ntypes = dgl.sampling.random_walk( g1, [0, 1, 2, 0, 1, 2], length=5, restart_prob=F.tensor([0, 0, 0, 0, 1], dtype=F.float32)) check_random_walk( g1, ['follow'] * 4, F.slice_axis(traces, 1, 0, 5), F.slice_axis(ntypes, 0, 0, 5)) assert (F.asnumpy(traces)[:, 5] == -1).all() traces, eids, ntypes = dgl.sampling.random_walk( g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, return_eids=True) check_random_walk(g2, ['follow'] * 4, traces, ntypes, trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk( g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p', return_eids=True) check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p', trace_eids=eids) try: traces, ntypes = dgl.sampling.random_walk( g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p2') fail = False except dgl.DGLError: fail = True assert fail metapath = ['follow', 'view', 'viewed-by'] * 2 traces, eids, ntypes = dgl.sampling.random_walk( g3, [0, 1, 2, 0, 1, 2], metapath=metapath, return_eids=True) check_random_walk(g3, metapath, traces, ntypes, trace_eids=eids) metapath = ['follow', 'view', 'viewed-by'] * 2 traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, return_eids=True) check_random_walk(g4, metapath, traces, ntypes, trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 0, 1, 2], metapath=metapath, return_eids=True) check_random_walk(g4, metapath, traces, ntypes, trace_eids=eids) metapath = ['follow', 'view', 'viewed-by'] * 2 traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', return_eids=True) check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', restart_prob=0., return_eids=True) check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', restart_prob=F.zeros((6,), F.float32, F.cpu()), return_eids=True) check_random_walk(g4, metapath, traces, ntypes, 'p', trace_eids=eids) traces, eids, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath + ['follow'], prob='p', restart_prob=F.tensor([0, 0, 0, 0, 0, 0, 1], F.float32), return_eids=True) check_random_walk(g4, metapath, traces[:, :7], ntypes[:7], 'p', trace_eids=eids) assert (F.asnumpy(traces[:, 7]) == -1).all()
def _test_one(g): assert g.number_of_nodes() == 10 assert g.number_of_edges() == 20 assert len(g) == 10 assert not g.is_multigraph for i in range(10): assert g.has_node(i) assert i in g assert not g.has_node(11) assert not 11 in g assert F.allclose(g.has_nodes([0, 2, 10, 11]), F.tensor([1, 1, 0, 0])) src, dst = edge_pair_input() for u, v in zip(src, dst): assert g.has_edge_between(u, v) assert not g.has_edge_between(0, 0) assert F.allclose(g.has_edges_between([0, 0, 3], [0, 9, 8]), F.tensor([0, 1, 1])) assert set(F.asnumpy(g.predecessors(9))) == set([0, 5, 7, 4]) assert set(F.asnumpy(g.successors(2))) == set([7, 3]) assert g.edge_id(4, 4) == 5 assert F.allclose(g.edge_ids([4, 0], [4, 9]), F.tensor([5, 0])) src, dst = g.find_edges([3, 6, 5]) assert F.allclose(src, F.tensor([5, 7, 4])) assert F.allclose(dst, F.tensor([9, 9, 4])) src, dst, eid = g.in_edges(9, form='all') tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set([(0, 9, 0), (5, 9, 3), (7, 9, 6), (4, 9, 7)]) src, dst, eid = g.in_edges([9, 0, 8], form='all') # test node#0 has no in edges tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set([(0, 9, 0), (5, 9, 3), (7, 9, 6), (4, 9, 7), (3, 8, 9), (7, 8, 12)]) src, dst, eid = g.out_edges(0, form='all') tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set([(0, 9, 0), (0, 6, 1), (0, 4, 4)]) src, dst, eid = g.out_edges([0, 4, 8], form='all') # test node#8 has no out edges tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set([(0, 9, 0), (0, 6, 1), (0, 4, 4), (4, 3, 2), (4, 4, 5), (4, 9, 7), (4, 1, 8)]) src, dst, eid = g.edges('all', 'eid') t_src, t_dst = edge_pair_input() t_tup = list(zip(t_src, t_dst, list(range(20)))) tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set(t_tup) assert list(F.asnumpy(eid)) == list(range(20)) src, dst, eid = g.edges('all', 'srcdst') t_src, t_dst = edge_pair_input() t_tup = list(zip(t_src, t_dst, list(range(20)))) tup = list(zip(F.asnumpy(src), F.asnumpy(dst), F.asnumpy(eid))) assert set(tup) == set(t_tup) assert list(F.asnumpy(src)) == sorted(list(F.asnumpy(src))) assert g.in_degree(0) == 0 assert g.in_degree(9) == 4 assert F.allclose(g.in_degrees([0, 9]), F.tensor([0, 4])) assert g.out_degree(8) == 0 assert g.out_degree(9) == 1 assert F.allclose(g.out_degrees([8, 9]), F.tensor([0, 1])) assert np.array_equal( F.sparse_to_numpy(g.adjacency_matrix(transpose=False)), scipy_coo_input().toarray().T) assert np.array_equal( F.sparse_to_numpy(g.adjacency_matrix(transpose=True)), scipy_coo_input().toarray())
def verify_hetero_graph(g, parts): num_nodes = {ntype: 0 for ntype in g.ntypes} num_edges = {etype: 0 for etype in g.etypes} for part in parts: assert len(g.ntypes) == len(F.unique(part.ndata[dgl.NTYPE])) assert len(g.etypes) == len(F.unique(part.edata[dgl.ETYPE])) for ntype in g.ntypes: ntype_id = g.get_ntype_id(ntype) inner_node_mask = _get_inner_node_mask(part, ntype_id) num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0) num_nodes[ntype] += num_inner_nodes for etype in g.etypes: etype_id = g.get_etype_id(etype) inner_edge_mask = _get_inner_edge_mask(part, etype_id) num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0) num_edges[etype] += num_inner_edges # Verify the number of nodes are correct. for ntype in g.ntypes: print('node {}: {}, {}'.format(ntype, g.number_of_nodes(ntype), num_nodes[ntype])) assert g.number_of_nodes(ntype) == num_nodes[ntype] # Verify the number of edges are correct. for etype in g.etypes: print('edge {}: {}, {}'.format(etype, g.number_of_edges(etype), num_edges[etype])) assert g.number_of_edges(etype) == num_edges[etype] nids = {ntype: [] for ntype in g.ntypes} eids = {etype: [] for etype in g.etypes} for part in parts: src, dst, eid = part.edges(form='all') orig_src = F.gather_row(part.ndata['orig_id'], src) orig_dst = F.gather_row(part.ndata['orig_id'], dst) orig_eid = F.gather_row(part.edata['orig_id'], eid) etype_arr = F.gather_row(part.edata[dgl.ETYPE], eid) eid_type = F.gather_row(part.edata[dgl.EID], eid) for etype in g.etypes: etype_id = g.get_etype_id(etype) src1 = F.boolean_mask(orig_src, etype_arr == etype_id) dst1 = F.boolean_mask(orig_dst, etype_arr == etype_id) eid1 = F.boolean_mask(orig_eid, etype_arr == etype_id) exist = g.has_edges_between(src1, dst1, etype=etype) assert np.all(F.asnumpy(exist)) eid2 = g.edge_ids(src1, dst1, etype=etype) assert np.all(F.asnumpy(eid1 == eid2)) eids[etype].append(F.boolean_mask(eid_type, etype_arr == etype_id)) # Make sure edge Ids fall into a range. inner_edge_mask = _get_inner_edge_mask(part, etype_id) inner_eids = np.sort( F.asnumpy(F.boolean_mask(part.edata[dgl.EID], inner_edge_mask))) assert np.all( inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1)) for ntype in g.ntypes: ntype_id = g.get_ntype_id(ntype) # Make sure inner nodes have Ids fall into a range. inner_node_mask = _get_inner_node_mask(part, ntype_id) inner_nids = F.boolean_mask(part.ndata[dgl.NID], inner_node_mask) assert np.all( F.asnumpy( inner_nids == F.arange(F.as_scalar(inner_nids[0]), F.as_scalar(inner_nids[-1]) + 1))) nids[ntype].append(inner_nids) for ntype in nids: nids_type = F.cat(nids[ntype], 0) uniq_ids = F.unique(nids_type) # We should get all nodes. assert len(uniq_ids) == g.number_of_nodes(ntype) for etype in eids: eids_type = F.cat(eids[etype], 0) uniq_ids = F.unique(eids_type) assert len(uniq_ids) == g.number_of_edges(etype)
def check_metis_partition(g, extra_hops): subgs = dgl.transform.metis_partition(g, 4, extra_cached_hops=extra_hops) num_inner_nodes = 0 num_inner_edges = 0 if subgs is not None: for part_id, subg in subgs.items(): lnode_ids = np.nonzero(F.asnumpy(subg.ndata['inner_node']))[0] ledge_ids = np.nonzero(F.asnumpy(subg.edata['inner_edge']))[0] num_inner_nodes += len(lnode_ids) num_inner_edges += len(ledge_ids) assert np.sum( F.asnumpy(subg.ndata['part_id']) == part_id) == len(lnode_ids) assert num_inner_nodes == g.number_of_nodes() print(g.number_of_edges() - num_inner_edges) if extra_hops == 0: return # partitions with node reshuffling subgs = dgl.transform.metis_partition(g, 4, extra_cached_hops=extra_hops, reshuffle=True) num_inner_nodes = 0 num_inner_edges = 0 edge_cnts = np.zeros((g.number_of_edges(), )) if subgs is not None: for part_id, subg in subgs.items(): lnode_ids = np.nonzero(F.asnumpy(subg.ndata['inner_node']))[0] ledge_ids = np.nonzero(F.asnumpy(subg.edata['inner_edge']))[0] num_inner_nodes += len(lnode_ids) num_inner_edges += len(ledge_ids) assert np.sum( F.asnumpy(subg.ndata['part_id']) == part_id) == len(lnode_ids) nids = F.asnumpy(subg.ndata[dgl.NID]) # ensure the local node Ids are contiguous. parent_ids = F.asnumpy(subg.ndata[dgl.NID]) parent_ids = parent_ids[:len(lnode_ids)] assert np.all( parent_ids == np.arange(parent_ids[0], parent_ids[-1] + 1)) # count the local edges. parent_ids = F.asnumpy(subg.edata[dgl.EID])[ledge_ids] edge_cnts[parent_ids] += 1 orig_ids = subg.ndata['orig_id'] inner_node = F.asnumpy(subg.ndata['inner_node']) for nid in range(subg.number_of_nodes()): neighs = subg.predecessors(nid) old_neighs1 = F.gather_row(orig_ids, neighs) old_nid = F.asnumpy(orig_ids[nid]) old_neighs2 = g.predecessors(old_nid) # If this is an inner node, it should have the full neighborhood. if inner_node[nid]: assert np.all( np.sort(F.asnumpy(old_neighs1)) == np.sort( F.asnumpy(old_neighs2))) # Normally, local edges are only counted once. assert np.all(edge_cnts == 1) assert num_inner_nodes == g.number_of_nodes() print(g.number_of_edges() - num_inner_edges)
def test_random_walk(): g1 = dgl.heterograph({ ('user', 'follow', 'user'): [(0, 1), (1, 2), (2, 0)] }) g2 = dgl.heterograph({ ('user', 'follow', 'user'): [(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)] }) g3 = dgl.heterograph({ ('user', 'follow', 'user'): [(0, 1), (1, 2), (2, 0)], ('user', 'view', 'item'): [(0, 0), (1, 1), (2, 2)], ('item', 'viewed-by', 'user'): [(0, 0), (1, 1), (2, 2)] }) g4 = dgl.heterograph({ ('user', 'follow', 'user'): [(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)], ('user', 'view', 'item'): [(0, 0), (0, 1), (1, 1), (2, 2), (3, 2), (3, 1)], ('item', 'viewed-by', 'user'): [(0, 0), (1, 0), (1, 1), (2, 2), (2, 3), (1, 3)] }) g2.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32) g4.edges['follow'].data['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32) g4.edges['viewed-by'].data['p'] = F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32) traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4) check_random_walk(g1, ['follow'] * 4, traces, ntypes) traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=0.) check_random_walk(g1, ['follow'] * 4, traces, ntypes) traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=F.zeros((4, ), F.float32, F.cpu())) check_random_walk(g1, ['follow'] * 4, traces, ntypes) traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=5, restart_prob=F.tensor( [0, 0, 0, 0, 1], dtype=F.float32)) check_random_walk(g1, ['follow'] * 4, F.slice_axis(traces, 1, 0, 5), F.slice_axis(ntypes, 0, 0, 5)) assert (F.asnumpy(traces)[:, 5] == -1).all() traces, ntypes = dgl.sampling.random_walk(g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4) check_random_walk(g2, ['follow'] * 4, traces, ntypes) traces, ntypes = dgl.sampling.random_walk(g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p') check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p') metapath = ['follow', 'view', 'viewed-by'] * 2 traces, ntypes = dgl.sampling.random_walk(g3, [0, 1, 2, 0, 1, 2], metapath=metapath) check_random_walk(g3, metapath, traces, ntypes) metapath = ['follow', 'view', 'viewed-by'] * 2 traces, ntypes = dgl.sampling.random_walk(g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath) check_random_walk(g4, metapath, traces, ntypes) metapath = ['follow', 'view', 'viewed-by'] * 2 traces, ntypes = dgl.sampling.random_walk(g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p') check_random_walk(g4, metapath, traces, ntypes, 'p') traces, ntypes = dgl.sampling.random_walk(g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', restart_prob=0.) check_random_walk(g4, metapath, traces, ntypes, 'p') traces, ntypes = dgl.sampling.random_walk(g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', restart_prob=F.zeros((6, ), F.float32, F.cpu())) check_random_walk(g4, metapath, traces, ntypes, 'p') traces, ntypes = dgl.sampling.random_walk( g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath + ['follow'], prob='p', restart_prob=F.tensor([0, 0, 0, 0, 0, 0, 1], F.float32)) check_random_walk(g4, metapath, traces[:, :7], ntypes[:7], 'p') assert (F.asnumpy(traces[:, 7]) == -1).all()
def test_compact(index_dtype): g1 = dgl.heterograph( { ('user', 'follow', 'user'): [(1, 3), (3, 5)], ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)], ('game', 'wished-by', 'user'): [(6, 7), (5, 7)] }, { 'user': 20, 'game': 10 }, index_dtype=index_dtype) g2 = dgl.heterograph( { ('game', 'clicked-by', 'user'): [(3, 1)], ('user', 'likes', 'user'): [(1, 8), (8, 9)] }, { 'user': 20, 'game': 10 }, index_dtype=index_dtype) g3 = dgl.graph([(0, 1), (1, 2)], num_nodes=10, ntype='user', index_dtype=index_dtype) g4 = dgl.graph([(1, 3), (3, 5)], num_nodes=10, ntype='user', index_dtype=index_dtype) def _check(g, new_g, induced_nodes): assert g.ntypes == new_g.ntypes assert g.canonical_etypes == new_g.canonical_etypes for ntype in g.ntypes: assert -1 not in induced_nodes[ntype] for etype in g.canonical_etypes: g_src, g_dst = g.all_edges(order='eid', etype=etype) g_src = F.asnumpy(g_src) g_dst = F.asnumpy(g_dst) new_g_src, new_g_dst = new_g.all_edges(order='eid', etype=etype) new_g_src_mapped = induced_nodes[etype[0]][F.asnumpy(new_g_src)] new_g_dst_mapped = induced_nodes[etype[2]][F.asnumpy(new_g_dst)] assert (g_src == new_g_src_mapped).all() assert (g_dst == new_g_dst_mapped).all() # Test default new_g1 = dgl.compact_graphs(g1) induced_nodes = { ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert new_g1._idtype_str == index_dtype assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7]) assert set(induced_nodes['game']) == set([4, 5, 6]) _check(g1, new_g1, induced_nodes) # Test with always_preserve given a dict new_g1 = dgl.compact_graphs(g1, always_preserve={ 'game': F.tensor([4, 7], dtype=getattr(F, index_dtype)) }) assert new_g1._idtype_str == index_dtype induced_nodes = { ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7]) assert set(induced_nodes['game']) == set([4, 5, 6, 7]) _check(g1, new_g1, induced_nodes) # Test with always_preserve given a tensor new_g3 = dgl.compact_graphs(g3, always_preserve=F.tensor([1, 7], dtype=getattr( F, index_dtype))) induced_nodes = { ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert new_g3._idtype_str == index_dtype assert set(induced_nodes['user']) == set([0, 1, 2, 7]) _check(g3, new_g3, induced_nodes) # Test multiple graphs new_g1, new_g2 = dgl.compact_graphs([g1, g2]) induced_nodes = { ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert new_g1._idtype_str == index_dtype assert new_g2._idtype_str == index_dtype assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9]) assert set(induced_nodes['game']) == set([3, 4, 5, 6]) _check(g1, new_g1, induced_nodes) _check(g2, new_g2, induced_nodes) # Test multiple graphs with always_preserve given a dict new_g1, new_g2 = dgl.compact_graphs([g1, g2], always_preserve={ 'game': F.tensor([4, 7], dtype=getattr( F, index_dtype)) }) induced_nodes = { ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert new_g1._idtype_str == index_dtype assert new_g2._idtype_str == index_dtype assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9]) assert set(induced_nodes['game']) == set([3, 4, 5, 6, 7]) _check(g1, new_g1, induced_nodes) _check(g2, new_g2, induced_nodes) # Test multiple graphs with always_preserve given a tensor new_g3, new_g4 = dgl.compact_graphs([g3, g4], always_preserve=F.tensor( [1, 7], dtype=getattr(F, index_dtype))) induced_nodes = { ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes } induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()} assert new_g3._idtype_str == index_dtype assert new_g4._idtype_str == index_dtype assert set(induced_nodes['user']) == set([0, 1, 2, 3, 5, 7]) _check(g3, new_g3, induced_nodes) _check(g4, new_g4, induced_nodes)
def start_edge_dataloader(rank, tmpdir, num_server, num_workers, orig_nid, orig_eid, groundtruth_g): import dgl import torch as th dgl.distributed.initialize("mp_ip_config.txt") gpb = None disable_shared_mem = num_server > 1 if disable_shared_mem: _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank) num_edges_to_sample = 202 batch_size = 32 dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json') assert len(dist_graph.ntypes) == len(groundtruth_g.ntypes) assert len(dist_graph.etypes) == len(groundtruth_g.etypes) if len(dist_graph.etypes) == 1: train_eid = th.arange(num_edges_to_sample) else: train_eid = {dist_graph.etypes[0]: th.arange(num_edges_to_sample)} for i in range(num_server): part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i) # Create sampler sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10]) # We need to test creating DistDataLoader multiple times. for i in range(2): # Create DataLoader for constructing blocks dataloader = dgl.dataloading.EdgeDataLoader(dist_graph, train_eid, sampler, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=num_workers) for epoch in range(2): for idx, (input_nodes, pos_pair_graph, blocks) in zip(range(0, num_edges_to_sample, batch_size), dataloader): block = blocks[-1] for src_type, etype, dst_type in block.canonical_etypes: o_src, o_dst = block.edges(etype=etype) src_nodes_id = block.srcnodes[src_type].data[ dgl.NID][o_src] dst_nodes_id = block.dstnodes[dst_type].data[ dgl.NID][o_dst] src_nodes_id = orig_nid[src_type][src_nodes_id] dst_nodes_id = orig_nid[dst_type][dst_nodes_id] has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id, etype=etype) assert np.all(F.asnumpy(has_edges)) assert np.all( F.asnumpy(block.dstnodes[dst_type].data[dgl.NID]) == F. asnumpy(pos_pair_graph.nodes[dst_type].data[dgl.NID])) # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size)) del dataloader dgl.distributed.exit_client( ) # this is needed since there's two test here in one process
def test_query(): g = create_test_heterograph() ntypes = ['user', 'game', 'developer'] canonical_etypes = [('user', 'follows', 'user'), ('user', 'plays', 'game'), ('user', 'wishes', 'game'), ('developer', 'develops', 'game')] etypes = ['follows', 'plays', 'wishes', 'develops'] # node & edge types assert set(ntypes) == set(g.ntypes) assert set(etypes) == set(g.etypes) assert set(canonical_etypes) == set(g.canonical_etypes) # metagraph mg = g.metagraph assert set(g.ntypes) == set(mg.nodes) etype_triplets = [(u, v, e) for u, v, e in mg.edges(keys=True)] assert set([('user', 'user', 'follows'), ('user', 'game', 'plays'), ('user', 'game', 'wishes'), ('developer', 'game', 'develops')]) == set(etype_triplets) for i in range(len(etypes)): assert g.to_canonical_etype(etypes[i]) == canonical_etypes[i] # number of nodes assert [g.number_of_nodes(ntype) for ntype in ntypes] == [3, 2, 2] # number of edges assert [g.number_of_edges(etype) for etype in etypes] == [2, 4, 2, 2] assert not g.is_multigraph assert g.is_readonly # has_node & has_nodes for ntype in ntypes: n = g.number_of_nodes(ntype) for i in range(n): assert g.has_node(i, ntype) assert not g.has_node(n, ntype) assert np.array_equal( F.asnumpy(g.has_nodes([0, n], ntype)).astype('int32'), [1, 0]) def _test(g): for etype in etypes: srcs, dsts = edges[etype] for src, dst in zip(srcs, dsts): assert g.has_edge_between(src, dst, etype) assert F.asnumpy(g.has_edges_between(srcs, dsts, etype)).all() srcs, dsts = negative_edges[etype] for src, dst in zip(srcs, dsts): assert not g.has_edge_between(src, dst, etype) assert not F.asnumpy(g.has_edges_between(srcs, dsts, etype)).any() srcs, dsts = edges[etype] n_edges = len(srcs) # predecessors & in_edges & in_degree pred = [s for s, d in zip(srcs, dsts) if d == 0] assert set(F.asnumpy(g.predecessors(0, etype)).tolist()) == set(pred) u, v = g.in_edges([0], etype=etype) assert F.asnumpy(v).tolist() == [0] * len(pred) assert set(F.asnumpy(u).tolist()) == set(pred) assert g.in_degree(0, etype) == len(pred) # successors & out_edges & out_degree succ = [d for s, d in zip(srcs, dsts) if s == 0] assert set(F.asnumpy(g.successors(0, etype)).tolist()) == set(succ) u, v = g.out_edges([0], etype=etype) assert F.asnumpy(u).tolist() == [0] * len(succ) assert set(F.asnumpy(v).tolist()) == set(succ) assert g.out_degree(0, etype) == len(succ) # edge_id & edge_ids for i, (src, dst) in enumerate(zip(srcs, dsts)): assert g.edge_id(src, dst, etype=etype) == i assert F.asnumpy( g.edge_id(src, dst, etype=etype, force_multi=True)).tolist() == [i] assert F.asnumpy(g.edge_ids(srcs, dsts, etype=etype)).tolist() == list( range(n_edges)) u, v, e = g.edge_ids(srcs, dsts, etype=etype, force_multi=True) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts assert F.asnumpy(e).tolist() == list(range(n_edges)) # find_edges u, v = g.find_edges(list(range(n_edges)), etype) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts # all_edges. for order in ['eid']: u, v, e = g.all_edges('all', order, etype) assert F.asnumpy(u).tolist() == srcs assert F.asnumpy(v).tolist() == dsts assert F.asnumpy(e).tolist() == list(range(n_edges)) # in_degrees & out_degrees in_degrees = F.asnumpy(g.in_degrees(etype=etype)) out_degrees = F.asnumpy(g.out_degrees(etype=etype)) src_count = Counter(srcs) dst_count = Counter(dsts) utype, _, vtype = g.to_canonical_etype(etype) for i in range(g.number_of_nodes(utype)): assert out_degrees[i] == src_count[i] for i in range(g.number_of_nodes(vtype)): assert in_degrees[i] == dst_count[i] edges = { 'follows': ([0, 1], [1, 2]), 'plays': ([0, 1, 2, 1], [0, 0, 1, 1]), 'wishes': ([0, 2], [1, 0]), 'develops': ([0, 1], [0, 1]), } # edges that does not exist in the graph negative_edges = { 'follows': ([0, 1], [0, 1]), 'plays': ([0, 2], [1, 0]), 'wishes': ([0, 1], [0, 1]), 'develops': ([0, 1], [1, 0]), } g = create_test_heterograph() _test(g) g = create_test_heterograph1() _test(g) etypes = canonical_etypes edges = { ('user', 'follows', 'user'): ([0, 1], [1, 2]), ('user', 'plays', 'game'): ([0, 1, 2, 1], [0, 0, 1, 1]), ('user', 'wishes', 'game'): ([0, 2], [1, 0]), ('developer', 'develops', 'game'): ([0, 1], [0, 1]), } # edges that does not exist in the graph negative_edges = { ('user', 'follows', 'user'): ([0, 1], [0, 1]), ('user', 'plays', 'game'): ([0, 2], [1, 0]), ('user', 'wishes', 'game'): ([0, 1], [0, 1]), ('developer', 'develops', 'game'): ([0, 1], [1, 0]), } g = create_test_heterograph() _test(g) g = create_test_heterograph1() _test(g) # test repr print(g)
def start_dist_dataloader(rank, tmpdir, num_server, drop_last, orig_nid, orig_eid): import dgl import torch as th dgl.distributed.initialize("mp_ip_config.txt") gpb = None disable_shared_mem = num_server > 0 if disable_shared_mem: _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank) num_nodes_to_sample = 202 batch_size = 32 train_nid = th.arange(num_nodes_to_sample) dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json') for i in range(num_server): part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i) # Create sampler sampler = NeighborSampler(dist_graph, [5, 10], dgl.distributed.sample_neighbors) # We need to test creating DistDataLoader multiple times. for i in range(2): # Create DataLoader for constructing blocks dataloader = DistDataLoader(dataset=train_nid.numpy(), batch_size=batch_size, collate_fn=sampler.sample_blocks, shuffle=False, drop_last=drop_last) groundtruth_g = CitationGraphDataset("cora")[0] max_nid = [] for epoch in range(2): for idx, blocks in zip(range(0, num_nodes_to_sample, batch_size), dataloader): block = blocks[-1] o_src, o_dst = block.edges() src_nodes_id = block.srcdata[dgl.NID][o_src] dst_nodes_id = block.dstdata[dgl.NID][o_dst] max_nid.append(np.max(F.asnumpy(dst_nodes_id))) src_nodes_id = orig_nid[src_nodes_id] dst_nodes_id = orig_nid[dst_nodes_id] has_edges = groundtruth_g.has_edges_between( src_nodes_id, dst_nodes_id) assert np.all(F.asnumpy(has_edges)) # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size)) if drop_last: assert np.max( max_nid ) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size else: assert np.max(max_nid) == num_nodes_to_sample - 1 del dataloader dgl.distributed.exit_client( ) # this is needed since there's two test here in one process
def test_flatten(): def check_mapping(g, fg): if len(fg.ntypes) == 1: SRC = DST = fg.ntypes[0] else: SRC = fg.ntypes[0] DST = fg.ntypes[1] etypes = F.asnumpy(fg.edata[dgl.ETYPE]).tolist() eids = F.asnumpy(fg.edata[dgl.EID]).tolist() for i, (etype, eid) in enumerate(zip(etypes, eids)): src_g, dst_g = g.find_edges([eid], g.canonical_etypes[etype]) src_fg, dst_fg = fg.find_edges([i]) # TODO(gq): I feel this code is quite redundant; can we just add new members (like # "induced_srcid") to returned heterograph object and not store them as features? assert src_g == fg.nodes[SRC].data[dgl.NID][src_fg] tid = F.asnumpy(fg.nodes[SRC].data[dgl.NTYPE][src_fg])[0] assert g.canonical_etypes[etype][0] == g.ntypes[tid] assert dst_g == fg.nodes[DST].data[dgl.NID][dst_fg] tid = F.asnumpy(fg.nodes[DST].data[dgl.NTYPE][dst_fg])[0] assert g.canonical_etypes[etype][2] == g.ntypes[tid] # check for wildcard slices g = create_test_heterograph() g.nodes['user'].data['h'] = F.ones((3, 5)) g.nodes['game'].data['i'] = F.ones((2, 5)) g.edges['plays'].data['e'] = F.ones((4, 4)) g.edges['wishes'].data['e'] = F.ones((2, 4)) g.edges['wishes'].data['f'] = F.ones((2, 4)) fg = g['user', :, 'game'] # user--plays->game and user--wishes->game assert len(fg.ntypes) == 2 assert fg.ntypes == ['user', 'game'] assert fg.etypes == ['plays+wishes'] assert F.array_equal(fg.nodes['user'].data['h'], F.ones((3, 5))) assert F.array_equal(fg.nodes['game'].data['i'], F.ones((2, 5))) assert F.array_equal(fg.edata['e'], F.ones((6, 4))) assert 'f' not in fg.edata etypes = F.asnumpy(fg.edata[dgl.ETYPE]).tolist() eids = F.asnumpy(fg.edata[dgl.EID]).tolist() assert set(zip(etypes, eids)) == set([(1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1)]) check_mapping(g, fg) fg = g['user', :, 'user'] # NOTE(gq): The node/edge types from the parent graph is returned if there is only one # node/edge type. This differs from the behavior above. assert fg.ntypes == ['user'] assert fg.etypes == ['follows'] u1, v1 = g.edges(etype='follows', order='eid') u2, v2 = fg.edges(etype='follows', order='eid') assert F.array_equal(u1, u2) assert F.array_equal(v1, v2) fg = g['developer', :, 'game'] assert fg.ntypes == ['developer', 'game'] assert fg.etypes == ['develops'] u1, v1 = g.edges(etype='develops', order='eid') u2, v2 = fg.edges(etype='develops', order='eid') assert F.array_equal(u1, u2) assert F.array_equal(v1, v2) fg = g[:, :, :] assert fg.ntypes == ['developer+user', 'game+user'] assert fg.etypes == ['develops+follows+plays+wishes'] check_mapping(g, fg) # Test another heterograph g_x = dgl.graph(([0, 1, 2], [1, 2, 3]), 'user', 'follows') g_y = dgl.graph(([0, 2], [2, 3]), 'user', 'knows') g_x.nodes['user'].data['h'] = F.randn((4, 3)) g_x.edges['follows'].data['w'] = F.randn((3, 2)) g_y.nodes['user'].data['hh'] = F.randn((4, 5)) g_y.edges['knows'].data['ww'] = F.randn((2, 10)) g = dgl.hetero_from_relations([g_x, g_y]) assert F.array_equal(g.ndata['h'], g_x.ndata['h']) assert F.array_equal(g.ndata['hh'], g_y.ndata['hh']) assert F.array_equal(g.edges['follows'].data['w'], g_x.edata['w']) assert F.array_equal(g.edges['knows'].data['ww'], g_y.edata['ww']) fg = g['user', :, 'user'] assert fg.ntypes == ['user'] assert fg.etypes == ['follows+knows'] check_mapping(g, fg) fg = g['user', :, :] assert fg.ntypes == ['user'] assert fg.etypes == ['follows+knows'] check_mapping(g, fg)
def test_batching_batched(idtype): """Test batching a DGLHeteroGraph and a BatchedDGLHeteroGraph.""" g1 = dgl.heterograph( { ('user', 'follows', 'user'): ([0, 1], [1, 2]), ('user', 'plays', 'game'): ([0, 1], [0, 0]) }, idtype=idtype, device=F.ctx()) g2 = dgl.heterograph( { ('user', 'follows', 'user'): ([0, 1], [1, 2]), ('user', 'plays', 'game'): ([0, 1], [0, 0]) }, idtype=idtype, device=F.ctx()) bg1 = dgl.batch([g1, g2]) g3 = dgl.heterograph( { ('user', 'follows', 'user'): ([0], [1]), ('user', 'plays', 'game'): ([1], [0]) }, idtype=idtype, device=F.ctx()) bg2 = dgl.batch([bg1, g3]) assert bg2.idtype == idtype assert bg2.device == F.ctx() assert bg2.ntypes == g3.ntypes assert bg2.etypes == g3.etypes assert bg2.canonical_etypes == g3.canonical_etypes assert bg2.batch_size == 3 # Test number of nodes for ntype in bg2.ntypes: assert F.asnumpy(bg2.batch_num_nodes(ntype)).tolist() == [ g1.number_of_nodes(ntype), g2.number_of_nodes(ntype), g3.number_of_nodes(ntype) ] assert bg2.number_of_nodes(ntype) == (g1.number_of_nodes(ntype) + g2.number_of_nodes(ntype) + g3.number_of_nodes(ntype)) # Test number of edges for etype in bg2.canonical_etypes: assert F.asnumpy(bg2.batch_num_edges(etype)).tolist() == [ g1.number_of_edges(etype), g2.number_of_edges(etype), g3.number_of_edges(etype) ] assert bg2.number_of_edges(etype) == (g1.number_of_edges(etype) + g2.number_of_edges(etype) + g3.number_of_edges(etype)) # Test relabeled nodes for ntype in bg2.ntypes: assert list(F.asnumpy(bg2.nodes(ntype))) == list( range(bg2.number_of_nodes(ntype))) # Test relabeled edges src, dst = bg2.edges(etype='follows') assert list(F.asnumpy(src)) == [0, 1, 3, 4, 6] assert list(F.asnumpy(dst)) == [1, 2, 4, 5, 7] src, dst = bg2.edges(etype='plays') assert list(F.asnumpy(src)) == [0, 1, 3, 4, 7] assert list(F.asnumpy(dst)) == [0, 0, 1, 1, 2] # Test unbatching graphs g4, g5, g6 = dgl.unbatch(bg2) check_equivalence_between_heterographs(g1, g4) check_equivalence_between_heterographs(g2, g5) check_equivalence_between_heterographs(g3, g6)
def _check_neighbor_sampling_dataloader(g, nids, dl, mode): seeds = defaultdict(list) for item in dl: if mode == 'node': input_nodes, output_nodes, blocks = item elif mode == 'edge': input_nodes, pair_graph, blocks = item output_nodes = pair_graph.ndata[dgl.NID] elif mode == 'link': input_nodes, pair_graph, neg_graph, blocks = item output_nodes = pair_graph.ndata[dgl.NID] for ntype in pair_graph.ntypes: assert F.array_equal(pair_graph.nodes[ntype].data[dgl.NID], neg_graph.nodes[ntype].data[dgl.NID]) if len(g.ntypes) > 1: for ntype in g.ntypes: assert F.array_equal(input_nodes[ntype], blocks[0].srcnodes[ntype].data[dgl.NID]) assert F.array_equal(output_nodes[ntype], blocks[-1].dstnodes[ntype].data[dgl.NID]) else: assert F.array_equal(input_nodes, blocks[0].srcdata[dgl.NID]) assert F.array_equal(output_nodes, blocks[-1].dstdata[dgl.NID]) prev_dst = {ntype: None for ntype in g.ntypes} for block in blocks: for canonical_etype in block.canonical_etypes: utype, etype, vtype = canonical_etype uu, vv = block.all_edges(order='eid', etype=canonical_etype) src = block.srcnodes[utype].data[dgl.NID] dst = block.dstnodes[vtype].data[dgl.NID] if prev_dst[utype] is not None: assert F.array_equal(src, prev_dst[utype]) u = src[uu] v = dst[vv] assert F.asnumpy(g.has_edges_between(u, v, etype=canonical_etype)).all() eid = block.edges[canonical_etype].data[dgl.EID] ufound, vfound = g.find_edges(eid, etype=canonical_etype) assert F.array_equal(ufound, u) assert F.array_equal(vfound, v) for ntype in block.dsttypes: src = block.srcnodes[ntype].data[dgl.NID] dst = block.dstnodes[ntype].data[dgl.NID] assert F.array_equal(src[:block.number_of_dst_nodes(ntype)], dst) prev_dst[ntype] = dst if mode == 'node': for ntype in blocks[-1].dsttypes: seeds[ntype].append(blocks[-1].dstnodes[ntype].data[dgl.NID]) elif mode == 'edge' or mode == 'link': for etype in pair_graph.canonical_etypes: seeds[etype].append(pair_graph.edges[etype].data[dgl.EID]) # Check if all nodes/edges are iterated seeds = {k: F.cat(v, 0) for k, v in seeds.items()} for k, v in seeds.items(): if k in nids: seed_set = set(F.asnumpy(nids[k])) elif isinstance(k, tuple) and k[1] in nids: seed_set = set(F.asnumpy(nids[k[1]])) else: continue v_set = set(F.asnumpy(v)) assert v_set == seed_set
def test_empty_relation(idtype): """Test the features of batched DGLHeteroGraphs""" g1 = dgl.heterograph( { ('user', 'follows', 'user'): ([0, 1], [1, 2]), ('user', 'plays', 'game'): ([], []) }, idtype=idtype, device=F.ctx()) g1.nodes['user'].data['h1'] = F.tensor([[0.], [1.], [2.]]) g1.nodes['user'].data['h2'] = F.tensor([[3.], [4.], [5.]]) g1.edges['follows'].data['h1'] = F.tensor([[0.], [1.]]) g1.edges['follows'].data['h2'] = F.tensor([[2.], [3.]]) g2 = dgl.heterograph( { ('user', 'follows', 'user'): ([0, 1], [1, 2]), ('user', 'plays', 'game'): ([0, 1], [0, 0]) }, idtype=idtype, device=F.ctx()) g2.nodes['user'].data['h1'] = F.tensor([[0.], [1.], [2.]]) g2.nodes['user'].data['h2'] = F.tensor([[3.], [4.], [5.]]) g2.nodes['game'].data['h1'] = F.tensor([[0.]]) g2.nodes['game'].data['h2'] = F.tensor([[1.]]) g2.edges['follows'].data['h1'] = F.tensor([[0.], [1.]]) g2.edges['follows'].data['h2'] = F.tensor([[2.], [3.]]) g2.edges['plays'].data['h1'] = F.tensor([[0.], [1.]]) bg = dgl.batch([g1, g2]) # Test number of nodes for ntype in bg.ntypes: assert F.asnumpy(bg.batch_num_nodes(ntype)).tolist() == [ g1.number_of_nodes(ntype), g2.number_of_nodes(ntype) ] # Test number of edges for etype in bg.canonical_etypes: assert F.asnumpy(bg.batch_num_edges(etype)).tolist() == [ g1.number_of_edges(etype), g2.number_of_edges(etype) ] # Test features assert F.allclose( bg.nodes['user'].data['h1'], F.cat([g1.nodes['user'].data['h1'], g2.nodes['user'].data['h1']], dim=0)) assert F.allclose( bg.nodes['user'].data['h2'], F.cat([g1.nodes['user'].data['h2'], g2.nodes['user'].data['h2']], dim=0)) assert F.allclose(bg.nodes['game'].data['h1'], g2.nodes['game'].data['h1']) assert F.allclose(bg.nodes['game'].data['h2'], g2.nodes['game'].data['h2']) assert F.allclose( bg.edges['follows'].data['h1'], F.cat([g1.edges['follows'].data['h1'], g2.edges['follows'].data['h1']], dim=0)) assert F.allclose(bg.edges['plays'].data['h1'], g2.edges['plays'].data['h1']) # Test unbatching graphs g3, g4 = dgl.unbatch(bg) check_equivalence_between_heterographs(g1, g3, node_attrs={ 'user': ['h1', 'h2'], 'game': ['h1', 'h2'] }, edge_attrs={ ('user', 'follows', 'user'): ['h1'] }) check_equivalence_between_heterographs(g2, g4, node_attrs={ 'user': ['h1', 'h2'], 'game': ['h1', 'h2'] }, edge_attrs={ ('user', 'follows', 'user'): ['h1'] }) # Test graphs without edges g1 = dgl.heterograph({('u', 'r', 'v'): ([], [])}, {'u': 0, 'v': 4}) g2 = dgl.heterograph({('u', 'r', 'v'): ([], [])}, {'u': 1, 'v': 5}) dgl.batch([g1, g2])
def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges): # Test API for ntype in num_nodes: assert ntype in g.ntypes assert num_nodes[ntype] == g.number_of_nodes(ntype) for etype in num_edges: assert etype in g.etypes assert num_edges[etype] == g.number_of_edges(etype) assert g.number_of_nodes() == sum( [num_nodes[ntype] for ntype in num_nodes]) assert g.number_of_edges() == sum( [num_edges[etype] for etype in num_edges]) # Test reading node data nids = F.arange(0, int(g.number_of_nodes('n1') / 2)) feats1 = g.nodes['n1'].data['feat'][nids] feats = F.squeeze(feats1, 1) assert np.all(F.asnumpy(feats == nids)) # Test reading edge data eids = F.arange(0, int(g.number_of_edges('r1') / 2)) feats1 = g.edges['r1'].data['feat'][eids] feats = F.squeeze(feats1, 1) assert np.all(F.asnumpy(feats == eids)) # Test init node data new_shape = (g.number_of_nodes('n1'), 2) g.nodes['n1'].data['test1'] = dgl.distributed.DistTensor( new_shape, F.int32) feats = g.nodes['n1'].data['test1'][nids] assert np.all(F.asnumpy(feats) == 0) # create a tensor and destroy a tensor and create it again. test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init) del test3 test3 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test3') del test3 # add tests for anonymous distributed tensor. test3 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) data = test3[0:10] test4 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) del test3 test5 = dgl.distributed.DistTensor(new_shape, F.float32, init_func=rand_init) assert np.sum(F.asnumpy(test5[0:10] != data)) > 0 # test a persistent tesnor test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init, persistent=True) del test4 try: test4 = dgl.distributed.DistTensor((g.number_of_nodes('n1'), 3), F.float32, 'test4') raise Exception('') except: pass # Test write data new_feats = F.ones((len(nids), 2), F.int32, F.cpu()) g.nodes['n1'].data['test1'][nids] = new_feats feats = g.nodes['n1'].data['test1'][nids] assert np.all(F.asnumpy(feats) == 1) # Test metadata operations. assert len(g.nodes['n1'].data['feat']) == g.number_of_nodes('n1') assert g.nodes['n1'].data['feat'].shape == (g.number_of_nodes('n1'), 1) assert g.nodes['n1'].data['feat'].dtype == F.int64 selected_nodes = np.random.randint(0, 100, size=g.number_of_nodes('n1')) > 30 # Test node split nodes = node_split(selected_nodes, g.get_partition_book(), ntype='n1') nodes = F.asnumpy(nodes) # We only have one partition, so the local nodes are basically all nodes in the graph. local_nids = np.arange(g.number_of_nodes('n1')) for n in nodes: assert n in local_nids print('end')
def test_topology(gs, idtype): """Test batching two DGLHeteroGraphs where some nodes are isolated in some relations""" g1, g2 = gs g1 = g1.astype(idtype).to(F.ctx()) g2 = g2.astype(idtype).to(F.ctx()) bg = dgl.batch([g1, g2]) assert bg.idtype == idtype assert bg.device == F.ctx() assert bg.ntypes == g2.ntypes assert bg.etypes == g2.etypes assert bg.canonical_etypes == g2.canonical_etypes assert bg.batch_size == 2 # Test number of nodes for ntype in bg.ntypes: print(ntype) assert F.asnumpy(bg.batch_num_nodes(ntype)).tolist() == [ g1.number_of_nodes(ntype), g2.number_of_nodes(ntype) ] assert bg.number_of_nodes(ntype) == (g1.number_of_nodes(ntype) + g2.number_of_nodes(ntype)) # Test number of edges for etype in bg.canonical_etypes: assert F.asnumpy(bg.batch_num_edges(etype)).tolist() == [ g1.number_of_edges(etype), g2.number_of_edges(etype) ] assert bg.number_of_edges(etype) == (g1.number_of_edges(etype) + g2.number_of_edges(etype)) # Test relabeled nodes for ntype in bg.ntypes: assert list(F.asnumpy(bg.nodes(ntype))) == list( range(bg.number_of_nodes(ntype))) # Test relabeled edges src, dst = bg.edges(etype=('user', 'follows', 'user')) assert list(F.asnumpy(src)) == [0, 1, 4, 5] assert list(F.asnumpy(dst)) == [1, 2, 5, 6] src, dst = bg.edges(etype=('user', 'follows', 'developer')) assert list(F.asnumpy(src)) == [0, 1, 4, 5] assert list(F.asnumpy(dst)) == [1, 2, 4, 5] src, dst, eid = bg.edges(etype='plays', form='all') assert list(F.asnumpy(src)) == [0, 1, 2, 3, 4, 5, 6] assert list(F.asnumpy(dst)) == [0, 0, 1, 1, 2, 2, 3] assert list(F.asnumpy(eid)) == [0, 1, 2, 3, 4, 5, 6] # Test unbatching graphs g3, g4 = dgl.unbatch(bg) check_equivalence_between_heterographs(g1, g3) check_equivalence_between_heterographs(g2, g4) # Test dtype cast if idtype == "int32": bg_cast = bg.long() else: bg_cast = bg.int() assert bg.batch_size == bg_cast.batch_size # Test local var bg_local = bg.local_var() assert bg.batch_size == bg_local.batch_size
def test_split_even(): #prepare_dist(1) g = create_random_graph(10000) num_parts = 4 num_hops = 2 partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph', num_hops=num_hops, part_method='metis') node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30 edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30 selected_nodes = np.nonzero(node_mask)[0] selected_edges = np.nonzero(edge_mask)[0] all_nodes1 = [] all_nodes2 = [] all_edges1 = [] all_edges2 = [] # The code now collects the roles of all client processes and use the information # to determine how to split the workloads. Here is to simulate the multi-client # use case. def set_roles(num_clients): dgl.distributed.role.CUR_ROLE = 'default' dgl.distributed.role.GLOBAL_RANK = {i: i for i in range(num_clients)} dgl.distributed.role.PER_ROLE_RANK['default'] = { i: i for i in range(num_clients) } for i in range(num_parts): set_roles(num_parts) part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( '/tmp/dist_graph/dist_graph_test.json', i) local_nids = F.nonzero_1d(part_g.ndata['inner_node']) local_nids = F.gather_row(part_g.ndata[dgl.NID], local_nids) nodes = node_split(node_mask, gpb, rank=i, force_even=True) all_nodes1.append(nodes) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids)) print('part {} get {} nodes and {} are in the partition'.format( i, len(nodes), len(subset))) set_roles(num_parts * 2) nodes1 = node_split(node_mask, gpb, rank=i * 2, force_even=True) nodes2 = node_split(node_mask, gpb, rank=i * 2 + 1, force_even=True) nodes3, _ = F.sort_1d(F.cat([nodes1, nodes2], 0)) all_nodes2.append(nodes3) subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3)) print('intersection has', len(subset)) set_roles(num_parts) local_eids = F.nonzero_1d(part_g.edata['inner_edge']) local_eids = F.gather_row(part_g.edata[dgl.EID], local_eids) edges = edge_split(edge_mask, gpb, rank=i, force_even=True) all_edges1.append(edges) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids)) print('part {} get {} edges and {} are in the partition'.format( i, len(edges), len(subset))) set_roles(num_parts * 2) edges1 = edge_split(edge_mask, gpb, rank=i * 2, force_even=True) edges2 = edge_split(edge_mask, gpb, rank=i * 2 + 1, force_even=True) edges3, _ = F.sort_1d(F.cat([edges1, edges2], 0)) all_edges2.append(edges3) subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3)) print('intersection has', len(subset)) all_nodes1 = F.cat(all_nodes1, 0) all_edges1 = F.cat(all_edges1, 0) all_nodes2 = F.cat(all_nodes2, 0) all_edges2 = F.cat(all_edges2, 0) all_nodes = np.nonzero(node_mask)[0] all_edges = np.nonzero(edge_mask)[0] assert np.all(all_nodes == F.asnumpy(all_nodes1)) assert np.all(all_edges == F.asnumpy(all_edges1)) assert np.all(all_nodes == F.asnumpy(all_nodes2)) assert np.all(all_edges == F.asnumpy(all_edges2))
def test_topk(g, idtype, descending): g = g.astype(idtype).to(F.ctx()) g.ndata['x'] = F.randn((g.number_of_nodes(), 3)) # Test.1: to test the case where k > number of nodes. dgl.topk_nodes(g, 'x', 100, sortby=-1) # Test.2: test correctness min_nnodes = F.asnumpy(g.batch_num_nodes()).min() if min_nnodes <= 1: return k = min_nnodes - 1 val, indices = dgl.topk_nodes(g, 'x', k, descending=descending, sortby=-1) print(k) print(g.ndata['x']) print('val', val) print('indices', indices) subg = dgl.unbatch(g) subval, subidx = [], [] for sg in subg: subx = F.asnumpy(sg.ndata['x']) ai = np.argsort(subx[:, -1:].flatten()) if descending: ai = np.ascontiguousarray(ai[::-1]) subx = np.expand_dims(subx[ai[:k]], 0) subval.append(F.tensor(subx)) subidx.append(F.tensor(np.expand_dims(ai[:k], 0))) print(F.cat(subval, dim=0)) assert F.allclose(val, F.cat(subval, dim=0)) assert F.allclose(indices, F.cat(subidx, dim=0)) # Test.3: sorby=None dgl.topk_nodes(g, 'x', k, sortby=None) g.edata['x'] = F.randn((g.number_of_edges(), 3)) # Test.4: topk edges where k > number of edges. dgl.topk_edges(g, 'x', 100, sortby=-1) # Test.5: topk edges test correctness min_nedges = F.asnumpy(g.batch_num_edges()).min() if min_nedges <= 1: return k = min_nedges - 1 val, indices = dgl.topk_edges(g, 'x', k, descending=descending, sortby=-1) print(k) print(g.edata['x']) print('val', val) print('indices', indices) subg = dgl.unbatch(g) subval, subidx = [], [] for sg in subg: subx = F.asnumpy(sg.edata['x']) ai = np.argsort(subx[:, -1:].flatten()) if descending: ai = np.ascontiguousarray(ai[::-1]) subx = np.expand_dims(subx[ai[:k]], 0) subval.append(F.tensor(subx)) subidx.append(F.tensor(np.expand_dims(ai[:k], 0))) print(F.cat(subval, dim=0)) assert F.allclose(val, F.cat(subval, dim=0)) assert F.allclose(indices, F.cat(subidx, dim=0))
def check_negative_sampler(mode, exclude_positive, neg_size): g = generate_rand_graph(100) num_edges = g.number_of_edges() etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64) g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu()) pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid') pos_map = {} for i in range(len(pos_geid)): pos_d = int(F.asnumpy(pos_gdst[i])) pos_e = int(F.asnumpy(pos_geid[i])) pos_map[(pos_d, pos_e)] = int(F.asnumpy(pos_gsrc[i])) EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler') # Test the homogeneous graph. total_samples = 0 batch_size = 50 max_samples = num_edges for pos_edges, neg_edges in EdgeSampler(g, batch_size, negative_mode=mode, neg_sample_size=neg_size, exclude_positive=exclude_positive, return_false_neg=True): pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid') assert_array_equal( F.asnumpy(F.gather_row(pos_edges.parent_eid, pos_leid)), F.asnumpy( g.edge_ids(F.gather_row(pos_edges.parent_nid, pos_lsrc), F.gather_row(pos_edges.parent_nid, pos_ldst)))) neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid') neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc) neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst) neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid) for i in range(len(neg_eid)): neg_d = int(F.asnumpy(neg_dst)[i]) neg_e = int(F.asnumpy(neg_eid)[i]) assert (neg_d, neg_e) in pos_map if exclude_positive: assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)] check_head_tail(neg_edges) pos_tails = F.gather_row(pos_edges.parent_nid, pos_edges.tail_nid) neg_tails = F.gather_row(neg_edges.parent_nid, neg_edges.tail_nid) pos_tails = np.sort(F.asnumpy(pos_tails)) neg_tails = np.sort(F.asnumpy(neg_tails)) np.testing.assert_equal(pos_tails, neg_tails) exist = neg_edges.edata['false_neg'] if exclude_positive: assert np.sum(F.asnumpy(exist) == 0) == len(exist) else: assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist) total_samples += batch_size if (total_samples >= max_samples): break # Test the knowledge graph. total_samples = 0 for _, neg_edges in EdgeSampler(g, batch_size, negative_mode=mode, neg_sample_size=neg_size, exclude_positive=exclude_positive, relations=g.edata['etype'], return_false_neg=True): neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid') neg_src = F.gather_row(neg_edges.parent_nid, neg_lsrc) neg_dst = F.gather_row(neg_edges.parent_nid, neg_ldst) neg_eid = F.gather_row(neg_edges.parent_eid, neg_leid) exists = neg_edges.edata['false_neg'] neg_edges.edata['etype'] = F.gather_row(g.edata['etype'], neg_eid) for i in range(len(neg_eid)): u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i]) if g.has_edge_between(u, v): eid = g.edge_id(u, v) etype = g.edata['etype'][eid] exist = neg_edges.edata['etype'][i] == etype assert F.asnumpy(exists[i]) == F.asnumpy(exist) total_samples += batch_size if (total_samples >= max_samples): break
def check_hetero_partition(hg, part_method, num_parts=4, num_trainers_per_machine=1, load_feats=True): hg.nodes['n1'].data['labels'] = F.arange(0, hg.number_of_nodes('n1')) hg.nodes['n1'].data['feats'] = F.tensor( np.random.randn(hg.number_of_nodes('n1'), 10), F.float32) hg.edges['r1'].data['feats'] = F.tensor( np.random.randn(hg.number_of_edges('r1'), 10), F.float32) hg.edges['r1'].data['labels'] = F.arange(0, hg.number_of_edges('r1')) num_hops = 1 orig_nids, orig_eids = partition_graph( hg, 'test', num_parts, '/tmp/partition', num_hops=num_hops, part_method=part_method, reshuffle=True, return_mapping=True, num_trainers_per_machine=num_trainers_per_machine) assert len(orig_nids) == len(hg.ntypes) assert len(orig_eids) == len(hg.etypes) for ntype in hg.ntypes: assert len(orig_nids[ntype]) == hg.number_of_nodes(ntype) for etype in hg.etypes: assert len(orig_eids[etype]) == hg.number_of_edges(etype) parts = [] shuffled_labels = [] shuffled_elabels = [] for i in range(num_parts): part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition( '/tmp/partition/test.json', i, load_feats=load_feats) if not load_feats: assert not node_feats assert not edge_feats node_feats, edge_feats = load_partition_feats( '/tmp/partition/test.json', i) if num_trainers_per_machine > 1: for ntype in hg.ntypes: name = ntype + '/trainer_id' assert name in node_feats part_ids = F.floor_div(node_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) for etype in hg.etypes: name = etype + '/trainer_id' assert name in edge_feats part_ids = F.floor_div(edge_feats[name], num_trainers_per_machine) assert np.all(F.asnumpy(part_ids) == i) # Verify the mapping between the reshuffled IDs and the original IDs. # These are partition-local IDs. part_src_ids, part_dst_ids = part_g.edges() # These are reshuffled global homogeneous IDs. part_src_ids = F.gather_row(part_g.ndata[dgl.NID], part_src_ids) part_dst_ids = F.gather_row(part_g.ndata[dgl.NID], part_dst_ids) part_eids = part_g.edata[dgl.EID] # These are reshuffled per-type IDs. src_ntype_ids, part_src_ids = gpb.map_to_per_ntype(part_src_ids) dst_ntype_ids, part_dst_ids = gpb.map_to_per_ntype(part_dst_ids) etype_ids, part_eids = gpb.map_to_per_etype(part_eids) # These are original per-type IDs. for etype_id, etype in enumerate(hg.etypes): part_src_ids1 = F.boolean_mask(part_src_ids, etype_ids == etype_id) src_ntype_ids1 = F.boolean_mask(src_ntype_ids, etype_ids == etype_id) part_dst_ids1 = F.boolean_mask(part_dst_ids, etype_ids == etype_id) dst_ntype_ids1 = F.boolean_mask(dst_ntype_ids, etype_ids == etype_id) part_eids1 = F.boolean_mask(part_eids, etype_ids == etype_id) assert np.all(F.asnumpy(src_ntype_ids1 == src_ntype_ids1[0])) assert np.all(F.asnumpy(dst_ntype_ids1 == dst_ntype_ids1[0])) src_ntype = hg.ntypes[F.as_scalar(src_ntype_ids1[0])] dst_ntype = hg.ntypes[F.as_scalar(dst_ntype_ids1[0])] orig_src_ids1 = F.gather_row(orig_nids[src_ntype], part_src_ids1) orig_dst_ids1 = F.gather_row(orig_nids[dst_ntype], part_dst_ids1) orig_eids1 = F.gather_row(orig_eids[etype], part_eids1) orig_eids2 = hg.edge_ids(orig_src_ids1, orig_dst_ids1, etype=etype) assert len(orig_eids1) == len(orig_eids2) assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) parts.append(part_g) verify_graph_feats(hg, gpb, part_g, node_feats, edge_feats) shuffled_labels.append(node_feats['n1/labels']) shuffled_elabels.append(edge_feats['r1/labels']) verify_hetero_graph(hg, parts) shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) shuffled_elabels = F.asnumpy(F.cat(shuffled_elabels, 0)) orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) orig_elabels = np.zeros(shuffled_elabels.shape, dtype=shuffled_elabels.dtype) orig_labels[F.asnumpy(orig_nids['n1'])] = shuffled_labels orig_elabels[F.asnumpy(orig_eids['r1'])] = shuffled_elabels assert np.all(orig_labels == F.asnumpy(hg.nodes['n1'].data['labels'])) assert np.all(orig_elabels == F.asnumpy(hg.edges['r1'].data['labels']))