def test_partition():
    """Partition a random graph with METIS and validate each loaded partition.

    For every partition the test checks that the metadata carries the node
    and edge counts of the full graph, that the node/edge maps agree with the
    global IDs stored in the partition, and that the saved node features
    equal the corresponding rows of the original graph.
    """
    g = create_random_graph(10000)
    total_nodes = g.number_of_nodes()
    g.ndata['labels'] = F.arange(0, total_nodes)
    g.ndata['feats'] = F.tensor(np.random.randn(total_nodes, 10))
    num_parts = 4

    partition_graph(g, 'test', num_parts, '/tmp',
                    num_hops=2, part_method='metis')

    for part_id in range(num_parts):
        part_g, node_feats, edge_feats, meta = load_partition(
            '/tmp/test.json', part_id)
        num_nodes, num_edges, node_map, edge_map = meta

        # Check the metadata
        assert num_nodes == g.number_of_nodes()
        assert num_edges == g.number_of_edges()

        # Check the node map.
        global_nids = F.asnumpy(part_g.ndata[dgl.NID])
        owned_nodes = np.nonzero(node_map == part_id)[0]
        owned_nodes1 = global_nids[node_map[global_nids] == part_id]
        assert np.all(owned_nodes == owned_nodes1)

        # Check the edge map.
        assert np.all(edge_map >= 0)
        global_eids = F.asnumpy(part_g.edata[dgl.EID])
        owned_edges = np.nonzero(edge_map == part_id)[0]
        owned_edges1 = global_eids[edge_map[global_eids] == part_id]
        assert np.all(owned_edges == np.sort(owned_edges1))

        # Every node feature must be present and match the original rows.
        for name in ['labels', 'feats']:
            assert name in node_feats
            stored = node_feats[name]
            assert stored.shape[0] == len(owned_nodes)
            assert len(owned_nodes) == len(stored)
            expected = F.asnumpy(g.ndata[name][owned_nodes])
            assert np.all(expected == F.asnumpy(stored))
        assert len(edge_feats) == 0
def test_standalone():
    """Run the DistGraph checks in standalone mode on a single partition."""
    os.environ['DGL_DIST_MODE'] = 'standalone'

    g = create_random_graph(10000)
    # Partition the graph
    graph_name = 'dist_graph_test_3'
    node_ids = F.arange(0, g.number_of_nodes())
    edge_ids = F.arange(0, g.number_of_edges())
    g.ndata['features'] = F.unsqueeze(node_ids, 1)
    g.edata['features'] = F.unsqueeze(edge_ids, 1)
    partition_graph(g, graph_name, 1, '/tmp/dist_graph')

    dgl.distributed.initialize("kv_ip_config.txt")
    part_config = '/tmp/dist_graph/{}.json'.format(graph_name)
    dist_g = DistGraph("kv_ip_config.txt", graph_name,
                       part_config=part_config)
    check_dist_graph(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
    # this is needed since there's two test here in one process
    dgl.distributed.exit_client()
def check_rpc_sampling_shuffle(tmpdir, num_server):
    """Check RPC neighbor sampling on a reshuffled, partitioned cora graph.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    the graph with ``reshuffle=True``, spawns ``num_server`` server
    processes, samples through a client, and maps the shuffled node/edge IDs
    back to the original IDs before comparing with the unpartitioned graph.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files. It is combined with a file
        name through the ``/`` operator, so it must support that operator.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{} 1\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server
    num_hops = 1

    partition_graph(g, 'test_sampling', num_parts, tmpdir,
                    num_hops=num_hops, part_method='metis', reshuffle=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_sampling'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    sampled_graph = start_sample_client(0, tmpdir, num_server > 1)
    print("Done sampling")
    for p in pserver_list:
        p.join()

    # Rebuild the shuffled-ID -> original-ID lookup tables from the partitions.
    orig_nid = F.zeros((g.number_of_nodes(),), dtype=F.int64)
    orig_eid = F.zeros((g.number_of_edges(),), dtype=F.int64)
    for i in range(num_server):
        part, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
        orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    src, dst = sampled_graph.edges()
    src = orig_nid[src]
    dst = orig_nid[dst]
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    assert np.all(F.asnumpy(g.has_edges_between(src, dst)))
    eids = g.edge_ids(src, dst)
    eids1 = orig_eid[sampled_graph.edata[dgl.EID]]
    assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids))
def check_rpc_in_subgraph(tmpdir, num_server):
    """Check the distributed in-subgraph query against ``dgl.in_subgraph``.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    cora without reshuffling, spawns ``num_server`` servers, fetches the
    in-subgraph of a fixed node list through a client, and compares its edges
    and edge IDs with the result computed on the unpartitioned graph.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{} 1\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server

    partition_graph(g, 'test_in_subgraph', num_parts, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=False)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_in_subgraph'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    nodes = [0, 10, 99, 66, 1024, 2008]
    time.sleep(3)
    sampled_graph = start_in_subgraph_client(0, tmpdir, num_server > 1, nodes)
    for p in pserver_list:
        p.join()

    src, dst = sampled_graph.edges()
    g = dgl.as_heterograph(g)
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    subg1 = dgl.in_subgraph(g, nodes)
    src1, dst1 = subg1.edges()
    # Edge order may differ, so compare the sorted endpoint arrays.
    assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
    assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
    eids = g.edge_ids(src, dst)
    assert np.array_equal(F.asnumpy(sampled_graph.edata[dgl.EID]),
                          F.asnumpy(eids))
def check_rpc_find_edges_shuffle(tmpdir, num_server):
    """Check distributed ``find_edges`` on a reshuffled, partitioned cora graph.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    the graph with ``reshuffle=True``, spawns ``num_server`` servers, queries
    100 random edge IDs through a client, and compares the endpoints (mapped
    back to original node IDs) with ``g.find_edges`` on the original graph.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files. It is combined with a file
        name through the ``/`` operator, so it must support that operator.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server

    partition_graph(g, 'test_find_edges', num_parts, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=True)

    # NOTE(review): the server processes are started but never joined in this
    # check - confirm that is intended.
    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_find_edges'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    # Rebuild the shuffled-ID -> original-ID lookup tables from the partitions.
    orig_nid = F.zeros((g.number_of_nodes(), ), dtype=F.int64)
    orig_eid = F.zeros((g.number_of_edges(), ), dtype=F.int64)
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(
            tmpdir / 'test_find_edges.json', i)
        orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    time.sleep(3)
    eids = F.tensor(np.random.randint(g.number_of_edges(), size=100))
    u, v = g.find_edges(orig_eid[eids])
    du, dv = start_find_edges_client(0, tmpdir, num_server > 1, eids)
    du = orig_nid[du]
    dv = orig_nid[dv]
    assert F.array_equal(u, du)
    assert F.array_equal(v, dv)
def check_standalone_etype_sampling(tmpdir, reshuffle):
    """Check per-edge-type neighbor sampling in standalone mode on cora.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files; joined with the partition
        config file name through the ``/`` operator.
    reshuffle
        Forwarded unchanged to ``partition_graph`` as ``reshuffle``.
    """
    hg = CitationGraphDataset('cora')[0]

    partition_graph(hg, 'test_sampling', 1, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=reshuffle)

    os.environ['DGL_DIST_MODE'] = 'standalone'
    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_sampling",
                           part_config=tmpdir / 'test_sampling.json')
    seeds = [0, 10, 99, 66, 1023]
    sampled_graph = sample_etype_neighbors(dist_graph, seeds, dgl.ETYPE, 3)

    src, dst = sampled_graph.edges()
    assert sampled_graph.number_of_nodes() == hg.number_of_nodes()
    assert np.all(F.asnumpy(hg.has_edges_between(src, dst)))
    expected_eids = hg.edge_ids(src, dst)
    sampled_eids = sampled_graph.edata[dgl.EID]
    assert np.array_equal(F.asnumpy(sampled_eids), F.asnumpy(expected_eids))
    dgl.distributed.exit_client()
def check_rpc_get_degree_shuffle(tmpdir, num_server):
    """Check distributed degree queries on a reshuffled, partitioned cora graph.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files; joined with the partition
        config file name through the ``/`` operator.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    generate_ip_config("rpc_ip_config.txt", num_server, num_server)

    g = CitationGraphDataset("cora")[0]
    g.readonly()

    partition_graph(g, 'test_get_degrees', num_server, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=True)

    multi_part = num_server > 1
    mp_ctx = mp.get_context('spawn')
    servers = []
    for server_id in range(num_server):
        proc = mp_ctx.Process(
            target=start_server,
            args=(server_id, tmpdir, multi_part, 'test_get_degrees'))
        proc.start()
        time.sleep(1)
        servers.append(proc)

    # Lookup table from shuffled node IDs to the original node IDs.
    orig_nid = F.zeros((g.number_of_nodes(), ), dtype=F.int64, ctx=F.cpu())
    for part_id in range(num_server):
        part, _, _, _, _, _, _ = load_partition(
            tmpdir / 'test_get_degrees.json', part_id)
        orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']

    nids = F.tensor(np.random.randint(g.number_of_nodes(), size=100))
    degrees = start_get_degrees_client(0, tmpdir, multi_part, nids)
    in_degs, out_degs, all_in_degs, all_out_degs = degrees

    print("Done get_degree")
    for proc in servers:
        proc.join()

    print('check results')
    assert F.array_equal(g.in_degrees(orig_nid[nids]), in_degs)
    assert F.array_equal(g.in_degrees(orig_nid), all_in_degs)
    assert F.array_equal(g.out_degrees(orig_nid[nids]), out_degs)
    assert F.array_equal(g.out_degrees(orig_nid), all_out_degs)
def test_split():
    """Check that node_split/edge_split return the masked local nodes/edges.

    A random graph is split into four METIS partitions. For each partition
    the selected IDs returned by ``node_split`` and ``edge_split`` must equal
    the intersection of the randomly masked IDs with the partition's local
    IDs.
    """
    prepare_dist()
    g = create_random_graph(10000)
    num_parts = 4
    partition_graph(g, 'test', num_parts, '/tmp',
                    num_hops=2, part_method='metis')

    # Draw the node mask first, then the edge mask.
    node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30
    edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30
    selected_nodes = np.nonzero(node_mask)[0]
    selected_edges = np.nonzero(edge_mask)[0]

    for part_id in range(num_parts):
        part_g, node_feats, edge_feats, meta = load_partition(
            '/tmp/test.json', part_id)
        num_nodes, num_edges, node_map, edge_map, num_partitions = meta
        gpb = GraphPartitionBook(part_id=part_id,
                                 num_parts=num_partitions,
                                 node_map=node_map,
                                 edge_map=edge_map,
                                 part_graph=part_g)

        # Nodes
        local_idx = F.nonzero_1d(part_g.ndata['local_node'])
        local_nids = F.asnumpy(F.gather_row(part_g.ndata[dgl.NID], local_idx))
        nodes1 = np.intersect1d(selected_nodes, local_nids)
        nodes2 = node_split(node_mask, gpb, part_id)
        assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2)))
        for n in nodes1:
            assert n in local_nids

        # Edges
        local_idx = F.nonzero_1d(part_g.edata['local_edge'])
        local_eids = F.asnumpy(F.gather_row(part_g.edata[dgl.EID], local_idx))
        edges1 = np.intersect1d(selected_edges, local_eids)
        edges2 = edge_split(edge_mask, gpb, part_id)
        assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2)))
        for e in edges1:
            assert e in local_eids
def check_standalone_etype_sampling_heterograph(tmpdir, reshuffle):
    """Check standalone per-edge-type sampling on a two-relation graph.

    The cora edges are used to build a heterograph with a ``cite`` relation
    and its reverse ``cite-by``; sampling one neighbor per edge type for ten
    seed nodes must yield ten edges for each relation.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files; joined with the partition
        config file name through the ``/`` operator.
    reshuffle
        Forwarded unchanged to ``partition_graph`` as ``reshuffle``.
    """
    hg = CitationGraphDataset('cora')[0]
    cite = ('paper', 'cite', 'paper')
    cite_by = ('paper', 'cite-by', 'paper')

    src, dst = hg.edges()
    new_hg = dgl.heterograph({cite: (src, dst), cite_by: (dst, src)},
                             {'paper': hg.number_of_nodes()})
    partition_graph(new_hg, 'test_hetero_sampling', 1, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=reshuffle)

    os.environ['DGL_DIST_MODE'] = 'standalone'
    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_hetero_sampling",
                           part_config=tmpdir / 'test_hetero_sampling.json')
    seeds = [0, 1, 2, 10, 99, 66, 1023, 1024, 2700, 2701]
    sampled_graph = sample_etype_neighbors(dist_graph, seeds, dgl.ETYPE, 1)

    for canonical_etype in (cite, cite_by):
        src, dst = sampled_graph.edges(etype=canonical_etype)
        assert len(src) == 10
    assert sampled_graph.number_of_nodes() == new_hg.number_of_nodes()
    dgl.distributed.exit_client()
def test_standalone(tmpdir):
    """Run the distributed dataloader in standalone mode on cora.

    Writes a single address line to ``mp_ip_config.txt``, partitions the
    graph into one reshuffled partition and calls ``start_dist_dataloader``.
    Any exception raised by the dataloader is printed rather than propagated,
    and the client is always shut down afterwards.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and ``start_dist_dataloader``.
    """
    # The context manager closes the file even if writing the address fails.
    with open("mp_ip_config.txt", "w") as ip_config:
        for _ in range(1):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = 1
    num_hops = 1

    partition_graph(g, 'test_sampling', num_parts, tmpdir,
                    num_hops=num_hops, part_method='metis', reshuffle=True)

    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True)
    except Exception as e:
        # Best effort: report the failure but still shut the client down.
        print(e)
    # this is needed since there's two test here in one process
    dgl.distributed.exit_client()
def check_rpc_sampling(tmpdir, num_server):
    """Check RPC neighbor sampling on a partitioned (not reshuffled) cora graph.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    the graph with ``reshuffle=False``, spawns ``num_server`` servers,
    samples through a client, and verifies that every sampled edge exists in
    the original graph with the same edge ID.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    print(g.idtype)
    num_parts = num_server
    num_hops = 1

    partition_graph(g, 'test_sampling', num_parts, tmpdir,
                    num_hops=num_hops, part_method='metis', reshuffle=False)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_sampling'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    sampled_graph = start_sample_client(0, tmpdir, num_server > 1)
    print("Done sampling")
    for p in pserver_list:
        p.join()

    src, dst = sampled_graph.edges()
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    assert np.all(F.asnumpy(g.has_edges_between(src, dst)))
    eids = g.edge_ids(src, dst)
    assert np.array_equal(F.asnumpy(sampled_graph.edata[dgl.EID]),
                          F.asnumpy(eids))
def test_dataloader(tmpdir, num_server, num_workers, dataloader_type):
    """Run the node dataloader against spawned servers on partitioned cora.

    Writes one address line per server to ``mp_ip_config.txt``, partitions
    the graph with ``reshuffle=True``, spawns ``num_server`` servers and,
    when ``dataloader_type`` is ``'node'``, one trainer process running
    ``start_node_dataloader``. All spawned processes are joined at the end.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    num_workers : int
        Passed to the trainer; ``num_workers + 1`` is passed to each server.
    dataloader_type : str
        Only ``'node'`` starts a trainer; any other value starts none.
    """
    # The context manager closes the file even if writing an address fails.
    with open("mp_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1

    partition_graph(g, 'test_sampling', num_parts, tmpdir,
                    num_hops=num_hops, part_method='metis', reshuffle=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ptrainer_list = []
    if dataloader_type == 'node':
        p = ctx.Process(target=start_node_dataloader,
                        args=(0, tmpdir, num_server, num_workers))
        p.start()
        time.sleep(1)
        ptrainer_list.append(p)

    for p in pserver_list:
        p.join()
    for p in ptrainer_list:
        p.join()
def check_dataloader(g, tmpdir, num_server, num_workers, dataloader_type):
    """Run a node or edge dataloader against spawned servers for graph ``g``.

    Writes one address line per server to ``mp_ip_config.txt``, partitions
    ``g`` with ``reshuffle=True`` and ``return_mapping=True``, spawns
    ``num_server`` servers and one trainer process selected by
    ``dataloader_type``. All spawned processes are joined at the end.

    Parameters
    ----------
    g
        Graph to partition; its ``ntypes``/``etypes`` are read when the
        returned ID mappings are not already dictionaries.
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    num_workers : int
        Exported as ``DGL_NUM_SAMPLER`` and passed to the trainer;
        ``num_workers + 1`` is passed to each server.
    dataloader_type : str
        ``'node'`` or ``'edge'``; any other value starts no trainer.
    """
    # The context manager closes the file even if writing an address fails.
    with open("mp_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    num_parts = num_server
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g, 'test_sampling', num_parts, tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=True,
                                         return_mapping=True)
    # Normalize the mappings to dictionaries keyed by the first type name.
    if not isinstance(orig_nid, dict):
        orig_nid = {g.ntypes[0]: orig_nid}
    if not isinstance(orig_eid, dict):
        orig_eid = {g.etypes[0]: orig_eid}

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer_list = []
    if dataloader_type == 'node':
        p = ctx.Process(target=start_node_dataloader,
                        args=(0, tmpdir, num_server, num_workers,
                              orig_nid, orig_eid, g))
        p.start()
        time.sleep(1)
        ptrainer_list.append(p)
    elif dataloader_type == 'edge':
        p = ctx.Process(target=start_edge_dataloader,
                        args=(0, tmpdir, num_server, num_workers,
                              orig_nid, orig_eid, g))
        p.start()
        time.sleep(1)
        ptrainer_list.append(p)

    for p in pserver_list:
        p.join()
    for p in ptrainer_list:
        p.join()
def check_dist_emb_server_client(shared_mem, num_servers, num_clients):
    """Spawn servers and embedding clients on a single-partition random graph.

    Parameters
    ----------
    shared_mem
        Forwarded to every ``run_server`` process.
    num_servers : int
        Number of server processes to spawn.
    num_clients : int
        Number of client processes to spawn; each must exit with code 0.
    """
    prepare_dist()
    g = create_random_graph(10000)

    # Partition the graph
    graph_name = 'dist_graph_test_2'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, 1, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    mp_ctx = mp.get_context('spawn')
    servers = []
    for serv_id in range(num_servers):
        server = mp_ctx.Process(
            target=run_server,
            args=(graph_name, serv_id, num_servers, num_clients, shared_mem))
        servers.append(server)
        server.start()

    clients = []
    for cli_id in range(num_clients):
        print('start client', cli_id)
        client = mp_ctx.Process(
            target=run_emb_client,
            args=(graph_name, 0, num_servers, num_clients,
                  g.number_of_nodes(), g.number_of_edges()))
        client.start()
        clients.append(client)

    for client in clients:
        client.join()
        assert client.exitcode == 0

    for server in servers:
        server.join()

    print('clients have terminated')
def test_graph_partition_book():
    """Validate GraphPartitionBook lookups for every partition of a random graph.

    The book built from each partition's metadata must report the right
    number of partitions, reproduce the node/edge maps, agree with its own
    metadata on per-partition sizes, and map the partition's global IDs to
    consecutive local IDs starting at zero.
    """
    g = create_random_graph(10000)
    g.ndata['labels'] = F.arange(0, g.number_of_nodes())
    g.ndata['feats'] = F.tensor(np.random.randn(g.number_of_nodes(), 10))
    num_parts = 4

    create_ip_config()
    partition_graph(g, 'test', num_parts, '/tmp',
                    num_hops=2, part_method='metis')

    for part_id in range(num_parts):
        part_g, node_feats, edge_feats, meta = load_partition(
            '/tmp/test.json', part_id)
        num_nodes, num_edges, node_map, edge_map, num_partitions = meta
        gpb = GraphPartitionBook(part_id=part_id,
                                 num_parts=num_partitions,
                                 node_map=node_map,
                                 edge_map=edge_map,
                                 part_graph=part_g)

        assert gpb.num_partitions() == num_parts
        gpb_meta = gpb.metadata()
        assert len(gpb_meta) == num_parts

        # Global ID -> partition ID lookups must reproduce the maps.
        node_owner = gpb.nid2partid(F.arange(0, len(node_map)))
        assert np.all(F.asnumpy(node_owner) == node_map)
        edge_owner = gpb.eid2partid(F.arange(0, len(edge_map)))
        assert np.all(F.asnumpy(edge_owner) == edge_map)

        # Partition ID -> global ID lookups must match the metadata sizes.
        assert len(gpb.partid2nids(part_id)) == gpb_meta[part_id]['num_nodes']
        assert len(gpb.partid2eids(part_id)) == gpb_meta[part_id]['num_edges']

        # Global -> local ID conversion must give 0..len-1 in order.
        local_nid = gpb.nid2localnid(part_g.ndata[dgl.NID], part_id)
        expected_nid = F.arange(0, len(local_nid))
        assert np.all(F.asnumpy(local_nid) == F.asnumpy(expected_nid))
        local_eid = gpb.eid2localeid(part_g.edata[dgl.EID], part_id)
        expected_eid = F.arange(0, len(local_eid))
        assert np.all(F.asnumpy(local_eid) == F.asnumpy(expected_eid))
def check_server_client_hetero(shared_mem, num_servers, num_clients):
    """Spawn servers and clients on a single-partition random heterograph.

    Parameters
    ----------
    shared_mem
        Forwarded to every ``run_server`` process.
    num_servers : int
        Number of server processes to spawn.
    num_clients : int
        Number of client processes to spawn; each must exit with code 0.
    """
    prepare_dist(num_servers)
    g = create_random_hetero()

    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    serv_ps = []
    ctx = mp.get_context('spawn')
    for serv_id in range(num_servers):
        p = ctx.Process(target=run_server,
                        args=(graph_name, serv_id, num_servers,
                              num_clients, shared_mem))
        serv_ps.append(p)
        p.start()

    cli_ps = []
    num_nodes = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
    num_edges = {etype: g.number_of_edges(etype) for etype in g.etypes}
    for cli_id in range(num_clients):
        print('start client', cli_id)
        p = ctx.Process(target=run_client_hetero,
                        args=(graph_name, 0, num_servers, num_clients,
                              num_nodes, num_edges))
        p.start()
        cli_ps.append(p)

    for p in cli_ps:
        p.join()
        # The client's checks run in a child process, so a failure there only
        # shows up here through a non-zero exit code.
        assert p.exitcode == 0

    for p in serv_ps:
        p.join()

    print('clients have terminated')
def check_rpc_sampling_shuffle(tmpdir, num_server, num_groups=1):
    """Run shuffled RPC sampling with one client process per client group.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    num_groups : int, optional
        Number of client groups. With more than one group the servers are
        started in keep-alive mode, asserted to be still alive once the
        clients finish, and then shut down explicitly.
    """
    generate_ip_config("rpc_ip_config.txt", num_server, num_server)

    g = CitationGraphDataset("cora")[0]
    g.readonly()

    partition_graph(g, 'test_sampling', num_server, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=True)

    mp_ctx = mp.get_context('spawn')
    keep_alive = num_groups > 1
    multi_part = num_server > 1

    servers = []
    for server_id in range(num_server):
        server = mp_ctx.Process(
            target=start_server,
            args=(server_id, tmpdir, multi_part, 'test_sampling',
                  ['csc', 'coo'], keep_alive))
        server.start()
        time.sleep(1)
        servers.append(server)

    clients = []
    num_clients = 1
    for client_id in range(num_clients):
        for group_id in range(num_groups):
            client = mp_ctx.Process(
                target=start_sample_client_shuffle,
                args=(client_id, tmpdir, multi_part, g, num_server, group_id))
            client.start()
            time.sleep(1)  # avoid race condition when instantiating DistGraph
            clients.append(client)
    for client in clients:
        client.join()

    if keep_alive:
        for server in servers:
            assert server.is_alive()
        # force shutdown server
        dgl.distributed.shutdown_servers("rpc_ip_config.txt", 1)
    for server in servers:
        server.join()
def check_hetero_partition(hg, part_method):
    """Partition a heterograph four ways and verify the partitions.

    Parameters
    ----------
    hg
        Heterograph with node type ``'n1'`` and edge type ``'r1'``; labels
        and random features are attached to it before partitioning.
    part_method
        Forwarded unchanged to ``partition_graph`` as ``part_method``.
    """
    n1_count = hg.number_of_nodes('n1')
    hg.nodes['n1'].data['labels'] = F.arange(0, n1_count)
    hg.nodes['n1'].data['feats'] = F.tensor(
        np.random.randn(n1_count, 10), F.float32)
    hg.edges['r1'].data['feats'] = F.tensor(
        np.random.randn(hg.number_of_edges('r1'), 10), F.float32)
    num_parts = 4

    partition_graph(hg, 'test', num_parts, '/tmp/partition',
                    num_hops=1, part_method=part_method, reshuffle=True)

    parts = []
    for part_id in range(num_parts):
        loaded = load_partition('/tmp/partition/test.json', part_id)
        part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = loaded
        parts.append(part_g)
        verify_graph_feats(hg, part_g, node_feats)
    verify_hetero_graph(hg, parts)
def check_standalone_sampling(tmpdir):
    """Check neighbor sampling on a single-partition DistGraph of cora.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files; joined with the partition
        config file name through the ``/`` operator.
    """
    g = CitationGraphDataset("cora")[0]

    partition_graph(g, 'test_sampling', 1, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=False)

    dist_graph = DistGraph(None, "test_sampling",
                           part_config=tmpdir / 'test_sampling.json')
    seeds = [0, 10, 99, 66, 1024, 2008]
    sampled_graph = sample_neighbors(dist_graph, seeds, 3)

    src, dst = sampled_graph.edges()
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    assert np.all(F.asnumpy(g.has_edges_between(src, dst)))
    expected_eids = g.edge_ids(src, dst)
    sampled_eids = sampled_graph.edata[dgl.EID]
    assert np.array_equal(F.asnumpy(sampled_eids), F.asnumpy(expected_eids))
def check_rpc_bipartite_etype_sampling_empty(tmpdir, num_server):
    """sample on bipartite via sample_etype_neighbors() which yields empty sample results"""
    generate_ip_config("rpc_ip_config.txt", num_server, num_server)

    g = create_random_bipartite()

    orig_nids, _ = partition_graph(g, 'test_sampling', num_server, tmpdir,
                                   num_hops=1, part_method='metis',
                                   reshuffle=True, return_mapping=True)

    multi_part = num_server > 1
    mp_ctx = mp.get_context('spawn')
    servers = []
    for server_id in range(num_server):
        server = mp_ctx.Process(
            target=start_server,
            args=(server_id, tmpdir, multi_part, 'test_sampling'))
        server.start()
        time.sleep(1)
        servers.append(server)

    # Seed the 'game' side with the nodes whose computed degree is zero.
    deg = get_degrees(g, orig_nids['game'], 'game')
    empty_nids = F.nonzero_1d(deg == 0)
    seeds = {'game': empty_nids, 'user': [1]}
    block, gpb = start_bipartite_etype_sample_client(0, tmpdir, multi_part,
                                                     nodes=seeds)

    print("Done sampling")
    for server in servers:
        server.join()

    assert block is not None
    assert block.number_of_edges() == 0
    assert len(block.etypes) == len(g.etypes)
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle):
    """Run the distributed dataloader against spawned servers on cora.

    Writes one address line per server to ``mp_ip_config.txt``, partitions
    the graph with ``return_mapping=True``, spawns ``num_server`` servers and
    one trainer running ``start_dist_dataloader``, then joins all processes.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    num_workers : int
        Exported as ``DGL_NUM_SAMPLER``; ``num_workers + 1`` is passed to
        each server.
    drop_last
        Forwarded unchanged to ``start_dist_dataloader``.
    reshuffle
        Forwarded unchanged to ``partition_graph`` as ``reshuffle``.
    """
    reset_envs()
    # The context manager closes the file even if writing an address fails.
    with open("mp_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g, 'test_sampling', num_parts, tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=reshuffle,
                                         return_mapping=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer = ctx.Process(target=start_dist_dataloader,
                           args=(0, tmpdir, num_server, drop_last,
                                 orig_nid, orig_eid))
    ptrainer.start()

    for p in pserver_list:
        p.join()
    ptrainer.join()
def check_rpc_hetero_etype_sampling_empty_shuffle(tmpdir, num_server):
    """Check per-edge-type RPC sampling when the seed nodes have degree zero.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    a random heterograph with ``reshuffle=True``, spawns ``num_server``
    servers, and samples from the ``'n3'`` nodes whose computed degree is
    zero. The resulting block must have no edges but keep every edge type.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = create_random_hetero(dense=True, empty=True)
    num_parts = num_server
    num_hops = 1

    orig_nids, _ = partition_graph(g, 'test_sampling', num_parts, tmpdir,
                                   num_hops=num_hops, part_method='metis',
                                   reshuffle=True, return_mapping=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_sampling'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    fanout = 3
    deg = get_degrees(g, orig_nids['n3'], 'n3')
    empty_nids = F.nonzero_1d(deg == 0)
    block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1,
                                                  fanout,
                                                  nodes={'n3': empty_nids})
    print("Done sampling")
    for p in pserver_list:
        p.join()

    assert block.number_of_edges() == 0
    assert len(block.etypes) == len(g.etypes)
def check_rpc_hetero_find_edges_shuffle(tmpdir, num_server):
    """Check distributed ``find_edges`` for relation ``'r1'`` of a heterograph.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    a random heterograph with ``reshuffle=True`` and ``return_mapping=True``,
    spawns ``num_server`` servers, queries 100 random ``'r1'`` edge IDs
    through a client, and compares the endpoints (mapped back through the
    returned ID mappings) with ``g.find_edges`` on the original graph.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = create_random_hetero()
    num_parts = num_server

    orig_nid, orig_eid = partition_graph(g, 'test_find_edges', num_parts,
                                         tmpdir, num_hops=1,
                                         part_method='metis',
                                         reshuffle=True,
                                         return_mapping=True)

    # NOTE(review): the server processes are started but never joined in this
    # check - confirm that is intended.
    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_find_edges',
                              ['csr', 'coo']))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    eids = F.tensor(np.random.randint(g.number_of_edges('r1'), size=100))
    u, v = g.find_edges(orig_eid['r1'][eids], etype='r1')
    du, dv = start_find_edges_client(0, tmpdir, num_server > 1, eids,
                                     etype='r1')
    du = orig_nid['n1'][du]
    dv = orig_nid['n2'][dv]
    assert F.array_equal(u, du)
    assert F.array_equal(v, dv)
def check_hetero_partition_single_etype(num_trainers):
    """Partition a single-relation heterograph and check the ID mappings.

    A random ``('user', 'like', 'item')`` graph with 3000 edges is split into
    two partitions inside a temporary directory. The returned node and edge
    mappings must have one entry per type, each as long as the number of
    nodes or edges of that type.

    Parameters
    ----------
    num_trainers
        Forwarded to ``partition_graph`` as ``num_trainers_per_machine``.
    """
    num_edges = 3 * 1000
    # Draw sources before destinations.
    src_ids = np.random.choice(np.arange(1000), size=num_edges)
    dst_ids = np.random.choice(np.arange(2000), size=num_edges)
    hg = dgl.heterograph({('user', 'like', 'item'): (src_ids, dst_ids)})

    with tempfile.TemporaryDirectory() as test_dir:
        orig_nids, orig_eids = partition_graph(
            hg, 'test', 2, test_dir,
            num_trainers_per_machine=num_trainers,
            return_mapping=True)

        assert len(orig_nids) == len(hg.ntypes)
        assert len(orig_eids) == len(hg.etypes)
        for ntype in hg.ntypes:
            assert len(orig_nids[ntype]) == hg.number_of_nodes(ntype)
        for etype in hg.etypes:
            assert len(orig_eids[etype]) == hg.number_of_edges(etype)
def check_neg_dataloader(g, tmpdir, num_server, num_workers):
    """Run the negative-sampling dataloader against spawned servers.

    Parameters
    ----------
    g
        Graph to partition; its ``ntypes``/``etypes`` are read when the
        returned ID mappings are not already dictionaries.
    tmpdir
        Directory handed to ``partition_graph`` and to the spawned processes.
    num_server : int
        Number of server processes; also used as the number of partitions.
    num_workers : int
        Exported as ``DGL_NUM_SAMPLER`` and passed to the trainer;
        ``num_workers + 1`` is passed to each server.
    """
    generate_ip_config("mp_ip_config.txt", num_server, num_server)

    orig_nid, orig_eid = partition_graph(g, 'test_sampling', num_server,
                                         tmpdir, num_hops=1,
                                         part_method='metis',
                                         reshuffle=True,
                                         return_mapping=True)
    # Normalize the mappings to dictionaries keyed by the first type name.
    if not isinstance(orig_nid, dict):
        orig_nid = {g.ntypes[0]: orig_nid}
    if not isinstance(orig_eid, dict):
        orig_eid = {g.etypes[0]: orig_eid}

    mp_ctx = mp.get_context('spawn')
    servers = []
    for server_id in range(num_server):
        server = mp_ctx.Process(
            target=start_server,
            args=(server_id, tmpdir, num_server > 1, num_workers + 1))
        server.start()
        time.sleep(1)
        servers.append(server)

    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)

    trainer = mp_ctx.Process(
        target=start_dist_neg_dataloader,
        args=(0, tmpdir, num_server, num_workers, orig_nid, g))
    trainer.start()
    trainers = [trainer]

    for server in servers:
        server.join()
    for trainer in trainers:
        trainer.join()
def test_standalone(tmpdir):
    """Run the distributed dataloader in standalone mode on cora.

    Any exception raised by ``start_dist_dataloader`` is printed rather than
    propagated.

    Parameters
    ----------
    tmpdir
        Directory handed to ``partition_graph`` and ``start_dist_dataloader``.
    """
    reset_envs()
    generate_ip_config("mp_ip_config.txt", 1, 1)

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)

    mappings = partition_graph(g, 'test_sampling', 1, tmpdir,
                               num_hops=1, part_method='metis',
                               reshuffle=True, return_mapping=True)
    orig_nid, orig_eid = mappings

    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True, orig_nid, orig_eid)
    except Exception as e:
        print(e)
def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server):
    """Check per-edge-type RPC sampling on a reshuffled random heterograph.

    Writes one address line per server to ``rpc_ip_config.txt``, partitions
    the graph with ``reshuffle=True``, spawns ``num_server`` servers and
    samples a block through a client. The block must hold 18 edges for each
    of the two relations into ``'n3'``, and every sampled edge, once mapped
    back to original IDs, must have the endpoints the original graph reports.

    Parameters
    ----------
    tmpdir
        Directory receiving the partition files. It is combined with a file
        name through the ``/`` operator, so it must support that operator.
    num_server : int
        Number of server processes; also used as the number of partitions.
    """
    # The context manager closes the file even if writing an address fails.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = create_random_hetero(dense=True)
    num_parts = num_server
    num_hops = 1

    partition_graph(g, 'test_sampling', num_parts, tmpdir,
                    num_hops=num_hops, part_method='metis', reshuffle=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_sampling'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    time.sleep(3)
    fanout = 3
    block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1,
                                                  fanout)
    print("Done sampling")
    for p in pserver_list:
        p.join()

    src, dst = block.edges(etype=('n1', 'r2', 'n3'))
    assert len(src) == 18
    src, dst = block.edges(etype=('n2', 'r3', 'n3'))
    assert len(src) == 18

    # Per-type lookup tables from shuffled IDs back to the original IDs.
    orig_nid_map = {
        ntype: F.zeros((g.number_of_nodes(ntype), ), dtype=F.int64)
        for ntype in g.ntypes
    }
    orig_eid_map = {
        etype: F.zeros((g.number_of_edges(etype), ), dtype=F.int64)
        for etype in g.etypes
    }
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json',
                                                i)
        ntype_ids, type_nids = gpb.map_to_per_ntype(part.ndata[dgl.NID])
        for ntype_id, ntype in enumerate(g.ntypes):
            idx = ntype_ids == ntype_id
            F.scatter_row_inplace(orig_nid_map[ntype],
                                  F.boolean_mask(type_nids, idx),
                                  F.boolean_mask(part.ndata['orig_id'], idx))
        etype_ids, type_eids = gpb.map_to_per_etype(part.edata[dgl.EID])
        for etype_id, etype in enumerate(g.etypes):
            idx = etype_ids == etype_id
            F.scatter_row_inplace(orig_eid_map[etype],
                                  F.boolean_mask(type_eids, idx),
                                  F.boolean_mask(part.edata['orig_id'], idx))

    for src_type, etype, dst_type in block.canonical_etypes:
        src, dst = block.edges(etype=etype)
        # These are global Ids after shuffling.
        shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID],
                                    src)
        shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID],
                                    dst)
        shuffled_eid = block.edges[etype].data[dgl.EID]

        orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type],
                                          shuffled_src))
        orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type],
                                          shuffled_dst))
        orig_eid = F.asnumpy(F.gather_row(orig_eid_map[etype], shuffled_eid))

        # Check the node Ids and edge Ids.
        orig_src1, orig_dst1 = g.find_edges(orig_eid, etype=etype)
        assert np.all(F.asnumpy(orig_src1) == orig_src)
        assert np.all(F.asnumpy(orig_dst1) == orig_dst)
def test_split_even():
    """Check that force_even node/edge splits cover the masked IDs exactly.

    A random graph is split into four METIS partitions. With ``num_parts``
    simulated clients, and again with twice as many, the concatenation of all
    per-rank splits must equal the full list of masked node and edge IDs.
    """
    g = create_random_graph(10000)
    num_parts = 4
    partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph',
                    num_hops=2, part_method='metis')

    # Draw the node mask first, then the edge mask.
    node_mask = np.random.randint(0, 100, size=g.number_of_nodes()) > 30
    edge_mask = np.random.randint(0, 100, size=g.number_of_edges()) > 30
    selected_nodes = np.nonzero(node_mask)[0]
    selected_edges = np.nonzero(edge_mask)[0]
    all_nodes1, all_nodes2 = [], []
    all_edges1, all_edges2 = [], []

    # The code now collects the roles of all client processes and use the information
    # to determine how to split the workloads. Here is to simulate the multi-client
    # use case.
    def set_roles(num_clients):
        role = dgl.distributed.role
        role.CUR_ROLE = 'default'
        role.GLOBAL_RANK = {rank: rank for rank in range(num_clients)}
        role.PER_ROLE_RANK['default'] = {
            rank: rank for rank in range(num_clients)
        }

    for part_id in range(num_parts):
        # Nodes, one client per partition.
        set_roles(num_parts)
        part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition(
            '/tmp/dist_graph/dist_graph_test.json', part_id)
        inner_idx = F.nonzero_1d(part_g.ndata['inner_node'])
        local_nids = F.gather_row(part_g.ndata[dgl.NID], inner_idx)
        nodes = node_split(node_mask, gpb, rank=part_id, force_even=True)
        all_nodes1.append(nodes)
        subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(local_nids))
        print('part {} get {} nodes and {} are in the partition'.format(
            part_id, len(nodes), len(subset)))

        # Nodes, two clients per partition.
        set_roles(num_parts * 2)
        nodes1 = node_split(node_mask, gpb, rank=part_id * 2,
                            force_even=True)
        nodes2 = node_split(node_mask, gpb, rank=part_id * 2 + 1,
                            force_even=True)
        nodes3, _ = F.sort_1d(F.cat([nodes1, nodes2], 0))
        all_nodes2.append(nodes3)
        subset = np.intersect1d(F.asnumpy(nodes), F.asnumpy(nodes3))
        print('intersection has', len(subset))

        # Edges, one client per partition.
        set_roles(num_parts)
        inner_idx = F.nonzero_1d(part_g.edata['inner_edge'])
        local_eids = F.gather_row(part_g.edata[dgl.EID], inner_idx)
        edges = edge_split(edge_mask, gpb, rank=part_id, force_even=True)
        all_edges1.append(edges)
        subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(local_eids))
        print('part {} get {} edges and {} are in the partition'.format(
            part_id, len(edges), len(subset)))

        # Edges, two clients per partition.
        set_roles(num_parts * 2)
        edges1 = edge_split(edge_mask, gpb, rank=part_id * 2,
                            force_even=True)
        edges2 = edge_split(edge_mask, gpb, rank=part_id * 2 + 1,
                            force_even=True)
        edges3, _ = F.sort_1d(F.cat([edges1, edges2], 0))
        all_edges2.append(edges3)
        subset = np.intersect1d(F.asnumpy(edges), F.asnumpy(edges3))
        print('intersection has', len(subset))

    all_nodes1 = F.cat(all_nodes1, 0)
    all_edges1 = F.cat(all_edges1, 0)
    all_nodes2 = F.cat(all_nodes2, 0)
    all_edges2 = F.cat(all_edges2, 0)
    all_nodes = np.nonzero(node_mask)[0]
    all_edges = np.nonzero(edge_mask)[0]
    assert np.all(all_nodes == F.asnumpy(all_nodes1))
    assert np.all(all_edges == F.asnumpy(all_edges1))
    assert np.all(all_nodes == F.asnumpy(all_nodes2))
    assert np.all(all_edges == F.asnumpy(all_edges2))
def test_split(hetero):
    """Check non-even node/edge splits on a homogeneous or heterogeneous graph.

    For each of four METIS partitions the IDs returned by ``node_split`` and
    ``edge_split`` must equal the masked IDs local to that partition, both
    with one simulated client per partition and with two whose results are
    concatenated.

    Parameters
    ----------
    hetero : bool
        When true, use a random heterograph and split type ``'n1'``/``'r1'``;
        otherwise use a random homogeneous graph with ``'_N'``/``'_E'``.
    """
    if hetero:
        g = create_random_hetero()
        ntype, etype = 'n1', 'r1'
    else:
        g = create_random_graph(10000)
        ntype, etype = '_N', '_E'
    num_parts = 4
    partition_graph(g, 'dist_graph_test', num_parts, '/tmp/dist_graph',
                    num_hops=2, part_method='metis')

    # Draw the node mask first, then the edge mask.
    node_mask = np.random.randint(0, 100, size=g.number_of_nodes(ntype)) > 30
    edge_mask = np.random.randint(0, 100, size=g.number_of_edges(etype)) > 30
    selected_nodes = np.nonzero(node_mask)[0]
    selected_edges = np.nonzero(edge_mask)[0]

    # The code now collects the roles of all client processes and use the information
    # to determine how to split the workloads. Here is to simulate the multi-client
    # use case.
    def set_roles(num_clients):
        role = dgl.distributed.role
        role.CUR_ROLE = 'default'
        role.GLOBAL_RANK = {rank: rank for rank in range(num_clients)}
        role.PER_ROLE_RANK['default'] = {
            rank: rank for rank in range(num_clients)
        }

    for part_id in range(num_parts):
        # Nodes, one client per partition.
        set_roles(num_parts)
        part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition(
            '/tmp/dist_graph/dist_graph_test.json', part_id)
        inner_idx = F.nonzero_1d(part_g.ndata['inner_node'])
        local_nids = F.gather_row(part_g.ndata[dgl.NID], inner_idx)
        if hetero:
            # Keep only the per-type IDs whose node type ID is 0.
            ntype_ids, nids = gpb.map_to_per_ntype(local_nids)
            local_nids = F.asnumpy(nids)[F.asnumpy(ntype_ids) == 0]
        else:
            local_nids = F.asnumpy(local_nids)
        nodes1 = np.intersect1d(selected_nodes, local_nids)
        nodes2 = node_split(node_mask, gpb, ntype=ntype, rank=part_id,
                            force_even=False)
        assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes2)))
        for n in F.asnumpy(nodes2):
            assert n in local_nids

        # Nodes, two clients per partition.
        set_roles(num_parts * 2)
        nodes3 = node_split(node_mask, gpb, ntype=ntype, rank=part_id * 2,
                            force_even=False)
        nodes4 = node_split(node_mask, gpb, ntype=ntype,
                            rank=part_id * 2 + 1, force_even=False)
        nodes5 = F.cat([nodes3, nodes4], 0)
        assert np.all(np.sort(nodes1) == np.sort(F.asnumpy(nodes5)))

        # Edges, one client per partition.
        set_roles(num_parts)
        inner_idx = F.nonzero_1d(part_g.edata['inner_edge'])
        local_eids = F.gather_row(part_g.edata[dgl.EID], inner_idx)
        if hetero:
            # Keep only the per-type IDs whose edge type ID is 0.
            etype_ids, eids = gpb.map_to_per_etype(local_eids)
            local_eids = F.asnumpy(eids)[F.asnumpy(etype_ids) == 0]
        else:
            local_eids = F.asnumpy(local_eids)
        edges1 = np.intersect1d(selected_edges, local_eids)
        edges2 = edge_split(edge_mask, gpb, etype=etype, rank=part_id,
                            force_even=False)
        assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges2)))
        for e in F.asnumpy(edges2):
            assert e in local_eids

        # Edges, two clients per partition.
        set_roles(num_parts * 2)
        edges3 = edge_split(edge_mask, gpb, etype=etype, rank=part_id * 2,
                            force_even=False)
        edges4 = edge_split(edge_mask, gpb, etype=etype,
                            rank=part_id * 2 + 1, force_even=False)
        edges5 = F.cat([edges3, edges4], 0)
        assert np.all(np.sort(edges1) == np.sort(F.asnumpy(edges5)))
def check_server_client_hierarchy(shared_mem, num_servers, num_clients):
    """Check that the clients' returned node/edge sets cover the masked IDs.

    A random tenth of the nodes and edges is marked in integer masks. Each
    client stores a pair of tensors in a shared manager dictionary; their
    sorted concatenation must equal the sorted selected node and edge IDs.

    Parameters
    ----------
    shared_mem
        Forwarded to every ``run_server`` process.
    num_servers : int
        Number of server processes to spawn.
    num_clients : int
        Number of client processes to spawn; also passed to
        ``partition_graph`` as ``num_trainers_per_machine``.
    """
    prepare_dist(num_servers)
    g = create_random_graph(10000)

    # Partition the graph
    graph_name = 'dist_graph_test_2'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, 1, '/tmp/dist_graph',
                    num_trainers_per_machine=num_clients)

    # let's just test on one partition for now.
    # We cannot run multiple servers and clients on the same machine.
    mp_ctx = mp.get_context('spawn')
    servers = []
    for serv_id in range(num_servers):
        server = mp_ctx.Process(
            target=run_server,
            args=(graph_name, serv_id, num_servers, num_clients, shared_mem))
        servers.append(server)
        server.start()

    manager = mp.Manager()
    return_dict = manager.dict()
    node_mask = np.zeros((g.number_of_nodes(), ), np.int32)
    edge_mask = np.zeros((g.number_of_edges(), ), np.int32)
    # Select nodes first, then edges.
    nodes = np.random.choice(g.number_of_nodes(),
                             g.number_of_nodes() // 10, replace=False)
    edges = np.random.choice(g.number_of_edges(),
                             g.number_of_edges() // 10, replace=False)
    node_mask[nodes] = 1
    edge_mask[edges] = 1
    nodes = np.sort(nodes)
    edges = np.sort(edges)

    clients = []
    for cli_id in range(num_clients):
        print('start client', cli_id)
        client = mp_ctx.Process(
            target=run_client_hierarchy,
            args=(graph_name, 0, num_servers, node_mask, edge_mask,
                  return_dict))
        client.start()
        clients.append(client)

    for client in clients:
        client.join()
    for server in servers:
        server.join()

    # Merge the (nodes, edges) pairs reported by the clients.
    nodes1 = []
    edges1 = []
    for client_nodes, client_edges in return_dict.values():
        nodes1.append(client_nodes)
        edges1.append(client_edges)
    nodes1, _ = F.sort_1d(F.cat(nodes1, 0))
    edges1, _ = F.sort_1d(F.cat(edges1, 0))
    assert np.all(F.asnumpy(nodes1) == nodes)
    assert np.all(F.asnumpy(edges1) == edges)

    print('clients have terminated')