def test_kv_multi_role():
    """Start KV servers plus trainer clients (each owning samplers); wait for all."""
    reset_envs()
    num_servers = 2
    num_trainers = 2
    num_samplers = 2
    generate_ip_config("kv_ip_mul_config.txt", 1, num_servers)
    # Every trainer process brings `num_samplers` sampler processes with it,
    # so the server must expect trainers * (1 + samplers-per-trainer) clients.
    num_clients = num_trainers * (1 + num_samplers)
    spawn_ctx = mp.get_context('spawn')
    os.environ['DGL_NUM_SAMPLER'] = str(num_samplers)
    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    server_procs = []
    for server_id in range(num_servers):
        proc = spawn_ctx.Process(target=start_server_mul_role,
                                 args=(server_id, num_clients, num_servers))
        proc.start()
        server_procs.append(proc)
    client_procs = []
    for trainer_id in range(num_trainers):
        proc = spawn_ctx.Process(target=start_client_mul_role, args=(trainer_id,))
        proc.start()
        client_procs.append(proc)
    # Clients finish first; servers exit once every client has disconnected.
    for proc in client_procs:
        proc.join()
    for proc in server_procs:
        proc.join()
def test_standalone(tmpdir):
    """Run the distributed dataloader in standalone mode on a 1-partition graph."""
    reset_envs()
    generate_ip_config("mp_ip_config.txt", 1, 1)
    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = 1
    num_hops = 1
    orig_nid, orig_eid = partition_graph(
        g, 'test_sampling', num_parts, tmpdir,
        num_hops=num_hops, part_method='metis',
        reshuffle=True, return_mapping=True)
    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True, orig_nid, orig_eid)
    except Exception as e:
        # Best-effort: print the failure so the exit_client call below
        # still runs and tears the client state down.
        print(e)
    # Needed because multiple tests in this file share one process.
    dgl.distributed.exit_client()
def test_standalone_sampling():
    """Check standalone-mode sampling with the flag both off and on."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'standalone'
    with tempfile.TemporaryDirectory() as workdir:
        for flag in (False, True):
            check_standalone_sampling(Path(workdir), flag)
def test_standalone(tmpdir):
    """Run the distributed dataloader in standalone mode on a 1-partition graph.

    Fix: build the IP config through the shared ``generate_ip_config`` helper
    (as the other tests in this file do) instead of hand-writing the file
    with an unmanaged file handle.
    """
    reset_envs()
    generate_ip_config("mp_ip_config.txt", 1, 1)
    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = 1
    num_hops = 1
    orig_nid, orig_eid = partition_graph(
        g, 'test_sampling', num_parts, tmpdir,
        num_hops=num_hops, part_method='metis',
        reshuffle=True, return_mapping=True)
    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True, orig_nid, orig_eid)
    except Exception as e:
        # Best-effort: print so exit_client below still runs.
        print(e)
    # Needed because multiple tests in this file share one process.
    dgl.distributed.exit_client()
def test_multi_client_groups():
    """Multiple client groups against keep-alive servers; force shutdown at the end."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client_groups.txt"
    # should test with larger number but due to possible port in-use issue.
    num_machines = 5
    num_servers = 1
    generate_ip_config(ip_config, num_machines, num_servers)
    # presssue test
    num_clients = 2
    num_groups = 2
    spawn_ctx = mp.get_context('spawn')
    server_procs = []
    for server_id in range(num_servers * num_machines):
        proc = spawn_ctx.Process(
            target=start_server,
            args=(num_clients, ip_config, server_id, True, num_servers))
        proc.start()
        server_procs.append(proc)
    client_procs = []
    for _ in range(num_clients):
        for group_id in range(num_groups):
            proc = spawn_ctx.Process(target=start_client,
                                     args=(ip_config, group_id, num_servers))
            proc.start()
            client_procs.append(proc)
    for proc in client_procs:
        proc.join()
    # Keep-alive servers outlive the clients ...
    for proc in server_procs:
        assert proc.is_alive()
    # ... until we force them down explicitly.
    dgl.distributed.shutdown_servers(ip_config, num_servers)
    for proc in server_procs:
        proc.join()
def test_multi_client_connect(net_type):
    """Client connect retries: fail fast with few tries, succeed with many."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client.txt"
    generate_ip_config(ip_config, 1, 1)
    ctx = mp.get_context('spawn')
    num_clients = 1
    pserver = ctx.Process(target=start_server,
                          args=(num_clients, ip_config, 0, False, 1, net_type))
    # With a tiny retry budget and no server running yet, the connect
    # attempt must raise DistConnectError.
    os.environ['DGL_DIST_MAX_TRY_TIMES'] = '1'
    expect_except = False
    try:
        start_client(ip_config, 0, 1, net_type)
    except dgl.distributed.DistConnectError as err:
        print("Expected error: {}".format(err))
        expect_except = True
    assert expect_except
    # With a large retry budget the client keeps retrying until the
    # server comes up, so the connection succeeds.
    os.environ['DGL_DIST_MAX_TRY_TIMES'] = '1024'
    pclient = ctx.Process(target=start_client, args=(ip_config, 0, 1, net_type))
    pclient.start()
    pserver.start()
    pclient.join()
    pserver.join()
    reset_envs()
def test_rpc_find_edges_shuffle(num_server):
    """Run find_edges checks (hetero first, then homogeneous) over RPC."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as workdir:
        workpath = Path(workdir)
        check_rpc_hetero_find_edges_shuffle(workpath, num_server)
        check_rpc_find_edges_shuffle(workpath, num_server)
def test_server_client():
    """Exercise the server/client checks across shared-mem and graph variants."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    check_server_client_hierarchy(False, 1, 4)
    check_server_client_empty(True, 1, 1)
    check_server_client_hetero(True, 1, 1)
    check_server_client_hetero(False, 1, 1)
    check_server_client(True, 1, 1)
    check_server_client(False, 1, 1)
def test_dist_emb_server_client():
    """Distributed-embedding checks across shared-mem / server / group combos."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    # (shared_mem, num_servers, num_clients[, num_groups]) combinations.
    check_dist_emb_server_client(True, 1, 1)
    check_dist_emb_server_client(False, 1, 1)
    check_dist_emb_server_client(True, 2, 2)
    check_dist_emb_server_client(True, 1, 1, 2)
    check_dist_emb_server_client(False, 1, 1, 2)
    check_dist_emb_server_client(True, 2, 2, 2)
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle, num_groups):
    """Launch graph servers and trainer/group clients for the dist dataloader."""
    reset_envs()
    # No multiple partitions on single machine for
    # multiple client groups in case of race condition.
    if num_groups > 1:
        num_server = 1
    generate_ip_config("mp_ip_config.txt", num_server, num_server)
    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1
    orig_nid, orig_eid = partition_graph(
        g, 'test_sampling', num_parts, tmpdir,
        num_hops=num_hops, part_method='metis',
        reshuffle=reshuffle, return_mapping=True)
    spawn_ctx = mp.get_context('spawn')
    keep_alive = num_groups > 1
    server_procs = []
    for server_id in range(num_server):
        proc = spawn_ctx.Process(
            target=start_server,
            args=(server_id, tmpdir, num_server > 1, num_workers + 1, keep_alive))
        proc.start()
        time.sleep(1)
        server_procs.append(proc)
    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    num_trainers = 1
    trainer_procs = []
    for trainer_id in range(num_trainers):
        for group_id in range(num_groups):
            proc = spawn_ctx.Process(
                target=start_dist_dataloader,
                args=(trainer_id, tmpdir, num_server, drop_last,
                      orig_nid, orig_eid, group_id))
            proc.start()
            time.sleep(1)  # avoid race condition when instantiating DistGraph
            trainer_procs.append(proc)
    for proc in trainer_procs:
        proc.join()
    if keep_alive:
        # Keep-alive servers must survive the clients ...
        for proc in server_procs:
            assert proc.is_alive()
        # ... until shut down explicitly.
        dgl.distributed.shutdown_servers("mp_ip_config.txt", 1)
    for proc in server_procs:
        proc.join()
def test_rpc():
    """One server, one client: basic RPC round-trip."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config.txt", 1, 1)
    spawn_ctx = mp.get_context('spawn')
    server_proc = spawn_ctx.Process(target=start_server, args=(1, "rpc_ip_config.txt"))
    client_proc = spawn_ctx.Process(target=start_client, args=("rpc_ip_config.txt",))
    server_proc.start()
    client_proc.start()
    server_proc.join()
    client_proc.join()
def test_rpc_sampling_shuffle(num_server):
    """Run the suite of RPC sampling-with-shuffle checks in a temp workdir."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as workdir:
        workpath = Path(workdir)
        check_rpc_sampling_shuffle(workpath, num_server)
        check_rpc_sampling_shuffle(workpath, num_server, num_groups=2)
        check_rpc_hetero_sampling_shuffle(workpath, num_server)
        check_rpc_hetero_sampling_empty_shuffle(workpath, num_server)
        check_rpc_hetero_etype_sampling_shuffle(workpath, num_server)
        check_rpc_hetero_etype_sampling_empty_shuffle(workpath, num_server)
def test_rpc():
    """One server, one client: basic RPC round-trip.

    Fix: build the IP config through the shared ``generate_ip_config`` helper
    (as the other tests in this file do) instead of hand-writing the file
    with an unmanaged file handle.
    """
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config.txt", 1, 1)
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server, args=(1, "rpc_ip_config.txt"))
    pclient = ctx.Process(target=start_client, args=("rpc_ip_config.txt",))
    pserver.start()
    pclient.start()
    pserver.join()
    pclient.join()
def test_rpc_timeout(net_type):
    """Client that exercises the RPC timeout path against one server."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config.txt"
    generate_ip_config(ip_config, 1, 1)
    spawn_ctx = mp.get_context('spawn')
    server_proc = spawn_ctx.Process(target=start_server,
                                    args=(1, ip_config, 0, False, 1, net_type))
    client_proc = spawn_ctx.Process(target=start_client_timeout,
                                    args=(ip_config, 0, 1, net_type))
    server_proc.start()
    client_proc.start()
    server_proc.join()
    client_proc.join()
def test_standalone_node_emb():
    """Distributed node embedding on a single partition in standalone mode."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'standalone'
    g = create_random_graph(10000)
    # Partition the graph into a single part under /tmp/dist_graph.
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')
    dgl.distributed.initialize("kv_ip_config.txt")
    dist_g = DistGraph(
        graph_name, part_config='/tmp/dist_graph/{}.json'.format(graph_name))
    check_dist_emb(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
    # Needed because multiple tests in this file share one process.
    dgl.distributed.exit_client()
def test_rpc_msg():
    """RPCMessage fields round-trip the values they were constructed with."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    from dgl.distributed.rpc import serialize_to_payload, deserialize_from_payload, RPCMessage
    SERVICE_ID = 32452
    dgl.distributed.register_service(SERVICE_ID, MyRequest, MyResponse)
    req = MyRequest()
    data, tensors = serialize_to_payload(req)
    rpcmsg = RPCMessage(SERVICE_ID, 23, 0, 1, data, tensors)
    # Every constructor argument must be readable back unchanged.
    assert rpcmsg.service_id == SERVICE_ID
    assert rpcmsg.msg_seq == 23
    assert rpcmsg.client_id == 0
    assert rpcmsg.server_id == 1
    assert len(rpcmsg.data) == len(data)
    assert len(rpcmsg.tensors) == 1
    assert F.array_equal(rpcmsg.tensors[0], req.z)
def test_rpc_sampling_shuffle(num_server):
    """Run the RPC sampling-with-shuffle checks, including bipartite variants."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as workdir:
        workpath = Path(workdir)
        check_rpc_sampling_shuffle(workpath, num_server)
        # [TODO][Rhett] Tests for multiple groups may fail sometimes and
        # root cause is unknown. Let's disable them for now.
        #check_rpc_sampling_shuffle(Path(tmpdirname), num_server, num_groups=2)
        check_rpc_hetero_sampling_shuffle(workpath, num_server)
        check_rpc_hetero_sampling_empty_shuffle(workpath, num_server)
        check_rpc_hetero_etype_sampling_shuffle(workpath, num_server)
        check_rpc_hetero_etype_sampling_empty_shuffle(workpath, num_server)
        check_rpc_bipartite_sampling_empty(workpath, num_server)
        check_rpc_bipartite_sampling_shuffle(workpath, num_server)
        check_rpc_bipartite_etype_sampling_empty(workpath, num_server)
        check_rpc_bipartite_etype_sampling_shuffle(workpath, num_server)
def test_serialize():
    """Request and response objects survive a serialize/deserialize round-trip."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    from dgl.distributed.rpc import serialize_to_payload, deserialize_from_payload
    SERVICE_ID = 12345
    dgl.distributed.register_service(SERVICE_ID, MyRequest, MyResponse)
    # Round-trip a request and verify its payload fields are preserved.
    req = MyRequest()
    data, tensors = serialize_to_payload(req)
    req1 = deserialize_from_payload(MyRequest, data, tensors)
    req1.foo(req1.x, req1.y)
    assert req.x == req1.x
    assert req.y == req1.y
    assert F.array_equal(req.z, req1.z)
    # Same for a response object.
    res = MyResponse()
    data, tensors = serialize_to_payload(res)
    res1 = deserialize_from_payload(MyResponse, data, tensors)
    assert res.x == res1.x
def test_multi_client():
    """Ten clients connect to a single server expecting ten of them."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config_mul_client.txt", 1, 1)
    spawn_ctx = mp.get_context('spawn')
    server_proc = spawn_ctx.Process(target=start_server,
                                    args=(10, "rpc_ip_config_mul_client.txt"))
    client_procs = [
        spawn_ctx.Process(target=start_client, args=("rpc_ip_config_mul_client.txt",))
        for _ in range(10)
    ]
    server_proc.start()
    for proc in client_procs:
        proc.start()
    for proc in client_procs:
        proc.join()
    server_proc.join()
def test_multi_thread_rpc(net_type):
    """One client sends RPC requests from its main thread and a subthread
    to two different servers.

    Fix: the original ended with ``pserver.join()``, which only joined the
    *last* server spawned (the leftover loop variable); now every process in
    ``pserver_list`` is joined so no server is left running/zombied.
    """
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    num_servers = 2
    ip_config = "rpc_ip_config_multithread.txt"
    generate_ip_config(ip_config, num_servers, num_servers)
    ctx = mp.get_context('spawn')
    pserver_list = []
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server,
                              args=(1, ip_config, i, False, 1, net_type))
        pserver.start()
        pserver_list.append(pserver)

    def start_client_multithread(ip_config):
        import threading
        dgl.distributed.connect_to_server(
            ip_config=ip_config, num_servers=1, net_type=net_type)
        dgl.distributed.register_service(
            HELLO_SERVICE_ID, HelloRequest, HelloResponse)

        # Request to server 0 from the main thread.
        req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
        dgl.distributed.send_request(0, req)

        def subthread_call(server_id):
            # Request to server 1 from a subthread.
            req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
            dgl.distributed.send_request(server_id, req)

        subthread = threading.Thread(target=subthread_call, args=(1,))
        subthread.start()
        subthread.join()

        res0 = dgl.distributed.recv_response()
        res1 = dgl.distributed.recv_response()
        # Order is not guaranteed
        assert_array_equal(F.asnumpy(res0.tensor), F.asnumpy(TENSOR))
        assert_array_equal(F.asnumpy(res1.tensor), F.asnumpy(TENSOR))
        dgl.distributed.exit_client()

    start_client_multithread(ip_config)
    # Join every server process, not just the last one spawned.
    for pserver in pserver_list:
        pserver.join()
def test_multi_client(net_type):
    """Twenty clients connect to one server over the given transport."""
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client.txt"
    generate_ip_config(ip_config, 1, 1)
    spawn_ctx = mp.get_context('spawn')
    num_clients = 20
    server_proc = spawn_ctx.Process(
        target=start_server,
        args=(num_clients, ip_config, 0, False, 1, net_type))
    client_procs = [
        spawn_ctx.Process(target=start_client, args=(ip_config, 0, 1, net_type))
        for _ in range(num_clients)
    ]
    server_proc.start()
    for proc in client_procs:
        proc.start()
    for proc in client_procs:
        proc.join()
    server_proc.join()
def test_multi_thread_rpc():
    """One client sends RPC requests from its main thread and a subthread
    to two different servers.

    Fixes: (1) the original ended with ``pserver.join()``, which only joined
    the *last* server spawned (the leftover loop variable) — now all servers
    in ``pserver_list`` are joined; (2) the stale "# 3 servers" comment
    contradicted ``num_servers = 2``; (3) the IP-config file handle is now
    context-managed.
    """
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    num_servers = 2
    # One address line per server.
    with open("rpc_ip_config_multithread.txt", "w") as ip_config:
        for _ in range(num_servers):
            ip_config.write('{}\n'.format(get_local_usable_addr()))
    ctx = mp.get_context('spawn')
    pserver_list = []
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server,
                              args=(1, "rpc_ip_config_multithread.txt", i))
        pserver.start()
        pserver_list.append(pserver)

    def start_client_multithread(ip_config):
        import threading
        dgl.distributed.connect_to_server(ip_config=ip_config, num_servers=1)
        dgl.distributed.register_service(
            HELLO_SERVICE_ID, HelloRequest, HelloResponse)

        # Request to server 0 from the main thread (payload TENSOR + 0).
        req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
        dgl.distributed.send_request(0, req)

        def subthread_call(server_id):
            # Request to server `server_id` carrying TENSOR + server_id.
            req = HelloRequest(STR, INTEGER, TENSOR + server_id, simple_func)
            dgl.distributed.send_request(server_id, req)

        subthread = threading.Thread(target=subthread_call, args=(1,))
        subthread.start()
        subthread.join()

        res0 = dgl.distributed.recv_response()
        res1 = dgl.distributed.recv_response()
        assert_array_equal(F.asnumpy(res0.tensor), F.asnumpy(TENSOR))
        assert_array_equal(F.asnumpy(res1.tensor), F.asnumpy(TENSOR + 1))
        dgl.distributed.exit_client()

    start_client_multithread("rpc_ip_config_multithread.txt")
    # Join every server process, not just the last one spawned.
    for pserver in pserver_list:
        pserver.join()
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle):
    """Launch `num_server` graph servers plus one trainer running the
    distributed dataloader.

    Fix: build the IP config through the shared ``generate_ip_config`` helper
    (as the other tests in this file do) instead of hand-writing the file
    with an unmanaged file handle.
    """
    reset_envs()
    generate_ip_config("mp_ip_config.txt", num_server, num_server)
    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1
    orig_nid, orig_eid = partition_graph(
        g, 'test_sampling', num_parts, tmpdir,
        num_hops=num_hops, part_method='metis',
        reshuffle=reshuffle, return_mapping=True)
    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1))
        p.start()
        time.sleep(1)
        pserver_list.append(p)
    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer = ctx.Process(target=start_dist_dataloader,
                           args=(0, tmpdir, num_server, drop_last,
                                 orig_nid, orig_eid))
    ptrainer.start()
    # Servers exit once the trainer (their only client) disconnects.
    for p in pserver_list:
        p.join()
    ptrainer.join()
def test_kv_store():
    """Two KV servers serving two clients; wait for clients, then servers."""
    reset_envs()
    num_servers = 2
    num_clients = 2
    generate_ip_config("kv_ip_config.txt", 1, num_servers)
    spawn_ctx = mp.get_context('spawn')
    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    server_procs = []
    for server_id in range(num_servers):
        proc = spawn_ctx.Process(target=start_server,
                                 args=(server_id, num_clients, num_servers))
        proc.start()
        server_procs.append(proc)
    client_procs = []
    for _ in range(num_clients):
        proc = spawn_ctx.Process(target=start_client,
                                 args=(num_clients, num_servers))
        proc.start()
        client_procs.append(proc)
    # Clients finish first; servers exit once every client has disconnected.
    for proc in client_procs:
        proc.join()
    for proc in server_procs:
        proc.join()
def test_multi_client():
    """Ten clients connect to a single server expecting ten of them.

    Fix: build the IP config through the shared ``generate_ip_config`` helper
    (as the other tests in this file do) instead of hand-writing the file
    with an unmanaged file handle.
    """
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config_mul_client.txt", 1, 1)
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server,
                          args=(10, "rpc_ip_config_mul_client.txt"))
    pclient_list = []
    for i in range(10):
        pclient = ctx.Process(target=start_client,
                              args=("rpc_ip_config_mul_client.txt",))
        pclient_list.append(pclient)
    pserver.start()
    for pclient in pclient_list:
        pclient.start()
    for pclient in pclient_list:
        pclient.join()
    pserver.join()
def test_neg_dataloader(tmpdir, num_server, num_workers):
    """Negative-sampling dataloader on a homogeneous then a hetero graph."""
    reset_envs()
    homo_g = CitationGraphDataset("cora")[0]
    check_neg_dataloader(homo_g, tmpdir, num_server, num_workers)
    hetero_g = create_random_hetero()
    check_neg_dataloader(hetero_g, tmpdir, num_server, num_workers)
def test_rpc_in_subgraph():
    """in_subgraph over RPC with shuffled node IDs, 2 servers."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as workdir:
        check_rpc_in_subgraph_shuffle(Path(workdir), 2)
def test_rpc_sampling():
    """Basic sampling over RPC with 2 servers."""
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as workdir:
        check_rpc_sampling(Path(workdir), 2)