Example #1
def test_kv_multi_role():
    reset_envs()
    num_servers = 2
    num_trainers = 2
    num_samplers = 2
    generate_ip_config("kv_ip_mul_config.txt", 1, num_servers)
    # There are two trainer processes and each trainer process has two sampler processes.
    num_clients = num_trainers * (1 + num_samplers)
    ctx = mp.get_context('spawn')
    pserver_list = []
    pclient_list = []
    os.environ['DGL_NUM_SAMPLER'] = str(num_samplers)
    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server_mul_role, args=(i, num_clients, num_servers))
        pserver.start()
        pserver_list.append(pserver)
    for i in range(num_trainers):
        pclient = ctx.Process(target=start_client_mul_role, args=(i,))
        pclient.start()
        pclient_list.append(pclient)
    for i in range(num_trainers):
        pclient_list[i].join()
    for i in range(num_servers):
        pserver_list[i].join()
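Most of these tests call a generate_ip_config helper to write the IP config file, while several examples (e.g., #4 and #13) write it by hand with get_local_usable_addr(). A minimal sketch of what such a helper might look like, assuming get_local_usable_addr() returns an 'ip:port' string as in those examples; the real test utility may differ:

def generate_ip_config(file_name, num_machines, num_servers):
    # Hypothetical reconstruction of the test helper: one 'ip:port'
    # line per machine. num_servers is accepted for signature parity;
    # the actual helper may use it to reserve consecutive ports.
    with open(file_name, 'w') as f:
        for _ in range(num_machines):
            f.write('{}\n'.format(get_local_usable_addr()))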
Example #2
def test_standalone(tmpdir):
    reset_envs()
    generate_ip_config("mp_ip_config.txt", 1, 1)

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = 1
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g,
                                         'test_sampling',
                                         num_parts,
                                         tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=True,
                                         return_mapping=True)

    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True, orig_nid, orig_eid)
    except Exception as e:
        print(e)
    dgl.distributed.exit_client()  # needed since there are two tests in this process
Example #3
def test_standalone_sampling():
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'standalone'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_standalone_sampling(Path(tmpdirname), False)
        check_standalone_sampling(Path(tmpdirname), True)
Example #4
def test_standalone(tmpdir):
    reset_envs()
    ip_config = open("mp_ip_config.txt", "w")
    for _ in range(1):
        ip_config.write('{}\n'.format(get_local_usable_addr()))
    ip_config.close()

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = 1
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g,
                                         'test_sampling',
                                         num_parts,
                                         tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=True,
                                         return_mapping=True)

    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
        start_dist_dataloader(0, tmpdir, 1, True, orig_nid, orig_eid)
    except Exception as e:
        print(e)
    dgl.distributed.exit_client()  # needed since there are two tests in this process
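get_local_usable_addr() above is a test utility that returns a local 'ip:port' string with a currently free port. A plausible implementation, sketched here as an assumption rather than the actual DGL helper, discovers the outbound IP with the standard UDP-connect trick and lets the OS pick a free port by binding to port 0:

import socket

def get_local_usable_addr():
    # Hypothetical sketch; the real test utility may differ.
    # "Connecting" a UDP socket sends no packets but reveals the
    # local IP the OS would use for outbound traffic.
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        sock.connect(('10.255.255.255', 1))
        ip_addr = sock.getsockname()[0]
    except OSError:
        ip_addr = '127.0.0.1'
    finally:
        sock.close()
    # Binding to port 0 asks the OS for any currently free TCP port.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(('', 0))
    port = sock.getsockname()[1]
    sock.close()
    return '{}:{}'.format(ip_addr, port)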
Example #5
def test_multi_client_groups():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client_groups.txt"
    num_machines = 5
    # A larger number of servers would be a better test, but ports may already be in use.
    num_servers = 1
    generate_ip_config(ip_config, num_machines, num_servers)
    # pressure test
    num_clients = 2
    num_groups = 2
    ctx = mp.get_context('spawn')
    pserver_list = []
    for i in range(num_servers * num_machines):
        pserver = ctx.Process(target=start_server,
                              args=(num_clients, ip_config, i, True,
                                    num_servers))
        pserver.start()
        pserver_list.append(pserver)
    pclient_list = []
    for i in range(num_clients):
        for group_id in range(num_groups):
            pclient = ctx.Process(target=start_client,
                                  args=(ip_config, group_id, num_servers))
            pclient.start()
            pclient_list.append(pclient)
    for p in pclient_list:
        p.join()
    for p in pserver_list:
        assert p.is_alive()
    # forcibly shut down the servers
    dgl.distributed.shutdown_servers(ip_config, num_servers)
    for p in pserver_list:
        p.join()
Example #6
def test_multi_client_connect(net_type):
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client.txt"
    generate_ip_config(ip_config, 1, 1)
    ctx = mp.get_context('spawn')
    num_clients = 1
    pserver = ctx.Process(target=start_server,
                          args=(num_clients, ip_config, 0, False, 1, net_type))

    # small max try times
    os.environ['DGL_DIST_MAX_TRY_TIMES'] = '1'
    expect_except = False
    try:
        start_client(ip_config, 0, 1, net_type)
    except dgl.distributed.DistConnectError as err:
        print("Expected error: {}".format(err))
        expect_except = True
    assert expect_except

    # large max try times
    os.environ['DGL_DIST_MAX_TRY_TIMES'] = '1024'
    pclient = ctx.Process(target=start_client,
                          args=(ip_config, 0, 1, net_type))
    pclient.start()
    pserver.start()
    pclient.join()
    pserver.join()
    reset_envs()
Example #7
def test_rpc_find_edges_shuffle(num_server):
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_rpc_hetero_find_edges_shuffle(Path(tmpdirname), num_server)
        check_rpc_find_edges_shuffle(Path(tmpdirname), num_server)
Example #8
def test_server_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    check_server_client_hierarchy(False, 1, 4)
    check_server_client_empty(True, 1, 1)
    check_server_client_hetero(True, 1, 1)
    check_server_client_hetero(False, 1, 1)
    check_server_client(True, 1, 1)
    check_server_client(False, 1, 1)
Example #9
def test_dist_emb_server_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    check_dist_emb_server_client(True, 1, 1)
    check_dist_emb_server_client(False, 1, 1)
    check_dist_emb_server_client(True, 2, 2)
    check_dist_emb_server_client(True, 1, 1, 2)
    check_dist_emb_server_client(False, 1, 1, 2)
    check_dist_emb_server_client(True, 2, 2, 2)
Example #10
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle,
                         num_groups):
    reset_envs()
    # Do not create multiple partitions on a single machine when there are
    # multiple client groups; doing so can trigger a race condition.
    if num_groups > 1:
        num_server = 1
    generate_ip_config("mp_ip_config.txt", num_server, num_server)

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g,
                                         'test_sampling',
                                         num_parts,
                                         tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=reshuffle,
                                         return_mapping=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    keep_alive = num_groups > 1
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1,
                              keep_alive))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer_list = []
    num_trainers = 1
    for trainer_id in range(num_trainers):
        for group_id in range(num_groups):
            p = ctx.Process(target=start_dist_dataloader,
                            args=(trainer_id, tmpdir, num_server, drop_last,
                                  orig_nid, orig_eid, group_id))
            p.start()
            time.sleep(1)  # avoid race condition when instantiating DistGraph
            ptrainer_list.append(p)

    for p in ptrainer_list:
        p.join()
    if keep_alive:
        for p in pserver_list:
            assert p.is_alive()
        # forcibly shut down the servers
        dgl.distributed.shutdown_servers("mp_ip_config.txt", 1)
    for p in pserver_list:
        p.join()
Example #11
def test_rpc():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config.txt", 1, 1)
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server, args=(1, "rpc_ip_config.txt"))
    pclient = ctx.Process(target=start_client, args=("rpc_ip_config.txt", ))
    pserver.start()
    pclient.start()
    pserver.join()
    pclient.join()
Example #12
def test_rpc_sampling_shuffle(num_server):
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_rpc_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_sampling_shuffle(Path(tmpdirname), num_server, num_groups=2)
        check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_etype_sampling_empty_shuffle(Path(tmpdirname), num_server)
Example #13
def test_rpc():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = open("rpc_ip_config.txt", "w")
    ip_addr = get_local_usable_addr()
    ip_config.write('%s\n' % ip_addr)
    ip_config.close()
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server, args=(1, "rpc_ip_config.txt"))
    pclient = ctx.Process(target=start_client, args=("rpc_ip_config.txt", ))
    pserver.start()
    pclient.start()
    pserver.join()
    pclient.join()
Example #14
def test_rpc_timeout(net_type):
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config.txt"
    generate_ip_config(ip_config, 1, 1)
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server,
                          args=(1, ip_config, 0, False, 1, net_type))
    pclient = ctx.Process(target=start_client_timeout,
                          args=(ip_config, 0, 1, net_type))
    pserver.start()
    pclient.start()
    pserver.join()
    pclient.join()
Example #15
def test_standalone_node_emb():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'standalone'

    g = create_random_graph(10000)
    # Partition the graph
    num_parts = 1
    graph_name = 'dist_graph_test_3'
    g.ndata['features'] = F.unsqueeze(F.arange(0, g.number_of_nodes()), 1)
    g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
    partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

    dgl.distributed.initialize("kv_ip_config.txt")
    dist_g = DistGraph(graph_name, part_config='/tmp/dist_graph/{}.json'.format(graph_name))
    check_dist_emb(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
    dgl.distributed.exit_client()  # needed since there are two tests in this process
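create_random_graph is another helper not shown on this page. A minimal sketch under the assumption that the test only needs a homogeneous graph with a few edges per node; the real utility may use a different density or construction:

import dgl
import numpy as np
import scipy.sparse as sp

def create_random_graph(n):
    # Hypothetical sketch: a random sparse adjacency matrix with an
    # expected ~10 edges per node, converted to a DGLGraph.
    arr = (sp.random(n, n, density=0.001, format='coo') != 0).astype(np.int64)
    return dgl.from_scipy(arr)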
Example #16
def test_rpc_msg():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    from dgl.distributed.rpc import serialize_to_payload, deserialize_from_payload, RPCMessage
    SERVICE_ID = 32452
    dgl.distributed.register_service(SERVICE_ID, MyRequest, MyResponse)
    req = MyRequest()
    data, tensors = serialize_to_payload(req)
    rpcmsg = RPCMessage(SERVICE_ID, 23, 0, 1, data, tensors)
    assert rpcmsg.service_id == SERVICE_ID
    assert rpcmsg.msg_seq == 23
    assert rpcmsg.client_id == 0
    assert rpcmsg.server_id == 1
    assert len(rpcmsg.data) == len(data)
    assert len(rpcmsg.tensors) == 1
    assert F.array_equal(rpcmsg.tensors[0], req.z)
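MyRequest and MyResponse are defined elsewhere in the test module. Below is a minimal sketch of how such classes could look, assuming DGL's Request/Response base classes are exposed as dgl.distributed.Request and dgl.distributed.Response (register_service is used at that level above) with __getstate__/__setstate__ for serialization and process_request as the server-side handler. The fields x, y, z and the foo callable are inferred from the assertions in Examples #16 and #18; the concrete values and the _check_args helper are placeholders of our own. Note that serialize_to_payload() splits the state into a byte buffer plus a list of tensors, which is why rpcmsg.tensors[0] above equals req.z.

def _check_args(x, y):
    # Module-level so it pickles across serialize/deserialize;
    # Example #18 calls req1.foo(req1.x, req1.y).
    assert x == 123 and y == "abc"

class MyResponse(dgl.distributed.Response):
    def __init__(self, x=100):  # default needed: Example #18 calls MyResponse()
        self.x = x

    def __getstate__(self):
        return self.x

    def __setstate__(self, state):
        self.x = state

class MyRequest(dgl.distributed.Request):
    def __init__(self):
        self.x = 123
        self.y = "abc"
        # The single tensor in the payload; F is the test suite's
        # backend module, as used throughout these examples.
        self.z = F.tensor([[1.0, 2.0], [3.0, 4.0]])
        self.foo = _check_args

    def __getstate__(self):
        return self.x, self.y, self.z, self.foo

    def __setstate__(self, state):
        self.x, self.y, self.z, self.foo = state

    def process_request(self, server_state):
        # Server-side handler: echo a response back to the client.
        return MyResponse(self.x)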
Example #17
def test_rpc_sampling_shuffle(num_server):
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_rpc_sampling_shuffle(Path(tmpdirname), num_server)
        # [TODO][Rhett] Tests for multiple groups may fail sometimes and
        # root cause is unknown. Let's disable them for now.
        #check_rpc_sampling_shuffle(Path(tmpdirname), num_server, num_groups=2)
        check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_hetero_etype_sampling_empty_shuffle(Path(tmpdirname), num_server)
        check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server)
        check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server)
        check_rpc_bipartite_etype_sampling_empty(Path(tmpdirname), num_server)
        check_rpc_bipartite_etype_sampling_shuffle(Path(tmpdirname), num_server)
Example #18
def test_serialize():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    from dgl.distributed.rpc import serialize_to_payload, deserialize_from_payload
    SERVICE_ID = 12345
    dgl.distributed.register_service(SERVICE_ID, MyRequest, MyResponse)
    req = MyRequest()
    data, tensors = serialize_to_payload(req)
    req1 = deserialize_from_payload(MyRequest, data, tensors)
    req1.foo(req1.x, req1.y)
    assert req.x == req1.x
    assert req.y == req1.y
    assert F.array_equal(req.z, req1.z)

    res = MyResponse()
    data, tensors = serialize_to_payload(res)
    res1 = deserialize_from_payload(MyResponse, data, tensors)
    assert res.x == res1.x
Example #19
def test_multi_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    generate_ip_config("rpc_ip_config_mul_client.txt", 1, 1)
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server,
                          args=(10, "rpc_ip_config_mul_client.txt"))
    pclient_list = []
    for i in range(10):
        pclient = ctx.Process(target=start_client,
                              args=("rpc_ip_config_mul_client.txt", ))
        pclient_list.append(pclient)
    pserver.start()
    for i in range(10):
        pclient_list[i].start()
    for i in range(10):
        pclient_list[i].join()
    pserver.join()
Example #20
def test_multi_thread_rpc(net_type):
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    num_servers = 2
    ip_config = "rpc_ip_config_multithread.txt"
    generate_ip_config(ip_config, num_servers, num_servers)
    ctx = mp.get_context('spawn')
    pserver_list = []
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server,
                              args=(1, ip_config, i, False, 1, net_type))
        pserver.start()
        pserver_list.append(pserver)

    def start_client_multithread(ip_config):
        import threading
        dgl.distributed.connect_to_server(ip_config=ip_config,
                                          num_servers=1,
                                          net_type=net_type)
        dgl.distributed.register_service(HELLO_SERVICE_ID, HelloRequest,
                                         HelloResponse)

        req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
        dgl.distributed.send_request(0, req)

        def subthread_call(server_id):
            req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
            dgl.distributed.send_request(server_id, req)

        subthread = threading.Thread(target=subthread_call, args=(1, ))
        subthread.start()
        subthread.join()

        res0 = dgl.distributed.recv_response()
        res1 = dgl.distributed.recv_response()
        # Order is not guaranteed
        assert_array_equal(F.asnumpy(res0.tensor), F.asnumpy(TENSOR))
        assert_array_equal(F.asnumpy(res1.tensor), F.asnumpy(TENSOR))
        dgl.distributed.exit_client()

    start_client_multithread(ip_config)
    for pserver in pserver_list:
        pserver.join()
Example #21
def test_multi_client(net_type):
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = "rpc_ip_config_mul_client.txt"
    generate_ip_config(ip_config, 1, 1)
    ctx = mp.get_context('spawn')
    num_clients = 20
    pserver = ctx.Process(target=start_server,
                          args=(num_clients, ip_config, 0, False, 1, net_type))
    pclient_list = []
    for i in range(num_clients):
        pclient = ctx.Process(target=start_client,
                              args=(ip_config, 0, 1, net_type))
        pclient_list.append(pclient)
    pserver.start()
    for i in range(num_clients):
        pclient_list[i].start()
    for i in range(num_clients):
        pclient_list[i].join()
    pserver.join()
Example #22
def test_multi_thread_rpc():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = open("rpc_ip_config_multithread.txt", "w")
    num_servers = 2
    for _ in range(num_servers):  # 2 servers
        ip_config.write('{}\n'.format(get_local_usable_addr()))
    ip_config.close()
    ctx = mp.get_context('spawn')
    pserver_list = []
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server,
                              args=(1, "rpc_ip_config_multithread.txt", i))
        pserver.start()
        pserver_list.append(pserver)

    def start_client_multithread(ip_config):
        import threading
        dgl.distributed.connect_to_server(ip_config=ip_config, num_servers=1)
        dgl.distributed.register_service(HELLO_SERVICE_ID, HelloRequest,
                                         HelloResponse)

        req = HelloRequest(STR, INTEGER, TENSOR, simple_func)
        dgl.distributed.send_request(0, req)

        def subthread_call(server_id):
            req = HelloRequest(STR, INTEGER, TENSOR + server_id, simple_func)
            dgl.distributed.send_request(server_id, req)

        subthread = threading.Thread(target=subthread_call, args=(1, ))
        subthread.start()
        subthread.join()

        res0 = dgl.distributed.recv_response()
        res1 = dgl.distributed.recv_response()
        assert_array_equal(F.asnumpy(res0.tensor), F.asnumpy(TENSOR))
        assert_array_equal(F.asnumpy(res1.tensor), F.asnumpy(TENSOR + 1))
        dgl.distributed.exit_client()

    start_client_multithread("rpc_ip_config_multithread.txt")
    for pserver in pserver_list:
        pserver.join()
Example #23
def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last,
                         reshuffle):
    reset_envs()
    ip_config = open("mp_ip_config.txt", "w")
    for _ in range(num_server):
        ip_config.write('{}\n'.format(get_local_usable_addr()))
    ip_config.close()

    g = CitationGraphDataset("cora")[0]
    print(g.idtype)
    num_parts = num_server
    num_hops = 1

    orig_nid, orig_eid = partition_graph(g,
                                         'test_sampling',
                                         num_parts,
                                         tmpdir,
                                         num_hops=num_hops,
                                         part_method='metis',
                                         reshuffle=reshuffle,
                                         return_mapping=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, num_workers + 1))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    os.environ['DGL_DIST_MODE'] = 'distributed'
    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer = ctx.Process(target=start_dist_dataloader,
                           args=(0, tmpdir, num_server, drop_last, orig_nid,
                                 orig_eid))
    ptrainer.start()

    for p in pserver_list:
        p.join()
    ptrainer.join()
Example #24
def test_kv_store():
    reset_envs()
    num_servers = 2
    num_clients = 2
    generate_ip_config("kv_ip_config.txt", 1, num_servers)
    ctx = mp.get_context('spawn')
    pserver_list = []
    pclient_list = []
    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server, args=(i, num_clients, num_servers))
        pserver.start()
        pserver_list.append(pserver)
    for i in range(num_clients):
        pclient = ctx.Process(target=start_client, args=(num_clients, num_servers))
        pclient.start()
        pclient_list.append(pclient)
    for i in range(num_clients):
        pclient_list[i].join()
    for i in range(num_servers):
        pserver_list[i].join()
Example #25
def test_multi_client():
    reset_envs()
    os.environ['DGL_DIST_MODE'] = 'distributed'
    ip_config = open("rpc_ip_config_mul_client.txt", "w")
    ip_addr = get_local_usable_addr()
    ip_config.write('%s\n' % ip_addr)
    ip_config.close()
    ctx = mp.get_context('spawn')
    pserver = ctx.Process(target=start_server,
                          args=(10, "rpc_ip_config_mul_client.txt"))
    pclient_list = []
    for i in range(10):
        pclient = ctx.Process(target=start_client,
                              args=("rpc_ip_config_mul_client.txt", ))
        pclient_list.append(pclient)
    pserver.start()
    for i in range(10):
        pclient_list[i].start()
    for i in range(10):
        pclient_list[i].join()
    pserver.join()
Example #26
def test_neg_dataloader(tmpdir, num_server, num_workers):
    reset_envs()
    g = CitationGraphDataset("cora")[0]
    check_neg_dataloader(g, tmpdir, num_server, num_workers)
    g = create_random_hetero()
    check_neg_dataloader(g, tmpdir, num_server, num_workers)
Example #27
def test_rpc_in_subgraph():
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_rpc_in_subgraph_shuffle(Path(tmpdirname), 2)
Example #28
def test_rpc_sampling():
    reset_envs()
    import tempfile
    os.environ['DGL_DIST_MODE'] = 'distributed'
    with tempfile.TemporaryDirectory() as tmpdirname:
        check_rpc_sampling(Path(tmpdirname), 2)