Example #1
def _test_initialize_ucx_infiniband():
    kwargs = {"enable_infiniband": True, "net_devices": "ib0"}
    initialize(**kwargs)
    with LocalCluster(
            protocol="ucx",
            dashboard_address=None,
            n_workers=1,
            threads_per_worker=1,
            processes=True,
            config={"ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "rc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                assert conf["NET_DEVICES"] == "ib0"
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
Example #2
def _test_initialize_ucx_all():
    initialize()
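    # With no transports requested, UCX should be left with TLS=all
    # (all available transports), as asserted in check_ucx_options() below.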
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config()},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert conf["TLS"] == "all"
                assert all(
                    [
                        p in conf["SOCKADDR_TLS_PRIORITY"]
                        for p in ["rdmacm", "tcp", "sockcm"]
                    ]
                )
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
Example #3
def _test_initialize_ucx_nvlink():
    initialize(enable_nvlink=True)
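    # enable_nvlink=True should add cuda_ipc to the UCX transports (asserted below).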
    with LocalCluster(
            protocol="ucx",
            dashboard_address=None,
            n_workers=1,
            threads_per_worker=1,
            processes=True,
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "cuda_ipc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                return True

            assert all(client.run(check_ucx_options).values())
Example #4
async def test_dgx_ucx_infiniband_nvlink(params):
    ucp = pytest.importorskip("ucp")

    enable_tcp = params["enable_tcp"]
    enable_infiniband = params["enable_infiniband"]
    enable_nvlink = params["enable_nvlink"]

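    # create_cuda_context=True makes initialize() create a CUDA context in this
    # process in addition to setting the UCX transport options.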
    initialize(create_cuda_context=True,
               enable_tcp_over_ucx=enable_tcp,
               enable_infiniband=enable_infiniband,
               enable_nvlink=enable_nvlink)

    async with DGX(
            interface="enp1s0f0",
            protocol="ucx",
            enable_tcp_over_ucx=enable_tcp,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(10,
                          1, (int(1e4), int(1e4)),
                          chunks=(int(2.5e3), int(2.5e3)))
            x = a + a.T

            res = await client.compute(x)
Example #5
async def run():
    initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )

    async with LocalCUDACluster(
        interface="enp1s0f0",
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(10, 1, (int(4e3), int(4e3)), chunks=(int(1e3), int(1e3)))
            x = a + a.T

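            # Run the same computation repeatedly and report the time per iteration.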
            for i in range(100):
                print("Running iteration:", i)
                start = time.time()
                await client.compute(x)
                print("Time for iteration", i, ":", time.time() - start)
Example #6
def main(
    address,
    enable_nvlink,
    enable_infiniband,
):

    enable_rdmacm = False
    ucx_net_devices = None

    if enable_infiniband:
        # enable_rdmacm = True  # RDMACM not working right now
        ucx_net_devices = "mlx5_0:1"

    # set up environment
    initialize(
        enable_tcp_over_ucx=True,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
        enable_rdmacm=enable_rdmacm,
        net_devices=ucx_net_devices,
    )
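    # Note: initialize() configures UCX for this client process only; the
    # scheduler and workers at `address` must be configured on their side.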

    # initialize client
    client = Client(address)

    # user code here
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random((10000, 10000), chunks=1000)
    x.sum().compute()

    # shutdown cluster
    client.shutdown()
Example #7
def setup(dask_scheduler_file=None, rmm_pool_size=None):
    if dask_scheduler_file:
        cluster = None
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=False,
            enable_rdmacm=False,
            #net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)

    else:
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name,
                                   rmm_pool_size=rmm_pool_size)
        client = Client(cluster)
        # add the obj to the client so it doesn't get deleted until
        # the 'client' obj gets cleaned up
        client.tempdir_object = tempdir_object
        client.wait_for_workers(len(get_visible_devices()))

    Comms.initialize(p2p=True)
    return (client, cluster)
Example #8
async def run():
    initialize(
        create_cuda_context=True,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )

    async with LocalCUDACluster(
        interface="enp1s0f0",
        protocol="ucx",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            d = dask_cudf.from_cudf(
                cudf.DataFrame({"a": range(2 ** 16)}), npartitions=2
            )
            r = d.sum()

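            # Repeat the reduction and report the time per iteration.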
            for i in range(100):
                print("Running iteration:", i)
                start = time.time()
                await client.compute(r)
                print("Time for iteration", i, ":", time.time() - start)
Example #9
def _test_initialize_ucx_nvlink():
    kwargs = {"enable_nvlink": True}
    initialize(**kwargs)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "cuda_ipc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                if _ucx_110:
                    assert "tcp" in conf["SOCKADDR_TLS_PRIORITY"]
                else:
                    assert "sockcm" in conf["TLS"]
                    assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                return True

            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
Example #10
def _test_dataframe_shuffle(backend, protocol, n_workers):
    if backend == "cudf":
        cudf = pytest.importorskip("cudf")
        from cudf.testing._utils import assert_eq

        initialize(enable_tcp_over_ucx=True)
    else:
        from dask.dataframe.utils import assert_eq

        dask.config.update(
            dask.config.global_config,
            {
                "ucx": {
                    "tcp": True,
                    "cuda_copy": True,
                },
            },
            priority="new",
        )

    with LocalCluster(
            protocol=protocol,
            dashboard_address=None,
            n_workers=n_workers,
            threads_per_worker=1,
            processes=True,
    ) as cluster:
        with Client(cluster) as client:
            all_workers = list(client.get_worker_logs().keys())
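            # Create the default explicit-comms context before shuffling.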
            comms.default_comms()
            np.random.seed(42)
            df = pd.DataFrame({"key": np.random.random(100)})
            if backend == "cudf":
                df = cudf.DataFrame.from_pandas(df)

            for input_nparts in range(1, 5):
                for output_nparts in range(1, 5):
                    ddf = dd.from_pandas(
                        df.copy(),
                        npartitions=input_nparts).persist(workers=all_workers)
                    ddf = explicit_comms_shuffle(
                        ddf, ["key"], npartitions=output_nparts).persist()

                    assert ddf.npartitions == output_nparts

                    # Check that each partition of `ddf` hashes to the same value
                    result = ddf.map_partitions(check_partitions,
                                                output_nparts).compute()
                    assert all(result.to_list())

                    # Check the values of `ddf` (ignoring the row order)
                    expected = df.sort_values("key")
                    got = ddf.compute().sort_values("key")
                    if backend == "cudf":
                        assert_eq(got, expected)
                    else:
                        pd.testing.assert_frame_equal(got, expected)
Example #11
def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
    if backend == "cudf":
        cudf = pytest.importorskip("cudf")
        from cudf.testing._utils import assert_eq

        initialize(enable_tcp_over_ucx=True)
    else:
        from dask.dataframe.utils import assert_eq

        dask.config.update(
            dask.config.global_config,
            {
                "ucx": {
                    "tcp": True,
                    "cuda_copy": True,
                },
            },
            priority="new",
        )

    with LocalCluster(
            protocol=protocol,
            dashboard_address=None,
            n_workers=n_workers,
            threads_per_worker=1,
            processes=True,
    ) as cluster:
        with Client(cluster):
            nrows = n_workers * 10

            # Let's make some dataframes that we can join on the "key" column
            df1 = pd.DataFrame({
                "key": np.arange(nrows),
                "payload1": np.arange(nrows)
            })
            key = np.arange(nrows)
            np.random.shuffle(key)
            df2 = pd.DataFrame({
                "key": key[nrows // 3:],
                "payload2": np.arange(nrows)[nrows // 3:]
            })
            expected = df1.merge(df2, on="key").set_index("key")

            if backend == "cudf":
                df1 = cudf.DataFrame.from_pandas(df1)
                df2 = cudf.DataFrame.from_pandas(df2)

            ddf1 = dd.from_pandas(df1, npartitions=n_workers + 1)
            ddf2 = dd.from_pandas(
                df2, npartitions=n_workers - 1 if n_workers > 1 else 1
            )
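            # Run the merge through the explicit-comms code path.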
            with dask.config.set(explicit_comms=True):
                got = ddf1.merge(ddf2, on="key").set_index("key").compute()
            if backend == "cudf":
                assert_eq(got, expected)
            else:
                pd.testing.assert_frame_equal(got, expected)
Example #12
async def test_ucx_protocol_type_error():
    pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)
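    # Requesting UCX-only options together with protocol="tcp" must raise TypeError.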
    with pytest.raises(TypeError):
        async with LocalCUDACluster(
            protocol="tcp", enable_tcp_over_ucx=True, asynchronous=True, data=dict
        ):
            pass
Example #13
async def test_ucx_protocol(protocol):
    pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)
    async with LocalCUDACluster(
        protocol=protocol, enable_tcp_over_ucx=True, asynchronous=True, data=dict
    ) as cluster:
        assert all(
            ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values()
        )
Example #14
def ucx_cluster():
    initialize.initialize(create_cuda_context=True,
                          enable_tcp_over_ucx=enable_tcp_over_ucx,
                          enable_nvlink=enable_nvlink,
                          enable_infiniband=enable_infiniband)
    cluster = LocalCUDACluster(protocol="ucx",
                               enable_tcp_over_ucx=enable_tcp_over_ucx,
                               enable_nvlink=enable_nvlink,
                               enable_infiniband=enable_infiniband)
    yield cluster
    cluster.close()
Example #15
def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink,
                                enable_rdmacm):
    cupy = pytest.importorskip("cupy")

    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

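    # With InfiniBand, let dask-cuda pick a network device per worker ("auto");
    # the connection manager is rdmacm when requested, sockcm otherwise.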
    ucx_net_devices = "auto" if enable_infiniband else None
    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"

    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
    )

    with LocalCUDACluster(
            interface="ib0",
            enable_tcp_over_ucx=True,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            enable_rdmacm=enable_rdmacm,
            ucx_net_devices=ucx_net_devices,
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(cupy.arange(10000),
                                chunks=(1000,),
                                asarray=False)
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "tcp" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert cm_protocol in conf["TLS"]
                assert cm_protocol in conf["SOCKADDR_TLS_PRIORITY"]
                if enable_nvlink:
                    assert "cuda_ipc" in conf["TLS"]
                if enable_infiniband:
                    assert "rc" in conf["TLS"]
                return True

            if enable_infiniband:
                assert all([
                    cluster.worker_spec[k]["options"]["env"]["UCX_NET_DEVICES"]
                    == openfabrics_devices[k].split(",")[0]
                    for k in cluster.worker_spec.keys()
                ])

            assert all(client.run(check_ucx_options).values())
Example #16
def test_initialize_ucx_tcp():
    ucp = pytest.importorskip("ucp")

    initialize(enable_tcp_over_ucx=True)

    conf = ucp.get_config()
    env = os.environ

    assert "TLS" in conf
    assert "UCX_TLS" in env

    assert "tcp" in conf["TLS"] and "tcp" in env["UCX_TLS"]
    assert "sockcm" in conf["TLS"] and "sockcm" in env["UCX_TLS"]
    assert "cuda_copy" in conf["TLS"] and "cuda_copy" in env["UCX_TLS"]

    assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"] and "sockcm" in env["UCX_SOCKADDR_TLS_PRIORITY"]
Example #17
def test_initialize_ucx_infiniband():
    ucp = pytest.importorskip("ucp")

    initialize(enable_infiniband=True, net_devices="ib0")

    conf = ucp.get_config()
    env = os.environ

    assert "TLS" in conf
    assert "UCX_TLS" in env

    assert "rc" in conf["TLS"] and "rc" in env["UCX_TLS"]
    assert "tcp" in conf["TLS"] and "tcp" in env["UCX_TLS"]
    assert "sockcm" in conf["TLS"] and "sockcm" in env["UCX_TLS"]
    assert "cuda_copy" in conf["TLS"] and "cuda_copy" in env["UCX_TLS"]

    assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"] and "sockcm" in env["UCX_SOCKADDR_TLS_PRIORITY"]

    assert conf["NET_DEVICES"] == "ib0" and env["UCX_NET_DEVICES"] == "ib0"
Example #18
def dask_client():
    dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
    cluster = None
    client = None
    tempdir_object = None

    if dask_scheduler_file:
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=True,
            enable_rdmacm=True,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)
        print("\ndask_client fixture: client created using "
              f"{dask_scheduler_file}")
    else:
        # The tempdir created by tempdir_object should be cleaned up once
        # tempdir_object goes out-of-scope and is deleted.
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name)
        client = Client(cluster)
        client.wait_for_workers(len(get_visible_devices()))
        print("\ndask_client fixture: client created using LocalCUDACluster")

    Comms.initialize(p2p=True)

    yield client

    Comms.destroy()
    # Shut down the connected scheduler and workers
    # therefore we will no longer rely on killing the dask cluster ID
    # for MNMG runs
    client.shutdown()
    if cluster:
        cluster.close()
    print("\ndask_client fixture: client.close() called")
Example #19
def initialize_cluster(use_gpu=True, n_cpu=None, n_gpu=-1):
    enable_tcp_over_ucx = True
    enable_nvlink = True
    enable_infiniband = True

    logger.info('Starting dask cluster...')
    if use_gpu:
        initialize.initialize(create_cuda_context=True,
                              enable_tcp_over_ucx=enable_tcp_over_ucx,
                              enable_nvlink=enable_nvlink,
                              enable_infiniband=enable_infiniband)
        if n_gpu == -1:
            n_gpu = get_n_gpus()

        device_list = cuda_visible_devices(1, range(n_gpu)).split(',')
        CUDA_VISIBLE_DEVICES = []
        for device in device_list:
            try:
                CUDA_VISIBLE_DEVICES.append(int(device))
            except ValueError as vex:
                logger.warning(vex)

        logger.info('Using GPUs {} ...'.format(CUDA_VISIBLE_DEVICES))

        cluster = LocalCUDACluster(protocol="ucx",
                                   dashboard_address=':8787',
                                   CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
                                   enable_tcp_over_ucx=enable_tcp_over_ucx,
                                   enable_nvlink=enable_nvlink,
                                   enable_infiniband=enable_infiniband)
    else:
        logger.info('Using {} CPUs ...'.format(n_cpu))
        cluster = LocalCluster(dashboard_address=':8787',
                               n_workers=n_cpu,
                               threads_per_worker=4)

    client = Client(cluster)
    client.run(cupy.cuda.set_allocator)
    return client
Example #20
def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm):
    cupy = pytest.importorskip("cupy")
    ucp = pytest.importorskip("ucp")

    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]
    ucx_net_devices = None

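    # Work out the UCX TLS and connection-manager values expected for this
    # combination of transports and UCX version.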
    if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None:
        if _ucx_110 is False:
            pytest.skip(
                "Specifying transports is required on UCX < 1.10",
                allow_module_level=True,
            )
        enable_tcp_over_ucx = None
        cm_tls = ["all"]
        cm_tls_priority = ["rdmacm", "tcp", "sockcm"]
    else:
        if enable_infiniband and not _ucx_110:
            ucx_net_devices = "auto"

        enable_tcp_over_ucx = True

        if _ucx_110 is True:
            cm_tls = ["tcp"]
            if enable_rdmacm is True:
                cm_tls_priority = ["rdmacm"]
            else:
                cm_tls_priority = ["tcp"]
        else:
            cm_tls = ["tcp"]
            if enable_rdmacm is True:
                cm_tls.append("rdmacm")
                cm_tls_priority = ["rdmacm"]
            else:
                cm_tls.append("sockcm")
                cm_tls_priority = ["sockcm"]

    initialize(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
    )

    with LocalCUDACluster(
        interface="ib0",
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        ucx_net_devices=ucx_net_devices,
        rmm_pool_size="1 GiB",
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(cupy.arange(10000), chunks=(1000,), asarray=False)
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                conf = ucp.get_config()
                assert "TLS" in conf
                assert all(t in conf["TLS"] for t in cm_tls)
                assert all(p in conf["SOCKADDR_TLS_PRIORITY"] for p in cm_tls_priority)
                if cm_tls != ["all"]:
                    assert "tcp" in conf["TLS"]
                    assert "cuda_copy" in conf["TLS"]
                    if enable_nvlink:
                        assert "cuda_ipc" in conf["TLS"]
                    if enable_infiniband:
                        assert "rc" in conf["TLS"]
                return True

            if ucx_net_devices == "auto":
                assert all(
                    [
                        cluster.worker_spec[k]["options"]["env"]["UCX_NET_DEVICES"]
                        == openfabrics_devices[k].split(",")[0]
                        for k in cluster.worker_spec.keys()
                    ]
                )

            assert all(client.run(check_ucx_options).values())
Example #21
def test_initialize_cuda_context():
    initialize(create_cuda_context=True)
Example #22
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
            env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen([
                "dask-cuda-worker",
                sched_url,
                "--no-dashboard",
        ] + worker_ucx_opts) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                def _timeout_callback():
                    # We must ensure processes are terminated to avoid hangs
                    # if a timeout occurs
                    worker_proc.kill()
                    sched_proc.kill()

                assert wait_workers(client, timeout_callback=_timeout_callback)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"])
                for tls, tls_priority in zip(workers_tls.values(),
                                             workers_tls_priority.values()):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority
                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"])
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"])

                for i, v in enumerate(
                        zip(worker_net_devices.values(),
                            cuda_visible_devices.values())):
                    net_dev = v[0]
                    dev_idx = int(v[1].split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()
Example #23
def main(args):
    # Set up workers on the local machine
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
        )
    else:
        enable_infiniband = args.enable_infiniband
        enable_nvlink = args.enable_nvlink
        enable_tcp_over_ucx = args.enable_tcp_over_ucx
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            ucx_net_devices="auto",
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )
        initialize(
            create_cuda_context=True,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )
    client = Client(cluster)

    def _worker_setup(initial_pool_size=None):
        import rmm

        rmm.reinitialize(
            pool_allocator=not args.no_rmm_pool,
            devices=0,
            initial_pool_size=initial_pool_size,
        )
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

    client.run(_worker_setup)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(_worker_setup, 1e9)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, write_profile=None))
    took_list.append(
        run(client, args, write_profile=args.profile)
    )  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (cluster.scheduler.workers[w1].name, cluster.scheduler.workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (
            cluster.scheduler.workers[w1].name,
            cluster.scheduler.workers[w2].name,
        ): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.no_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for data_processed, took in took_list:
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(
                "(%02d,%02d)     | %s %s %s (%s)"
                % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)])
            )
        if args.markdown:
            print("```\n</details>\n")