# Imports needed to run these tests; the exact helper locations are assumptions
# based on the dask-cuda and distributed test suites and may need adjusting.
import os
import subprocess
from time import sleep
from unittest.mock import patch

import pytest
from tornado.ioloop import IOLoop

from distributed import Client, wait
from distributed.system import MEMORY_LIMIT
from distributed.utils import get_ip_interface
from distributed.utils_test import loop, popen  # noqa: F401

from dask_cuda.initialize import initialize
from dask_cuda.utils import (
    _get_dgx_net_devices,
    get_gpu_count_mig,
    get_gpu_uuid_from_index,
    get_n_gpus,
    wait_workers,
)


def test_dashboard_address(loop):  # noqa: F811
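    """Ensure ``--dashboard-address`` is honoured by every dask-cuda-worker."""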
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen([
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--dashboard-address",
                "127.0.0.1:9370",
        ]):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                dashboard_addresses = client.run(
                    lambda dask_worker: dask_worker._dashboard_address)
                for v in dashboard_addresses.values():
                    assert v == "127.0.0.1:9370"


def test_rmm_managed(loop):  # noqa: F811
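    """Ensure ``--rmm-managed-memory`` makes RMM's managed memory resource the
    current device resource on every worker."""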
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen([
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--host",
                "127.0.0.1",
                "--rmm-managed-memory",
                "--no-dashboard",
        ]):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                memory_resource_type = client.run(
                    rmm.mr.get_current_device_resource_type)
                for v in memory_resource_type.values():
                    assert v is rmm.mr.ManagedMemoryResource


def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(
        loop):  # noqa: F811
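    """Ensure MIG instance UUIDs listed in CUDA_VISIBLE_DEVICES are cycled
    across the spawned workers."""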
    uuids = get_gpu_count_mig(return_uuids=True)[1]
    # Only run when MIG instances are present; all of them are used, assuming
    # the test bed does not expose a huge number of MIG instances.
    if len(uuids) > 0:
        cuda_visible_devices = ",".join([i.decode("utf-8") for i in uuids])
    else:
        pytest.skip("No MIG devices found")

    with patch.dict(os.environ,
                    {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}):
        # One thread per MIG instance for each worker (``cuda_visible_devices``
        # is a comma-joined string, so its length is not the device count).
        nthreads = len(uuids)
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen([
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--nthreads",
                    str(nthreads),
                    "--no-dashboard",
                    "--worker-class",
                    "dask_cuda.utils.MockWorker",
            ]):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=len(uuids))

                    # Check to see if all workers are up and
                    # CUDA_VISIBLE_DEVICES cycles properly

                    def get_visible_devices():
                        return os.environ["CUDA_VISIBLE_DEVICES"]

                    result = client.run(get_visible_devices)
                    wait(result)
                    assert all(
                        len(v.split(",")) == len(uuids)
                        for v in result.values())
                    # ``uuids`` holds raw bytes, so decode before comparing.
                    expected_uuids = set(u.decode("utf-8") for u in uuids)
                    for i in range(len(uuids)):
                        assert set(v.split(",")[i]
                                   for v in result.values()) == expected_uuids


def test_cuda_visible_devices_and_memory_limit_and_nthreads(
        loop):  # noqa: F811
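    """Ensure CUDA_VISIBLE_DEVICES is rotated so each worker leads with a
    different device, and that per-worker memory limits scale with nthreads."""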
    os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,7,8"
    nthreads = 4
    try:
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen([
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--device-memory-limit",
                    "1 MB",
                    "--nthreads",
                    str(nthreads),
                    "--no-dashboard",
            ]):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=4)

                    def get_visible_devices():
                        return os.environ["CUDA_VISIBLE_DEVICES"]

                    # verify 4 workers with the 4 expected CUDA_VISIBLE_DEVICES
                    result = client.run(get_visible_devices)
                    expected = {
                        "2,3,7,8": 1,
                        "3,7,8,2": 1,
                        "7,8,2,3": 1,
                        "8,2,3,7": 1
                    }
                    for v in result.values():
                        del expected[v]

                    workers = client.scheduler_info()["workers"]
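                    # Each worker's limit is the host memory split across
                    # workers, scaled by the per-worker thread count.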
                    for w in workers.values():
                        assert (w["memory_limit"] == MEMORY_LIMIT //
                                len(workers) * nthreads)

                    assert len(expected) == 0
    finally:
        del os.environ["CUDA_VISIBLE_DEVICES"]


def test_cuda_visible_devices_uuid(loop):  # noqa: F811
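    """Ensure a GPU UUID in CUDA_VISIBLE_DEVICES is forwarded to the worker
    unchanged."""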
    gpu_uuid = get_gpu_uuid_from_index(0)

    with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": gpu_uuid}):
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen([
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--no-dashboard",
                    "--worker-class",
                    "dask_cuda.utils.MockWorker",
            ]):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=1)

                    result = client.run(
                        lambda: os.environ["CUDA_VISIBLE_DEVICES"])
                    assert list(result.values())[0] == gpu_uuid


def test_rmm_logging(loop):  # noqa: F811
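    """Ensure ``--rmm-log-directory`` wraps the RMM pool in a logging resource
    adaptor on every worker."""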
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen([
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--host",
                "127.0.0.1",
                "--rmm-pool-size",
                "2 GB",
                "--rmm-log-directory",
                ".",
                "--no-dashboard",
        ]):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                memory_resource_type = client.run(
                    rmm.mr.get_current_device_resource_type)
                for v in memory_resource_type.values():
                    assert v is rmm.mr.LoggingResourceAdaptor


def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
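    """Exercise ``--enable-infiniband --net-devices=auto`` over UCX, with either
    sockcm or rdmacm as the connection manager, and verify each worker's
    NET_DEVICES matches its first visible GPU."""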
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
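    # Each entry lists several devices separated by commas; keep only the first
    # (OpenFabrics) device of each entry.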
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
            env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen([
                "dask-cuda-worker",
                sched_url,
                "--no-dashboard",
        ] + worker_ucx_opts) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                def _timeout_callback():
                    # We must ensure processes are terminated to avoid hangs
                    # if a timeout occurs
                    worker_proc.kill()
                    sched_proc.kill()

                assert wait_workers(client, timeout_callback=_timeout_callback)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"])
                for tls, tls_priority in zip(workers_tls.values(),
                                             workers_tls_priority.values()):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority
                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"])
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"])

                for i, v in enumerate(
                        zip(worker_net_devices.values(),
                            cuda_visible_devices.values())):
                    net_dev = v[0]
                    dev_idx = int(v[1].split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()
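

# A minimal sketch (not part of the original listing) of how the helper above
# could be driven as an actual test; the parametrization over the connection
# manager is an assumption, not the upstream test definition.
@pytest.mark.parametrize("enable_rdmacm", [False, True])
def test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm)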