Example #1
def test_parse_visible_devices():
    pynvml = pytest.importorskip("pynvml")
    pynvml.nvmlInit()
    indices = []
    uuids = []
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        uuid = pynvml.nvmlDeviceGetUUID(handle).decode("utf-8")

        assert parse_cuda_visible_device(index) == index
        assert parse_cuda_visible_device(uuid) == uuid

        indices.append(str(index))
        uuids.append(uuid)

    index_devices = ",".join(indices)
    os.environ["CUDA_VISIBLE_DEVICES"] = index_devices
    for index in range(get_gpu_count()):
        visible = cuda_visible_devices(index)
        assert visible.split(",")[0] == str(index)

    uuid_devices = ",".join(uuids)
    os.environ["CUDA_VISIBLE_DEVICES"] = uuid_devices
    for index in range(get_gpu_count()):
        visible = cuda_visible_devices(index)
        assert visible.split(",")[0] == uuids[index]

    with pytest.raises(ValueError):
        parse_cuda_visible_device("Foo")

    with pytest.raises(TypeError):
        parse_cuda_visible_device(None)

    with pytest.raises(TypeError):
        parse_cuda_visible_device([])
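
These excerpts omit their module-level imports. The block below is a sketch of the imports they appear to rely on, assuming the helper functions live where dask-cuda's test suite keeps them; _get_dgx_net_devices is a test-local helper that is not reproduced here.

import os
import subprocess
from time import sleep, time

import pytest
import ucp
from tornado.ioloop import IOLoop

from distributed import Client
from distributed.utils import get_ip_interface
from distributed.utils_test import loop, popen  # noqa: F401

from dask_cuda.initialize import initialize
from dask_cuda.utils import (
    cuda_visible_devices,
    get_gpu_count,
    parse_cuda_visible_device,
)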
Example #2
def test_rmm_pool(loop):  # noqa: F811
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen([
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--host",
                "127.0.0.1",
                "--rmm-pool-size",
                "2 GB",
                "--no-dashboard",
        ]):
            with Client("127.0.0.1:9369", loop=loop) as client:
                # Wait (up to 10 seconds) for all GPU workers to connect.
                start = time()
                while len(client.scheduler_info()["workers"]) != get_gpu_count():
                    assert time() - start < 10
                    sleep(0.1)

                memory_info = client.run(rmm.get_info)
                for v in memory_info.values():
                    assert v.total == 2000000000
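
A minimal follow-up sketch of the same 2 GB RMM pool setup without the CLI, assuming dask_cuda.LocalCUDACluster and its rmm_pool_size argument (not used by the example above):

from dask_cuda import LocalCUDACluster
from distributed import Client

if __name__ == "__main__":
    # One worker per visible GPU, each pre-allocating a 2 GB RMM memory pool.
    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            print(client.scheduler_info()["workers"].keys())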
Example #3
def test_parse_visible_mig_devices():
    pynvml = pytest.importorskip("pynvml")
    pynvml.nvmlInit()
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        try:
            mode = pynvml.nvmlDeviceGetMigMode(handle)[0]
        except pynvml.NVMLError:
            # Querying MIG mode fails on GPUs without MIG support; skip them.
            continue
        if mode:
            # This GPU has MIG mode enabled. Enumerate its MIG instances and
            # verify that no more can be retrieved than count, the maximum
            # number of MIG devices that can exist under a given parent NVML
            # device.
            count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
            mig_handles = []
            for i in range(count):
                try:
                    mig_handle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                        device=handle, index=i)
                    mig_handles.append(mig_handle)
                except pynvml.NVMLError:
                    pass
            assert len(mig_handles) <= count
Example #4
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    loop = IOLoop.current()

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(enable_tcp_over_ucx=True,
               enable_infiniband=True,
               enable_rdmacm=enable_rdmacm)

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
            env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen([
                "dask-cuda-worker",
                sched_url,
                "--no-dashboard",
        ] + worker_ucx_opts) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                # Wait (up to 10 seconds) for all GPU workers to connect.
                start = time()
                while len(client.scheduler_info()["workers"]) != get_gpu_count():
                    assert time() - start < 10
                    sleep(0.1)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"])
                for tls, tls_priority in zip(workers_tls.values(),
                                             workers_tls_priority.values()):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority
                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"])
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"])

                for net_dev, visible in zip(worker_net_devices.values(),
                                            cuda_visible_devices.values()):
                    dev_idx = int(visible.split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()
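
The scheduler environment variables above (DASK_UCX__INFINIBAND, DASK_UCX__TCP, DASK_UCX__RDMACM) follow Dask's standard mapping from environment variables to configuration keys: the DASK_ prefix is stripped and "__" denotes nesting. The block below is a hypothetical in-process equivalent using dask.config, assuming the flat "ucx" namespace these variable names imply; it is not part of the test itself.

import dask

# Equivalent of exporting DASK_UCX__TCP=True, DASK_UCX__INFINIBAND=True and
# DASK_UCX__RDMACM=False before starting the scheduler.
dask.config.set({"ucx.tcp": True, "ucx.infiniband": True, "ucx.rdmacm": False})
print(dask.config.get("ucx.infiniband"))  # -> True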
Example #5
def test_dask_cuda_worker_ucx_net_devices(loop):  # noqa: F811
    net_devices = _get_dgx_net_devices()

    sched_env = os.environ.copy()
    sched_env["UCX_TLS"] = "rc,sockcm,tcp,cuda_copy"
    sched_env["UCX_SOCKADDR_TLS_PRIORITY"] = "sockcm"

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            "127.0.0.1",
            "--port",
            "9379",
            "--no-dashboard",
        ],
            env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen([
                "dask-cuda-worker",
                "ucx://127.0.0.1:9379",
                "--host",
                "127.0.0.1",
                "--enable-infiniband",
                "--net-devices",
                "auto",
                "--no-dashboard",
        ]) as worker_proc:
            with Client("ucx://127.0.0.1:9379", loop=loop) as client:

                # Wait (up to 10 seconds) for all GPU workers to connect.
                start = time()
                while len(client.scheduler_info()["workers"]) != get_gpu_count():
                    assert time() - start < 10
                    sleep(0.1)

                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"])
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"])

                for net_dev, visible in zip(worker_net_devices.values(),
                                            cuda_visible_devices.values()):
                    dev_idx = int(visible.split(",")[0])
                    assert net_dev == net_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()