def test_dashboard_address(loop):  # noqa: F811
    with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}):
        with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
            with popen(
                [
                    "dask-cuda-worker",
                    "127.0.0.1:9369",
                    "--dashboard-address",
                    "127.0.0.1:9370",
                ]
            ):
                with Client("127.0.0.1:9369", loop=loop) as client:
                    assert wait_workers(client, n_gpus=get_n_gpus())

                    # Every worker should report the dashboard address that was
                    # passed on the command line.
                    dashboard_addresses = client.run(
                        lambda dask_worker: dask_worker._dashboard_address
                    )
                    for v in dashboard_addresses.values():
                        assert v == "127.0.0.1:9370"
def test_rmm_managed(loop):  # noqa: F811
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen(
            [
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--host",
                "127.0.0.1",
                "--rmm-managed-memory",
                "--no-dashboard",
            ]
        ):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                memory_resource_type = client.run(
                    rmm.mr.get_current_device_resource_type
                )
                for v in memory_resource_type.values():
                    assert v is rmm.mr.ManagedMemoryResource
def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
    uuids = get_gpu_count_mig(return_uuids=True)[1]
    # Test only with the available MIG instances, assuming the test bed
    # does not have a huge number of them.
    if len(uuids) > 0:
        # UUIDs are returned as bytes; decode them so they can be joined into
        # CUDA_VISIBLE_DEVICES and compared against the workers' reports.
        uuids = [i.decode("utf-8") for i in uuids]
        cuda_visible_devices = ",".join(uuids)
    else:
        pytest.skip("No MIG devices found")

    with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}):
        # One thread per MIG instance.
        nthreads = len(uuids)
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen(
                [
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--nthreads",
                    str(nthreads),
                    "--no-dashboard",
                    "--worker-class",
                    "dask_cuda.utils.MockWorker",
                ]
            ):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=len(uuids))

                    # Check that all workers are up and that
                    # CUDA_VISIBLE_DEVICES cycles properly.
                    def get_visible_devices():
                        return os.environ["CUDA_VISIBLE_DEVICES"]

                    result = client.run(get_visible_devices)
                    wait(result)

                    assert all(
                        len(v.split(",")) == len(uuids) for v in result.values()
                    )
                    for i in range(len(uuids)):
                        assert set(v.split(",")[i] for v in result.values()) == set(
                            uuids
                        )
def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
    os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,7,8"
    nthreads = 4
    try:
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen(
                [
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--device-memory-limit",
                    "1 MB",
                    "--nthreads",
                    str(nthreads),
                    "--no-dashboard",
                ]
            ):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=4)

                    def get_visible_devices():
                        return os.environ["CUDA_VISIBLE_DEVICES"]

                    # Verify 4 workers with the 4 expected CUDA_VISIBLE_DEVICES
                    result = client.run(get_visible_devices)
                    expected = {
                        "2,3,7,8": 1,
                        "3,7,8,2": 1,
                        "7,8,2,3": 1,
                        "8,2,3,7": 1,
                    }
                    for v in result.values():
                        del expected[v]

                    workers = client.scheduler_info()["workers"]
                    for w in workers.values():
                        assert (
                            w["memory_limit"]
                            == MEMORY_LIMIT // len(workers) * nthreads
                        )

                    assert len(expected) == 0
    finally:
        del os.environ["CUDA_VISIBLE_DEVICES"]
def test_cuda_visible_devices_uuid(loop):  # noqa: F811
    gpu_uuid = get_gpu_uuid_from_index(0)

    with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": gpu_uuid}):
        with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]):
            with popen(
                [
                    "dask-cuda-worker",
                    "127.0.0.1:9359",
                    "--host",
                    "127.0.0.1",
                    "--no-dashboard",
                    "--worker-class",
                    "dask_cuda.utils.MockWorker",
                ]
            ):
                with Client("127.0.0.1:9359", loop=loop) as client:
                    assert wait_workers(client, n_gpus=1)

                    result = client.run(lambda: os.environ["CUDA_VISIBLE_DEVICES"])
                    assert list(result.values())[0] == gpu_uuid
def test_rmm_logging(loop):  # noqa: F811
    rmm = pytest.importorskip("rmm")
    with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]):
        with popen(
            [
                "dask-cuda-worker",
                "127.0.0.1:9369",
                "--host",
                "127.0.0.1",
                "--rmm-pool-size",
                "2 GB",
                "--rmm-log-directory",
                ".",
                "--no-dashboard",
            ]
        ):
            with Client("127.0.0.1:9369", loop=loop) as client:
                assert wait_workers(client, n_gpus=get_n_gpus())

                memory_resource_type = client.run(
                    rmm.mr.get_current_device_resource_type
                )
                for v in memory_resource_type.values():
                    assert v is rmm.mr.LoggingResourceAdaptor
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    with subprocess.Popen(
        [
            "dask-scheduler",
            "--protocol",
            "ucx",
            "--host",
            sched_addr,
            "--port",
            "9379",
            "--no-dashboard",
        ],
        env=sched_env,
    ) as sched_proc:
        # Scheduler with UCX will take a few seconds to fully start
        sleep(5)

        with subprocess.Popen(
            [
                "dask-cuda-worker",
                sched_url,
                "--no-dashboard",
            ]
            + worker_ucx_opts
        ) as worker_proc:
            with Client(sched_url, loop=loop) as client:

                def _timeout_callback():
                    # We must ensure processes are terminated to avoid hangs
                    # if a timeout occurs
                    worker_proc.kill()
                    sched_proc.kill()

                assert wait_workers(client, timeout_callback=_timeout_callback)

                workers_tls = client.run(lambda: ucp.get_config()["TLS"])
                workers_tls_priority = client.run(
                    lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"]
                )
                for tls, tls_priority in zip(
                    workers_tls.values(), workers_tls_priority.values()
                ):
                    assert cm_protocol in tls
                    assert cm_protocol in tls_priority

                worker_net_devices = client.run(
                    lambda: ucp.get_config()["NET_DEVICES"]
                )
                cuda_visible_devices = client.run(
                    lambda: os.environ["CUDA_VISIBLE_DEVICES"]
                )

                for i, v in enumerate(
                    zip(worker_net_devices.values(), cuda_visible_devices.values())
                ):
                    net_dev = v[0]
                    dev_idx = int(v[1].split(",")[0])
                    assert net_dev == openfabrics_devices[dev_idx]

            # A dask-worker with UCX protocol will not close until some work
            # is dispatched, therefore we kill the worker and scheduler to
            # ensure timely closing.
            worker_proc.kill()
            sched_proc.kill()