def _test_initialize_ucx_infiniband():
    """Check that InfiniBand UCX options are visible on the scheduler and workers.

    Calls ``initialize`` with InfiniBand enabled on ``ib0``, starts a
    one-worker ``LocalCluster`` over the ``ucx`` protocol, runs a small array
    reduction, and then asserts on the mapping returned by
    ``ucp.get_config()`` when executed on the scheduler and through
    ``client.run``.
    """
    kwargs = {"enable_infiniband": True, "net_devices": "ib0"}
    initialize(**kwargs)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        # NOTE(review): sibling tests in this file pass the UCX settings under
        # "distributed.comm.ucx" -- confirm "ucx" is the intended key here.
        config={"ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            # 0 + 1 + ... + 9999
            assert res == 49995000

            def check_ucx_options():
                """Assert on the UCX configuration of the calling process."""
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "rc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                assert conf["NET_DEVICES"] == "ib0"
                return True

            # The helper returns the literal ``True``; identity comparison is
            # the idiomatic check for a bool singleton.
            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, net_devices):
    """Check the mapping returned by ``get_ucx_config`` for the given options.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband
        Flags forwarded to ``get_ucx_config``; an enabled flag is expected to
        show up as ``True`` (together with ``"cuda_copy"``), otherwise the
        entry is expected to be ``None``.
    net_devices
        Device specification forwarded to ``get_ucx_config``; the branches
        below cover ``"eth0"``, ``"auto"`` and ``""``.
    """
    ucx_config = get_ucx_config(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    if enable_tcp_over_ucx is True:
        assert ucx_config["tcp"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["tcp"] is None

    if enable_infiniband is True:
        assert ucx_config["infiniband"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["infiniband"] is None

    if enable_tcp_over_ucx is False and enable_infiniband is False:
        assert ucx_config["cuda_copy"] is None

    if net_devices == "eth0":
        assert ucx_config["net-devices"] == "eth0"
    elif net_devices == "auto":
        # Since the actual device is system-dependent, we don't do any
        # checks at the moment. If any InfiniBand devices are available,
        # that will be the value of "net-devices", otherwise an empty string.
        pass
    elif net_devices == "":
        # The key is "net-devices" (plural), as in the "eth0" branch above.
        # A missing key and an empty/None value both mean that no device was
        # configured, so either is accepted.
        assert "net-devices" not in ucx_config or not ucx_config["net-devices"]
def _test_initialize_ucx_nvlink():
    """Check that NVLink UCX options are visible on the scheduler and workers.

    Calls ``initialize`` with NVLink enabled, starts a one-worker
    ``LocalCluster`` over the ``ucx`` protocol, runs a small array reduction,
    and asserts on ``ucp.get_config()`` as seen from the scheduler and through
    ``client.run``.
    """
    init_options = {"enable_nvlink": True}
    initialize(**init_options)

    def check_ucx_options():
        """Assert on the UCX configuration of the calling process."""
        ucx_conf = ucp.get_config()
        assert "TLS" in ucx_conf
        transports = ucx_conf["TLS"]
        for transport in ("cuda_ipc", "tcp", "cuda_copy"):
            assert transport in transports
        if not _ucx_110:
            assert "sockcm" in transports
            assert "sockcm" in ucx_conf["SOCKADDR_TLS_PRIORITY"]
        else:
            assert "tcp" in ucx_conf["SOCKADDR_TLS_PRIORITY"]
        return True

    cluster_options = dict(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config(**init_options)},
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster) as client:
        values = da.from_array(numpy.arange(10000), chunks=(1000,))
        total = values.sum().compute()
        # 0 + 1 + ... + 9999
        assert total == 49995000

        assert client.run_on_scheduler(check_ucx_options) is True
        assert all(client.run(check_ucx_options).values())
def _test_initialize_ucx_all():
    """Check the UCX options produced by a default ``initialize()`` call.

    Starts a one-worker ``LocalCluster`` over the ``ucx`` protocol using the
    default ``get_ucx_config()``, runs a small array reduction, and asserts
    that ``ucp.get_config()`` reports ``TLS == "all"`` with ``rdmacm``,
    ``tcp`` and ``sockcm`` present in ``SOCKADDR_TLS_PRIORITY``.
    """
    initialize()

    def check_ucx_options():
        """Assert on the UCX configuration of the calling process."""
        ucx_conf = ucp.get_config()
        assert "TLS" in ucx_conf
        assert ucx_conf["TLS"] == "all"
        priority = ucx_conf["SOCKADDR_TLS_PRIORITY"]
        assert all(name in priority for name in ("rdmacm", "tcp", "sockcm"))
        return True

    cluster_options = dict(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config()},
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster) as client:
        values = da.from_array(numpy.arange(10000), chunks=(1000,))
        total = values.sum().compute()
        # 0 + 1 + ... + 9999
        assert total == 49995000

        assert client.run_on_scheduler(check_ucx_options) is True
        assert all(client.run(check_ucx_options).values())
def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, net_devices):
    """Check ``get_ucx_config`` output, including the rejected ``"auto"`` case.

    The test is skipped when ``ucp`` cannot be imported.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband
        Flags forwarded to ``get_ucx_config``; an enabled flag is expected to
        show up as ``True`` (together with ``"cuda_copy"``), otherwise the
        entry is expected to be ``None``.
    net_devices
        Device specification forwarded to ``get_ucx_config``. ``"auto"``
        combined with ``enable_infiniband is False`` is expected to raise
        ``ValueError``.
    """
    pytest.importorskip("ucp")

    kwargs = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "net_devices": net_devices,
        "cuda_device_index": 0,
    }

    # Guard clause: this combination must be rejected, nothing else to check.
    if net_devices == "auto" and enable_infiniband is False:
        with pytest.raises(ValueError):
            get_ucx_config(**kwargs)
        return

    ucx_config = get_ucx_config(**kwargs)

    if enable_tcp_over_ucx is True:
        assert ucx_config["tcp"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["tcp"] is None

    if enable_infiniband is True:
        assert ucx_config["infiniband"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["infiniband"] is None

    if enable_tcp_over_ucx is False and enable_infiniband is False:
        assert ucx_config["cuda_copy"] is None

    if net_devices == "eth0":
        assert ucx_config["net-devices"] == "eth0"
    elif net_devices == "auto":
        # Since the actual device is system-dependent, we don't do any
        # checks at the moment. If any InfiniBand devices are available,
        # that will be the value of "net-devices", otherwise an empty string.
        pass
    elif net_devices == "":
        # The key is "net-devices" (plural), as in the "eth0" branch above.
        # A missing key and an empty/None value both mean that no device was
        # configured, so either is accepted.
        assert "net-devices" not in ucx_config or not ucx_config["net-devices"]
def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
    """Merge two dataframes on ``"key"`` with explicit comms and compare to pandas.

    Parameters
    ----------
    backend
        ``"cudf"`` converts the inputs with ``cudf.DataFrame.from_pandas``
        (skipping the test if cudf is missing); any other value keeps the
        pandas frames.
    protocol
        Protocol handed to ``LocalCluster``.
    n_workers
        Number of workers; also determines the row and partition counts.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        initialize(enable_tcp_over_ucx=True)
    else:
        ucx_settings = {
            "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
        }
        dask.config.update(dask.config.global_config, ucx_settings, priority="new")

    cluster_options = dict(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster):
        nrows = n_workers * 10

        # Build two frames that can be joined on the "key" column.
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        skip = nrows // 3
        right = pd.DataFrame(
            {"key": shuffled_keys[skip:], "payload2": np.arange(nrows)[skip:]}
        )

        # Reference result is computed with plain pandas before any conversion.
        expected = left.merge(right, on="key").set_index("key")

        if use_cudf:
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        right_nparts = n_workers - 1 if n_workers > 1 else 1
        ddf1 = dd.from_pandas(left, npartitions=n_workers + 1)
        ddf2 = dd.from_pandas(right, npartitions=right_nparts)

        with dask.config.set(explicit_comms=True):
            merged = ddf1.merge(ddf2, on="key")
            got = merged.set_index("key").compute()
        assert_eq(got, expected)
def _test_dataframe_shuffle(backend, protocol, n_workers):
    """Shuffle a dataframe with ``explicit_comms_shuffle`` and validate the result.

    For every input/output partition count in ``1..4`` the shuffled frame must
    have the requested number of partitions, pass ``check_partitions`` on every
    partition, and contain the same values as the input (row order ignored).

    Parameters
    ----------
    backend
        ``"cudf"`` converts the input with ``cudf.DataFrame.from_pandas``
        (skipping the test if cudf is missing); any other value keeps pandas.
    protocol
        Protocol handed to ``LocalCluster``.
    n_workers
        Number of workers in the cluster.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        initialize(enable_tcp_over_ucx=True)
    else:
        ucx_settings = {
            "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
        }
        dask.config.update(dask.config.global_config, ucx_settings, priority="new")

    cluster_options = dict(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster) as client:
        all_workers = list(client.get_worker_logs().keys())
        comms.default_comms()

        # Fixed seed so the generated keys are reproducible.
        np.random.seed(42)
        df = pd.DataFrame({"key": np.random.random(100)})
        if use_cudf:
            df = cudf.DataFrame.from_pandas(df)

        partition_counts = range(1, 5)
        for input_nparts in partition_counts:
            for output_nparts in partition_counts:
                ddf = dd.from_pandas(df.copy(), npartitions=input_nparts)
                ddf = ddf.persist(workers=all_workers)
                ddf = explicit_comms_shuffle(
                    ddf, ["key"], npartitions=output_nparts
                )
                ddf = ddf.persist()

                assert ddf.npartitions == output_nparts

                # Check that each partition of `ddf` hashes to the same value
                per_partition = ddf.map_partitions(check_partitions, output_nparts)
                result = per_partition.compute()
                assert all(result.to_list())

                # Check the values of `ddf` (ignoring the row order)
                expected = df.sort_values("key")
                got = ddf.compute().sort_values("key")
                assert_eq(got, expected)
def _test_local_cluster(protocol):
    """Run ``my_rank`` on a four-worker cluster through a ``CommsContext``.

    The values returned by ``CommsContext.run(my_rank, 0)`` must sum to
    ``0 + 1 + 2 + 3`` (presumably one distinct rank per worker -- confirm
    against ``my_rank``).

    Parameters
    ----------
    protocol
        Protocol handed to ``LocalCluster``.
    """
    ucx_settings = {
        "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
    }
    dask.config.update(dask.config.global_config, ucx_settings, priority="new")

    cluster_options = dict(
        protocol=protocol,
        dashboard_address=None,
        n_workers=4,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster) as client:
        context = comms.CommsContext(client)
        ranks = context.run(my_rank, 0)
        assert sum(ranks) == sum(range(4))
def test_get_ucx_config(
    enable_tcp_over_ucx, enable_infiniband, enable_nvlink, net_devices
):
    """Check ``get_ucx_config`` output for every transport-flag combination.

    The test is skipped when ``ucp`` cannot be imported. Keys are looked up
    through ``canonical_name`` so the test follows whatever spelling the
    returned mapping uses.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband, enable_nvlink
        Transport flags (``True``, ``False`` or ``None``) forwarded to
        ``get_ucx_config``.
    net_devices
        Device specification forwarded to ``get_ucx_config``. ``"auto"``
        combined with ``enable_infiniband is False`` is expected to raise
        ``ValueError``.
    """
    pytest.importorskip("ucp")

    kwargs = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "enable_nvlink": enable_nvlink,
        "net_devices": net_devices,
        "cuda_device_index": 0,
    }

    # Guard clause: this combination must be rejected, nothing else to check.
    if net_devices == "auto" and enable_infiniband is False:
        with pytest.raises(ValueError):
            get_ucx_config(**kwargs)
        return

    ucx_config = get_ucx_config(**kwargs)

    def configured(option):
        """Return the config entry for ``option`` under its canonical key."""
        return ucx_config[canonical_name(option, ucx_config)]

    def expected_flag(own, first_other, second_other):
        """Return the value one transport entry is expected to hold.

        An explicitly given flag is passed through unchanged. An unset
        (``None``) flag is expected to become ``True`` only when neither of
        the other two flags is ``True`` and at least one of them is set;
        otherwise it stays ``None``.
        """
        if own is not None:
            return own
        others_both_unset = first_other is None and second_other is None
        if (
            first_other is not True
            and second_other is not True
            and not others_both_unset
        ):
            return True
        return None

    assert configured("create_cuda_context") is True

    assert configured("tcp") is expected_flag(
        enable_tcp_over_ucx, enable_infiniband, enable_nvlink
    )
    assert configured("infiniband") is expected_flag(
        enable_infiniband, enable_tcp_over_ucx, enable_nvlink
    )
    assert configured("nvlink") is expected_flag(
        enable_nvlink, enable_tcp_over_ucx, enable_infiniband
    )

    transport_flags = [enable_tcp_over_ucx, enable_infiniband, enable_nvlink]
    any_specified = any(flag is not None for flag in transport_flags)
    all_disabled = all(flag is False for flag in transport_flags)
    if any_specified and not all_disabled:
        assert configured("cuda-copy") is True
    else:
        assert configured("cuda-copy") is None

    # Since the actual device is system-dependent, we don't do any checks for
    # "auto" at the moment. If any InfiniBand devices are available, that will
    # be the value of "net-devices", otherwise an empty string.
    if net_devices == "eth0":
        assert configured("net-devices") == "eth0"
    elif net_devices != "auto":
        assert configured("net-devices") is None