async def run():
    """Time 100 computations of ``a + a.T`` on an asynchronous UCX ``DGX`` cluster.

    ``enable_tcp_over_ucx``, ``enable_infiniband`` and ``enable_nvlink`` are
    not defined in this function; they are read from the surrounding scope and
    passed unchanged to both ``initialize`` and ``DGX``.  The elapsed time of
    every iteration is printed.
    """
    # One dict so initialize() and the cluster always receive the same flags.
    ucx_flags = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "enable_nvlink": enable_nvlink,
    }
    initialize(create_cuda_context=True, **ucx_flags)

    async with DGX(
        interface="enp1s0f0", protocol="ucx", asynchronous=True, **ucx_flags
    ) as dgx:
        async with Client(dgx, asynchronous=True) as client:
            # Random state built on top of cupy.random.RandomState.
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(10, 1, (4000, 4000), chunks=(1000, 1000))
            x = a + a.T

            for i in range(100):
                print("Running iteration:", i)
                # perf_counter() is monotonic, so the measured interval is
                # not distorted by system clock adjustments.
                start = time.perf_counter()
                await client.compute(x)
                print("Time for iteration", i, ":", time.perf_counter() - start)
def _test_tcp_only():
    """Sum ``0..9999`` on a ``DGX(protocol="tcp")`` cluster and check the total.

    Creating and using the cluster is expected to emit a
    ``DeprecationWarning``.
    """
    with pytest.warns(DeprecationWarning):
        with DGX(protocol="tcp") as cluster, Client(cluster):
            values = da.from_array(numpy.arange(10000), chunks=(1000,))
            total = values.sum().compute()
            assert total == 49995000
async def test_dgx_ucx_infiniband_nvlink(params):
    """Compute ``a + a.T`` on an asynchronous UCX ``DGX`` cluster.

    ``params`` must provide the keys ``"enable_tcp"``, ``"enable_infiniband"``
    and ``"enable_nvlink"``; their values are forwarded to both ``initialize``
    and ``DGX``.  The test makes no assertion on the result: it passes when
    the computation completes without raising.
    """
    # Only the skip-when-missing behaviour is needed; the module itself is
    # not used in this test.
    pytest.importorskip("ucp")

    ucx_flags = {
        "enable_tcp_over_ucx": params["enable_tcp"],
        "enable_infiniband": params["enable_infiniband"],
        "enable_nvlink": params["enable_nvlink"],
    }
    initialize(create_cuda_context=True, **ucx_flags)

    async with DGX(
        interface="enp1s0f0", protocol="ucx", asynchronous=True, **ucx_flags
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            # Random state built on top of cupy.random.RandomState.
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            a = rs.normal(10, 1, (10000, 10000), chunks=(2500, 2500))
            x = a + a.T
            # The computed value is intentionally discarded.
            await client.compute(x)
async def run():
    """Time 100 computations of a ``dask_cudf`` column sum on a UCX ``DGX`` cluster.

    ``enable_tcp_over_ucx``, ``enable_infiniband`` and ``enable_nvlink`` are
    not defined in this function; they are read from the surrounding scope and
    passed unchanged to both ``initialize`` and ``DGX``.  The elapsed time of
    every iteration is printed.
    """
    # One dict so initialize() and the cluster always receive the same flags.
    ucx_flags = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "enable_nvlink": enable_nvlink,
    }
    initialize(create_cuda_context=True, **ucx_flags)

    async with DGX(
        interface="enp1s0f0", protocol="ucx", asynchronous=True, **ucx_flags
    ) as dgx:
        async with Client(dgx, asynchronous=True) as client:
            # Single column "a" holding 0 .. 2**16 - 1, split into 2 partitions.
            frame = cudf.DataFrame({"a": range(2**16)})
            d = dask_cudf.from_cudf(frame, npartitions=2)
            r = d.sum()

            for i in range(100):
                print("Running iteration:", i)
                # perf_counter() is monotonic, so the measured interval is
                # not distorted by system clock adjustments.
                start = time.perf_counter()
                await client.compute(r)
                print("Time for iteration", i, ":", time.perf_counter() - start)
async def test_dgx_tcp_over_ucx():
    """Start a UCX ``DGX`` cluster with TCP over UCX enabled and attach a client.

    The test makes no assertion: it passes when the cluster and the client
    can be entered and exited without raising.
    """
    # NOTE(review): this updates os.environ for the whole process and the
    # previous values are never restored -- confirm that later tests do not
    # depend on a clean environment.
    os.environ.update(get_ucx_env(enable_tcp=True))

    # Kept after the environment update, matching the original statement
    # order.  Only the skip-when-missing behaviour is needed; the module
    # itself is not used in this test.
    pytest.importorskip("ucp")

    async with DGX(
        protocol="ucx", enable_tcp_over_ucx=True, asynchronous=True
    ) as cluster:
        async with Client(cluster, asynchronous=True):
            pass
def _test_tcp_over_ucx():
    """Check a ``DGX(enable_tcp_over_ucx=True)`` cluster end to end.

    Sums ``0..9999`` on the cluster, then runs ``check_ucx_options`` through
    ``client.run`` and requires every returned value to be truthy.  Creating
    and using the cluster is expected to emit a ``DeprecationWarning``.
    """

    def check_ucx_options():
        """Assert the expected entries in ``ucp.get_config()``; return True."""
        conf = ucp.get_config()
        assert "TLS" in conf
        tls = conf["TLS"]
        for transport in ("tcp", "sockcm", "cuda_copy"):
            assert transport in tls
        assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
        return True

    with pytest.warns(DeprecationWarning):
        with DGX(enable_tcp_over_ucx=True) as cluster, Client(cluster) as client:
            values = da.from_array(numpy.arange(10000), chunks=(1000,))
            total = values.sum().compute()
            assert total == 49995000

            outcomes = client.run(check_ucx_options)
            assert all(outcomes.values())
def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink):
    """Check a TCP-over-UCX ``DGX`` cluster with optional InfiniBand / NVLink.

    Sums a CuPy ``0..9999`` range on the cluster, verifies (when
    ``enable_infiniband`` is set) that each worker spec's ``UCX_NET_DEVICES``
    equals the matching entry from ``_get_dgx_net_devices()``, and finally
    runs ``check_ucx_options`` through ``client.run``.  Creating and using the
    cluster is expected to emit a ``DeprecationWarning``.
    """
    cupy = pytest.importorskip("cupy")

    def check_ucx_options():
        """Assert the expected entries in ``ucp.get_config()``; return True."""
        conf = ucp.get_config()
        assert "TLS" in conf
        tls = conf["TLS"]
        for transport in ("tcp", "sockcm", "cuda_copy"):
            assert transport in tls
        assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
        if enable_nvlink:
            assert "cuda_ipc" in conf["TLS"]
        if enable_infiniband:
            assert "rc" in conf["TLS"]
        return True

    net_devices = _get_dgx_net_devices()
    if enable_infiniband:
        ucx_net_devices = "auto"
    else:
        ucx_net_devices = None

    with pytest.warns(DeprecationWarning):
        with DGX(
            enable_tcp_over_ucx=True,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            ucx_net_devices=ucx_net_devices,
        ) as cluster, Client(cluster) as client:
            values = da.from_array(cupy.arange(10000), chunks=(1000,), asarray=False)
            total = values.sum().compute()
            assert total == 49995000

            if enable_infiniband:
                # Built as a list so that every worker entry is evaluated
                # before all() inspects the results.
                device_matches = [
                    cluster.worker_spec[name]["options"]["env"]["UCX_NET_DEVICES"]
                    == net_devices[name]
                    for name in cluster.worker_spec
                ]
                assert all(device_matches)

            outcomes = client.run(check_ucx_options)
            assert all(outcomes.values())
async def test_dgx():
    """Start an asynchronous ``DGX`` cluster without InfiniBand or NVLink.

    The test makes no assertion: it passes when the cluster and an
    asynchronous client can be entered and exited without raising.
    """
    cluster_kwargs = {
        "enable_infiniband": False,
        "enable_nvlink": False,
        "asynchronous": True,
    }
    async with DGX(**cluster_kwargs) as cluster, Client(cluster, asynchronous=True):
        pass
async def test_dgx():
    """Start an asynchronous ``DGX`` cluster with its default options.

    The test makes no assertion: it passes when the cluster and an
    asynchronous client can be entered and exited without raising.
    """
    async with DGX(asynchronous=True) as cluster, Client(cluster, asynchronous=True):
        pass