Beispiel #1
0
def _test_initialize_ucx_infiniband():
    """Check that enabling InfiniBand is reflected in the live UCX config.

    Calls ``initialize`` with InfiniBand enabled on ``ib0``, starts a
    one-worker ``LocalCluster`` over the ``ucx`` protocol, runs a small
    dask-array reduction as a smoke test, and then inspects
    ``ucp.get_config()`` on both the scheduler and the worker.
    """
    kwargs = {"enable_infiniband": True, "net_devices": "ib0"}
    initialize(**kwargs)
    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        # NOTE(review): the settings are nested under "ucx" here; confirm
        # this is the key the installed distributed release actually reads.
        config={"ucx": get_ucx_config(**kwargs)},
    ) as cluster:
        with Client(cluster) as client:
            # Smoke test: sum(range(10000)) computed across ten chunks.
            res = da.from_array(numpy.arange(10000), chunks=(1000,))
            res = res.sum().compute()
            assert res == 49995000

            def check_ucx_options():
                """Assert the expected UCX options in the current process."""
                conf = ucp.get_config()
                assert "TLS" in conf
                assert "rc" in conf["TLS"]
                assert "tcp" in conf["TLS"]
                assert "sockcm" in conf["TLS"]
                assert "cuda_copy" in conf["TLS"]
                assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
                assert conf["NET_DEVICES"] == "ib0"
                return True

            # Identity check rather than ``== True``: the helper must return
            # the literal True, not merely something that compares equal.
            assert client.run_on_scheduler(check_ucx_options) is True
            assert all(client.run(check_ucx_options).values())
Beispiel #2
0
def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, net_devices):
    """Check the mapping returned by ``get_ucx_config`` for one combination.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband : bool
        Transport flags forwarded to ``get_ucx_config``.
    net_devices : str
        Forwarded to ``get_ucx_config``; ``"eth0"``, ``"auto"`` and ``""``
        are the values this test distinguishes.
    """
    ucx_config = get_ucx_config(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    if enable_tcp_over_ucx is True:
        assert ucx_config["tcp"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["tcp"] is None

    if enable_infiniband is True:
        assert ucx_config["infiniband"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["infiniband"] is None

    if enable_tcp_over_ucx is False and enable_infiniband is False:
        assert ucx_config["cuda_copy"] is None

    if net_devices == "eth0":
        assert ucx_config["net-devices"] == "eth0"
    elif net_devices == "auto":
        # Since the actual device is system-dependent, we don't do any
        # checks at the moment. If any InfiniBand devices are available,
        # that will be the value of "net-devices", otherwise an empty string.
        pass
    elif net_devices == "":
        # This used to test for the misspelled key "net-device", which is
        # never set and therefore passed vacuously. Check the real key and
        # accept it being absent, None or empty.
        # NOTE(review): assumes ucx_config is a dict-like exposing .get().
        assert not ucx_config.get("net-devices")
Beispiel #3
0
def _test_initialize_ucx_nvlink():
    """Check that enabling NVLink is reflected in the live UCX config.

    Initializes with ``enable_nvlink=True``, starts a one-worker UCX
    ``LocalCluster``, runs a small reduction, and validates
    ``ucp.get_config()`` on the scheduler and on the worker.
    """
    options = {"enable_nvlink": True}
    initialize(**options)

    def check_ucx_options():
        """Assert the expected UCX transports in the current process."""
        conf = ucp.get_config()
        assert "TLS" in conf
        transports = conf["TLS"]
        assert "cuda_ipc" in transports
        assert "tcp" in transports
        assert "cuda_copy" in transports
        if _ucx_110:
            assert "tcp" in conf["SOCKADDR_TLS_PRIORITY"]
        else:
            assert "sockcm" in transports
            assert "sockcm" in conf["SOCKADDR_TLS_PRIORITY"]
        return True

    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config(**options)},
    ) as cluster, Client(cluster) as client:
        # Smoke test: sum(range(10000)) computed across ten chunks.
        array = da.from_array(numpy.arange(10000), chunks=(1000,))
        total = array.sum().compute()
        assert total == 49995000

        assert client.run_on_scheduler(check_ucx_options) is True
        assert all(client.run(check_ucx_options).values())
Beispiel #4
0
def _test_initialize_ucx_all():
    """Check the live UCX config when ``initialize`` is called with defaults.

    Starts a one-worker UCX ``LocalCluster`` configured from
    ``get_ucx_config()``, runs a small reduction, and validates
    ``ucp.get_config()`` on the scheduler and on the worker.
    """
    initialize()

    def check_ucx_options():
        """Assert TLS is "all" and the socket-address priorities are set."""
        conf = ucp.get_config()
        assert "TLS" in conf
        assert conf["TLS"] == "all"
        priority = conf["SOCKADDR_TLS_PRIORITY"]
        assert all(p in priority for p in ("rdmacm", "tcp", "sockcm"))
        return True

    with LocalCluster(
        protocol="ucx",
        dashboard_address=None,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        config={"distributed.comm.ucx": get_ucx_config()},
    ) as cluster, Client(cluster) as client:
        # Smoke test: sum(range(10000)) computed across ten chunks.
        array = da.from_array(numpy.arange(10000), chunks=(1000,))
        total = array.sum().compute()
        assert total == 49995000

        assert client.run_on_scheduler(check_ucx_options) is True
        assert all(client.run(check_ucx_options).values())
Beispiel #5
0
def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, net_devices):
    """Check the mapping returned by ``get_ucx_config`` for one combination.

    Skipped when ``ucp`` cannot be imported. With ``net_devices="auto"`` and
    InfiniBand explicitly disabled, ``get_ucx_config`` is expected to raise
    ``ValueError``; every other combination is validated key by key.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband : bool
        Transport flags forwarded to ``get_ucx_config``.
    net_devices : str
        Forwarded to ``get_ucx_config``; ``"eth0"``, ``"auto"`` and ``""``
        are the values this test distinguishes.
    """
    pytest.importorskip("ucp")

    kwargs = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "net_devices": net_devices,
        "cuda_device_index": 0,
    }
    if net_devices == "auto" and enable_infiniband is False:
        with pytest.raises(ValueError):
            get_ucx_config(**kwargs)
        return

    ucx_config = get_ucx_config(**kwargs)

    if enable_tcp_over_ucx is True:
        assert ucx_config["tcp"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["tcp"] is None

    if enable_infiniband is True:
        assert ucx_config["infiniband"] is True
        assert ucx_config["cuda_copy"] is True
    else:
        assert ucx_config["infiniband"] is None

    if enable_tcp_over_ucx is False and enable_infiniband is False:
        assert ucx_config["cuda_copy"] is None

    if net_devices == "eth0":
        assert ucx_config["net-devices"] == "eth0"
    elif net_devices == "auto":
        # Since the actual device is system-dependent, we don't do any
        # checks at the moment. If any InfiniBand devices are available,
        # that will be the value of "net-devices", otherwise an empty string.
        pass
    elif net_devices == "":
        # This used to test for the misspelled key "net-device", which is
        # never set and therefore passed vacuously. Check the real key and
        # accept it being absent, None or empty.
        # NOTE(review): assumes ucx_config is a dict-like exposing .get().
        assert not ucx_config.get("net-devices")
def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
    """Compare an explicit-comms dask merge against a plain pandas merge.

    Parameters
    ----------
    backend : str
        ``"cudf"`` converts the inputs to cuDF frames (skipping the test if
        cuDF is unavailable); any other value keeps them as pandas frames.
    protocol : str
        Communication protocol handed to ``LocalCluster``.
    n_workers : int
        Number of workers; also scales the row and partition counts.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        initialize(enable_tcp_over_ucx=True)
    else:
        ucx_settings = {
            "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
        }
        dask.config.update(
            dask.config.global_config, ucx_settings, priority="new"
        )

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster):
        nrows = n_workers * 10

        # Two frames that can be joined on the "key" column: the left one
        # holds every key in order, the right one a shuffled subset.
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        first_kept = nrows // 3
        right = pd.DataFrame(
            {
                "key": shuffled_keys[first_kept:],
                "payload2": np.arange(nrows)[first_kept:],
            }
        )
        expected = left.merge(right, on="key").set_index("key")

        if use_cudf:
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        # Deliberately different partition counts on the two sides.
        right_nparts = n_workers - 1 if n_workers > 1 else 1
        ddf_left = dd.from_pandas(left, npartitions=n_workers + 1)
        ddf_right = dd.from_pandas(right, npartitions=right_nparts)

        with dask.config.set(explicit_comms=True):
            merged = ddf_left.merge(ddf_right, on="key")
            got = merged.set_index("key").compute()
        assert_eq(got, expected)
def _test_dataframe_shuffle(backend, protocol, n_workers):
    """Exercise ``explicit_comms_shuffle`` over a grid of partition counts.

    For every input/output partition count in 1..4 the shuffled frame must
    have the requested number of partitions, pass ``check_partitions`` on
    each partition, and hold the same rows as the input (order ignored).

    Parameters
    ----------
    backend : str
        ``"cudf"`` converts the input to a cuDF frame (skipping the test if
        cuDF is unavailable); any other value keeps it as a pandas frame.
    protocol : str
        Communication protocol handed to ``LocalCluster``.
    n_workers : int
        Number of workers in the cluster.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        initialize(enable_tcp_over_ucx=True)
    else:
        ucx_settings = {
            "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
        }
        dask.config.update(
            dask.config.global_config, ucx_settings, priority="new"
        )

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster) as client:
        worker_addresses = list(client.get_worker_logs().keys())
        comms.default_comms()
        np.random.seed(42)
        source = pd.DataFrame({"key": np.random.random(100)})
        if use_cudf:
            source = cudf.DataFrame.from_pandas(source)

        partition_counts = range(1, 5)
        for input_nparts in partition_counts:
            for output_nparts in partition_counts:
                shuffled = dd.from_pandas(
                    source.copy(), npartitions=input_nparts
                ).persist(workers=worker_addresses)
                shuffled = explicit_comms_shuffle(
                    shuffled, ["key"], npartitions=output_nparts
                ).persist()

                assert shuffled.npartitions == output_nparts

                # Every partition must pass the per-partition check.
                per_partition = shuffled.map_partitions(
                    check_partitions, output_nparts
                ).compute()
                assert all(per_partition.to_list())

                # Same values as the input, ignoring row order.
                expected = source.sort_values("key")
                got = shuffled.compute().sort_values("key")
                assert_eq(got, expected)
def _test_local_cluster(protocol):
    """Run ``my_rank`` on a four-worker cluster and check the ranks' sum.

    Parameters
    ----------
    protocol : str
        Communication protocol handed to ``LocalCluster``.
    """
    ucx_settings = {
        "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True),
    }
    dask.config.update(dask.config.global_config, ucx_settings, priority="new")

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=4,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster) as client:
        context = comms.CommsContext(client)
        ranks = context.run(my_rank, 0)
        # The four results must add up to 0 + 1 + 2 + 3.
        assert sum(ranks) == sum(range(4))
Beispiel #9
0
def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, enable_nvlink,
                        net_devices):
    """Check the mapping returned by ``get_ucx_config`` for one combination.

    Skipped when ``ucp`` cannot be imported. With ``net_devices="auto"`` and
    InfiniBand explicitly disabled, ``get_ucx_config`` is expected to raise
    ``ValueError``. Otherwise each transport entry must equal the value
    passed in; for a transport left as ``None`` the expected entry is
    ``True`` when neither other transport is ``True`` and at least one of
    them is not ``None``, and ``None`` in all remaining cases.

    Parameters
    ----------
    enable_tcp_over_ucx, enable_infiniband, enable_nvlink : bool or None
        Transport flags forwarded to ``get_ucx_config``.
    net_devices : str
        Forwarded to ``get_ucx_config``; ``"auto"`` and ``"eth0"`` are
        handled specially, any other value expects a ``None`` entry.
    """
    pytest.importorskip("ucp")

    kwargs = {
        "enable_tcp_over_ucx": enable_tcp_over_ucx,
        "enable_infiniband": enable_infiniband,
        "enable_nvlink": enable_nvlink,
        "net_devices": net_devices,
        "cuda_device_index": 0,
    }
    if net_devices == "auto" and enable_infiniband is False:
        with pytest.raises(ValueError):
            get_ucx_config(**kwargs)
        return

    ucx_config = get_ucx_config(**kwargs)

    assert ucx_config[canonical_name("create_cuda_context",
                                     ucx_config)] is True

    # (config key, requested value) for each transport, in checking order.
    transports = [
        ("tcp", enable_tcp_over_ucx),
        ("infiniband", enable_infiniband),
        ("nvlink", enable_nvlink),
    ]
    for key, requested in transports:
        others = [value for name, value in transports if name != key]
        actual = ucx_config[canonical_name(key, ucx_config)]
        if requested is not None:
            assert actual is requested
        elif (all(other is not True for other in others)
                and not all(other is None for other in others)):
            assert actual is True
        else:
            assert actual is None

    requested_values = [value for _, value in transports]
    any_specified = any(value is not None for value in requested_values)
    all_disabled = all(value is False for value in requested_values)
    cuda_copy = ucx_config[canonical_name("cuda-copy", ucx_config)]
    if any_specified and not all_disabled:
        assert cuda_copy is True
    else:
        assert cuda_copy is None

    # For "auto" the actual device is system-dependent, so nothing is
    # checked: if any InfiniBand devices are available, that will be the
    # value of "net-devices", otherwise an empty string.
    if net_devices == "eth0":
        assert ucx_config[canonical_name("net-devices", ucx_config)] == "eth0"
    elif net_devices != "auto":
        assert ucx_config[canonical_name("net-devices", ucx_config)] is None