Example 1
async def test_scale_up_down(cleanup):
    start = time()
    async with SpecCluster(
            scheduler=scheduler,
            workers={
                "slow": {
                    "cls": SlowWorker,
                    "options": {
                        "delay": 5
                    }
                },
                "fast": {
                    "cls": Worker,
                    "options": {}
                },
            },
            asynchronous=True,
    ) as cluster:
        cluster.scale(1)  # remove a worker, hopefully the one we don't have
        await cluster

        assert list(cluster.worker_spec) == ["fast"]

        cluster.scale(0)
        await cluster
        assert not cluster.worker_spec
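These examples exercise distributed's SpecCluster and refer to module-level helpers that the listing never shows: a scheduler spec dict, a worker_spec dict, and small worker classes such as MyWorker, SlowWorker, and BrokenWorker. The sketch below gives plausible definitions under those assumptions; the real test module may differ in detail (for instance, SlowWorker might wrap a Worker rather than subclass it).

import asyncio

from distributed import Scheduler, Worker

# Assumed module-level fixtures; names match what the examples reference.
scheduler = {"cls": Scheduler, "options": {"port": 0}}


class MyWorker(Worker):
    """A trivial Worker subclass, used to check that custom classes are accepted."""


class SlowWorker(Worker):
    """A worker whose startup is delayed by ``delay`` seconds (sketch)."""

    def __init__(self, *args, delay=0, **kwargs):
        self.delay = delay
        super().__init__(*args, **kwargs)

    async def start(self):
        # The exact startup hook depends on the distributed version; the point
        # is simply to stall startup so tests can observe a partial cluster.
        await asyncio.sleep(self.delay)
        return await super().start()


class BrokenWorker(Worker):
    """A worker that always fails to start, for error-handling tests."""

    async def start(self):
        raise Exception("Worker Broken")


worker_spec = {
    0: {"cls": Worker, "options": {"nthreads": 1}},
    1: {"cls": Worker, "options": {"nthreads": 2}},
    "my-worker": {"cls": MyWorker, "options": {"nthreads": 3}},
}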
Example 2
async def test_run_spec_cluster_worker_names(cleanup):
    worker = {"cls": Worker, "options": {"nthreads": 1}}

    class MyCluster(SpecCluster):
        def _new_worker_name(self, worker_number):
            return f"prefix-{self.name}-{worker_number}-suffix"

    async with SpecCluster(asynchronous=True,
                           scheduler=scheduler,
                           worker=worker) as cluster:
        cluster.scale(2)
        await cluster
        worker_names = [0, 1]
        assert list(cluster.worker_spec) == worker_names
        assert sorted(list(cluster.workers)) == worker_names

    async with MyCluster(asynchronous=True,
                         scheduler=scheduler,
                         worker=worker,
                         name="test-name") as cluster:
        worker_names = [
            "prefix-test-name-0-suffix", "prefix-test-name-1-suffix"
        ]
        cluster.scale(2)
        await cluster
        assert list(cluster.worker_spec) == worker_names
        assert sorted(list(cluster.workers)) == worker_names
Example 3
async def test_unexpected_closed_worker(cleanup):
    worker = {"cls": Worker, "options": {"nthreads": 1}}
    with dask.config.set({"distributed.deploy.lost-worker-timeout": "10ms"}):
        async with SpecCluster(asynchronous=True,
                               scheduler=scheduler,
                               worker=worker) as cluster:
            assert not cluster.workers
            assert not cluster.worker_spec

            # Scale up
            cluster.scale(2)
            assert not cluster.workers
            assert cluster.worker_spec

            await cluster
            assert len(cluster.workers) == 2

            # Close one
            await list(cluster.workers.values())[0].close()
            start = time()
            while len(cluster.workers) > 1:  # wait for messages to flow around
                await asyncio.sleep(0.01)
                assert time() < start + 2
            assert len(cluster.workers) == 1
            assert len(cluster.worker_spec) == 2

            await cluster
            assert len(cluster.workers) == 2
Example 4
async def test_startup(cleanup):
    start = time()
    async with SpecCluster(
            scheduler=scheduler,
            workers={
                0: {
                    "cls": Worker,
                    "options": {}
                },
                1: {
                    "cls": SlowWorker,
                    "options": {
                        "delay": 5
                    }
                },
                2: {
                    "cls": SlowWorker,
                    "options": {
                        "delay": 0
                    }
                },
            },
            asynchronous=True,
    ) as cluster:
        assert len(cluster.workers) == len(cluster.worker_spec) == 3
        assert time() < start + 5
        assert 0 <= len(cluster.scheduler_info["workers"]) <= 2

        async with Client(cluster, asynchronous=True) as client:
            await client.wait_for_workers(n_workers=2)
Example 5
async def test_adaptive_killed_worker(cleanup):
    with dask.config.set({"distributed.deploy.lost-worker-timeout": 0.1}):

        async with SpecCluster(
                asynchronous=True,
                worker={
                    "cls": Nanny,
                    "options": {
                        "nthreads": 1
                    }
                },
                scheduler={
                    "cls": Scheduler,
                    "options": {
                        "port": 0
                    }
                },
        ) as cluster:

            async with Client(cluster, asynchronous=True) as client:

                cluster.adapt(minimum=1, maximum=1)

                # Scale up a cluster with 1 worker.
                while len(cluster.workers) != 1:
                    await asyncio.sleep(0.01)

                future = client.submit(sleep, 0.1)

                # Kill the only worker.
                [worker_id] = cluster.workers
                await cluster.workers[worker_id].kill()

                # Wait for the worker to re-spawn and finish sleeping.
                await future.result(timeout=5)
Example 6
async def test_logs(cleanup):
    worker = {"cls": Worker, "options": {"nthreads": 1}}
    async with SpecCluster(
        asynchronous=True, scheduler=scheduler, worker=worker
    ) as cluster:
        cluster.scale(2)
        await cluster

        logs = await cluster.logs()
        assert is_valid_xml("<div>" + logs._repr_html_() + "</div>")
        assert "Scheduler" in logs
        for worker in cluster.scheduler.workers:
            assert worker in logs

        assert "Registered" in str(logs)

        logs = await cluster.logs(scheduler=True, workers=False)
        assert list(logs) == ["Scheduler"]

        logs = await cluster.logs(scheduler=False, workers=False)
        assert list(logs) == []

        logs = await cluster.logs(scheduler=False, workers=True)
        assert set(logs) == set(cluster.scheduler.workers)

        w = toolz.first(cluster.scheduler.workers)
        logs = await cluster.logs(scheduler=False, workers=[w])
        assert set(logs) == {w}
Example 7
def test_spec_close_clusters(loop):
    workers = {0: {"cls": Worker}}
    scheduler = {"cls": Scheduler, "options": {"port": 0}}
    cluster = SpecCluster(workers=workers, scheduler=scheduler, loop=loop)
    assert cluster in SpecCluster._instances
    close_clusters()
    assert cluster.status == "closed"
Example 8
async def test_scale(cleanup):
    worker = {"cls": Worker, "options": {"nthreads": 1}}
    async with SpecCluster(asynchronous=True,
                           scheduler=scheduler,
                           worker=worker) as cluster:
        assert not cluster.workers
        assert not cluster.worker_spec

        # Scale up
        cluster.scale(2)
        assert not cluster.workers
        assert cluster.worker_spec

        await cluster
        assert len(cluster.workers) == 2

        # Scale down
        cluster.scale(1)
        assert len(cluster.workers) == 2

        await cluster
        assert len(cluster.workers) == 1

        # Can use with await
        await cluster.scale(2)
        await cluster
        assert len(cluster.workers) == 2
Example 9
async def test_adaptive(cleanup):
    start = time()
    async with SpecCluster(
            scheduler=scheduler,
            workers={"fast": {
                "cls": Worker,
                "options": {}
            }},
            worker={
                "cls": SlowWorker,
                "options": {
                    "delay": 5
                }
            },
            asynchronous=True,
    ) as cluster:
        cluster.adapt(minimum=1,
                      maximum=4,
                      target_duration="1s",
                      interval="20ms")
        async with Client(cluster, asynchronous=True) as client:
            futures = client.map(slowinc, range(200), delay=0.1)

            while len(cluster.worker_spec) <= 1:
                await asyncio.sleep(0.05)

            del futures

            while len(cluster.worker_spec) > 1:
                await asyncio.sleep(0.05)

            assert list(cluster.worker_spec) == ["fast"]
Example 10
async def test_bad_close(cleanup):
    with warnings.catch_warnings(record=True) as record:
        cluster = SpecCluster(workers=worker_spec,
                              scheduler=scheduler,
                              asynchronous=True)
        await cluster.close()

    assert not record
Example 11
async def test_nanny_port():
    scheduler = {"cls": Scheduler}
    workers = {0: {"cls": Nanny, "options": {"port": 9200}}}

    async with SpecCluster(scheduler=scheduler,
                           workers=workers,
                           asynchronous=True) as cluster:
        pass
Example 12
def test_loop_started():
    cluster = SpecCluster(worker_spec,
                          scheduler={
                              "cls": Scheduler,
                              "options": {
                                  "port": 0
                              }
                          })
Example 13
async def test_broken_worker():
    with pytest.raises(Exception) as info:
        async with SpecCluster(
            asynchronous=True,
            workers={"good": {"cls": Worker}, "bad": {"cls": BrokenWorker}},
        ) as cluster:
            pass

    assert "Broken" in str(info.value)
Example 14
async def test_dashboard_link(cleanup):
    async with SpecCluster(
        workers=worker_spec,
        scheduler={
            "cls": Scheduler,
            "options": {"port": 0, "dashboard_address": ":12345"},
        },
        asynchronous=True,
    ) as cluster:
        assert "12345" in cluster.dashboard_link
Example 15
async def test_MultiWorker(cleanup):
    async with SpecCluster(
            scheduler=scheduler,
            worker={
                "cls": MultiWorker,
                "options": {
                    "n": 2,
                    "nthreads": 4,
                    "memory_limit": "4 GB"
                },
                "group": ["-0", "-1"],
            },
            asynchronous=True,
    ) as cluster:
        s = cluster.scheduler
        async with Client(cluster, asynchronous=True) as client:
            cluster.scale(2)
            await cluster
            assert len(cluster.worker_spec) == 2
            await client.wait_for_workers(4)
            while len(cluster.scheduler_info["workers"]) < 4:
                await asyncio.sleep(0.01)

            while "workers=4" not in repr(cluster):
                await asyncio.sleep(0.1)

            workers_line = re.search("(Workers.+)",
                                     cluster._repr_html_()).group(1)
            assert re.match("Workers.*4", workers_line)

            cluster.scale(1)
            await cluster
            assert len(s.workers) == 2

            cluster.scale(memory="6GB")
            await cluster
            assert len(cluster.worker_spec) == 2
            assert len(s.workers) == 4
            assert cluster.plan == {ws.name for ws in s.workers.values()}

            cluster.scale(cores=10)
            await cluster
            assert len(cluster.workers) == 3

            adapt = cluster.adapt(minimum=0, maximum=4)

            for i in range(adapt.wait_count):  # relax down to 0 workers
                await adapt.adapt()
            await cluster
            assert not s.workers

            future = client.submit(lambda x: x + 1, 10)
            await future
            assert len(cluster.workers) == 1
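MultiWorker itself is not defined in this listing; the SpecCluster feature being exercised is the "group" key. One spec entry can stand for several scheduler-visible workers whose names are the spec name followed by each group suffix, which is how cluster.worker_spec can hold 2 entries above while the scheduler reports 4 workers. A minimal sketch of such an entry, assuming a hypothetical MultiWorker class that starts n worker processes:

# Sketch only: MultiWorker is assumed here, not defined in this listing.
multi_entry = {
    "cls": MultiWorker,
    "options": {"n": 2, "nthreads": 4, "memory_limit": "4 GB"},
    # Each suffix names one scheduler-visible worker: spec entry 0 contributes
    # workers "0-0" and "0-1" to cluster.plan.
    "group": ["-0", "-1"],
}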
Example 16
async def test_ProcessInterfaceValid(cleanup):
    async with SpecCluster(scheduler=scheduler,
                           worker={"cls": ProcessInterface},
                           asynchronous=True) as cluster:
        cluster.scale(2)
        await cluster
        assert len(cluster.worker_spec) == len(cluster.workers) == 2

        cluster.scale(1)
        await cluster
        assert len(cluster.worker_spec) == len(cluster.workers) == 1
Example 17
async def test_scale_cores_memory(cleanup):
    async with SpecCluster(
        scheduler=scheduler,
        worker={"cls": Worker, "options": {"nthreads": 1}},
        asynchronous=True,
    ) as cluster:
        cluster.scale(cores=2)
        assert len(cluster.worker_spec) == 2
        with pytest.raises(ValueError) as info:
            cluster.scale(memory="5GB")

        assert "memory" in str(info.value)
Example 18
async def test_restart():
    """Regression test for https://github.com/dask/distributed/issues/3062"""
    worker = {"cls": Nanny, "options": {"nthreads": 1}}
    async with SpecCluster(asynchronous=True,
                           scheduler=scheduler,
                           worker=worker) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            cluster.scale(2)
            await cluster
            assert len(cluster.workers) == 2
            await client.restart()
            while len(cluster.workers) < 2:
                await asyncio.sleep(0.01)
Example 19
async def test_restart(cleanup):
    # Regression test for https://github.com/dask/distributed/issues/3062
    worker = {"cls": Nanny, "options": {"nthreads": 1}}
    with dask.config.set({"distributed.deploy.lost-worker-timeout": "2s"}):
        async with SpecCluster(asynchronous=True,
                               scheduler=scheduler,
                               worker=worker) as cluster:
            async with Client(cluster, asynchronous=True) as client:
                cluster.scale(2)
                await cluster
                assert len(cluster.workers) == 2
                await client.restart()
                start = time()
                while len(cluster.workers) < 2:
                    await asyncio.sleep(0.5)
                    assert time() < start + 60
Example 20
async def test_MultiWorker(cleanup):
    async with SpecCluster(
            scheduler=scheduler,
            worker={
                "cls": MultiWorker,
                "options": {
                    "n": 2,
                    "nthreads": 4,
                    "memory_limit": "4 GB"
                },
                "group": ["-0", "-1"],
            },
            asynchronous=True,
    ) as cluster:
        s = cluster.scheduler
        async with Client(cluster, asynchronous=True) as client:
            cluster.scale(2)
            await cluster
            assert len(cluster.worker_spec) == 2
            await client.wait_for_workers(4)

            assert "workers=4" in repr(cluster)

            cluster.scale(1)
            await cluster
            assert len(s.workers) == 2

            cluster.scale(memory="6GB")
            await cluster
            assert len(cluster.worker_spec) == 2
            assert len(s.workers) == 4
            assert cluster.plan == {ws.name for ws in s.workers.values()}

            cluster.scale(cores=10)
            await cluster
            assert len(cluster.workers) == 3

            adapt = cluster.adapt(minimum=0, maximum=4)

            for i in range(adapt.wait_count):  # relax down to 0 workers
                await adapt.adapt()
            await cluster
            assert not s.workers

            future = client.submit(lambda x: x + 1, 10)
            await future
            assert len(cluster.workers) == 1
Example 21
async def test_scheduler_info(cleanup):
    async with SpecCluster(workers=worker_spec,
                           scheduler=scheduler,
                           asynchronous=True) as cluster:
        assert cluster.scheduler_info["id"] == cluster.scheduler.id  # present at startup

        start = time()  # wait for all workers
        while len(cluster.scheduler_info["workers"]) < len(cluster.workers):
            await asyncio.sleep(0.01)
            assert time() < start + 1

        assert set(cluster.scheduler.identity()["workers"]) == set(
            cluster.scheduler_info["workers"])
        assert (cluster.scheduler.identity()["services"] ==
                cluster.scheduler_info["services"])
        assert len(cluster.scheduler_info["workers"]) == len(cluster.workers)
Example 22
async def test_widget(cleanup):
    async with SpecCluster(
        workers=worker_spec,
        scheduler=scheduler,
        asynchronous=True,
        worker={"cls": Worker, "options": {"nthreads": 1}},
    ) as cluster:

        start = time()  # wait for all workers
        while len(cluster.scheduler_info["workers"]) < len(cluster.worker_spec):
            await asyncio.sleep(0.01)
            assert time() < start + 1

        assert "3" in cluster._widget_status()
        assert "GB" in cluster._widget_status()

        cluster.scale(5)
        assert "3 / 5" in cluster._widget_status()
Example 23
def test_spec_sync(loop):
    worker_spec = {
        0: {
            "cls": Worker,
            "options": {
                "nthreads": 1
            }
        },
        1: {
            "cls": Worker,
            "options": {
                "nthreads": 2
            }
        },
        "my-worker": {
            "cls": MyWorker,
            "options": {
                "nthreads": 3
            }
        },
    }
    with SpecCluster(workers=worker_spec, scheduler=scheduler,
                     loop=loop) as cluster:
        assert cluster.worker_spec is worker_spec

        assert len(cluster.workers) == 3
        assert set(cluster.workers) == set(worker_spec)
        assert isinstance(cluster.workers[0], Worker)
        assert isinstance(cluster.workers[1], Worker)
        assert isinstance(cluster.workers["my-worker"], MyWorker)

        assert cluster.workers[0].nthreads == 1
        assert cluster.workers[1].nthreads == 2
        assert cluster.workers["my-worker"].nthreads == 3

        with Client(cluster, loop=loop) as client:
            assert cluster.loop is cluster.scheduler.loop
            assert cluster.loop is client.loop
            result = client.submit(lambda x: x + 1, 10).result()
            assert result == 11
Example 24
async def test_specification():
    async with SpecCluster(workers=worker_spec,
                           scheduler=scheduler,
                           asynchronous=True) as cluster:
        assert cluster.worker_spec is worker_spec

        assert len(cluster.workers) == 3
        assert set(cluster.workers) == set(worker_spec)
        assert isinstance(cluster.workers[0], Worker)
        assert isinstance(cluster.workers[1], Worker)
        assert isinstance(cluster.workers["my-worker"], MyWorker)

        assert cluster.workers[0].nthreads == 1
        assert cluster.workers[1].nthreads == 2
        assert cluster.workers["my-worker"].nthreads == 3

        async with Client(cluster, asynchronous=True) as client:
            result = await client.submit(lambda x: x + 1, 10)
            assert result == 11

        for name in cluster.workers:
            assert cluster.workers[name].name == name
Example 25
def DGX(
    interface=None,
    dashboard_address=":8787",
    threads_per_worker=1,
    silence_logs=True,
    CUDA_VISIBLE_DEVICES=None,
    protocol=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    **kwargs,
):
    """ A Local Cluster for a DGX 1 machine

    NVIDIA's DGX-1 machine has a complex architecture mapping CPUs, GPUs, and
    network hardware.  This function creates a local cluster that tries to
    respect this hardware as much as possible.

    It creates one Dask worker process per GPU, and assigns each worker process
    the correct CPU cores and Network interface cards to maximize performance.
    If UCX and UCX-Py are also available, it's possible to use InfiniBand and
    NVLink connections for optimal data transfer performance.

    That being said, things aren't perfect.  Today a DGX has very high
    performance between certain sets of GPUs and not others.  A Dask DGX
    cluster that uses only certain tightly coupled parts of the computer will
    have significantly higher bandwidth than a deployment on the entire thing.

    Parameters
    ----------
    interface: str
        The external interface used to connect to the scheduler; usually
        the Ethernet interface is used here, not the InfiniBand one.
    dashboard_address: str
        The address for the scheduler dashboard.  Defaults to ":8787".
    threads_per_worker: int
        Number of threads to be used for each CUDA worker process.
    silence_logs: bool
        Disable logging for all worker processes
    CUDA_VISIBLE_DEVICES: str or list
        A string like ``"0,1,2,3"`` or a list like ``[0, 1, 2, 3]`` restricting
        work to those GPUs
    protocol: str
        Protocol to use for communication, e.g., "tcp" or "ucx"
    enable_tcp_over_ucx: bool
        Set environment variables to enable TCP over UCX, even if InfiniBand
        and NVLink are not supported or disabled.
    enable_infiniband: bool
        Set environment variables to enable UCX InfiniBand support, requires
        protocol='ucx' and implies enable_tcp_over_ucx=True.
    enable_nvlink: bool
        Set environment variables to enable UCX NVLink support, requires
        protocol='ucx' and implies enable_tcp_over_ucx=True.

    Raises
    ------
    TypeError
        If enable_tcp_over_ucx, enable_infiniband, or enable_nvlink is True
        and protocol is not 'ucx'

    Examples
    --------
    >>> from dask_cuda import DGX
    >>> from dask.distributed import Client
    >>> cluster = DGX()
    >>> client = Client(cluster)
    """
    if (enable_tcp_over_ucx or enable_infiniband
            or enable_nvlink) and protocol != "ucx":
        raise TypeError(
            "Enabling InfiniBand or NVLink requires protocol='ucx'")

    ucx_net_devices = ""
    if enable_infiniband:
        ucx_net_devices = lambda i: "mlx5_%d:1" % (i // 2)

    spec = worker_spec(
        interface=interface,
        dashboard_address=dashboard_address,
        threads_per_worker=threads_per_worker,
        silence_logs=silence_logs,
        CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        ucx_net_devices=ucx_net_devices,
        enable_nvlink=enable_nvlink,
        protocol=protocol,
        **kwargs,
    )

    scheduler = {
        "cls": Scheduler,
        "options": {
            "interface":
            interface,
            "protocol":
            protocol,
            "dashboard_address":
            dashboard_address,
            **get_preload_options(
                protocol=protocol,
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_infiniband=enable_infiniband,
                enable_nvlink=enable_nvlink,
            ),
        },
    }

    return SpecCluster(workers=spec,
                       scheduler=scheduler,
                       silence_logs=silence_logs,
                       **kwargs)
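Given the signature above, a UCX-enabled cluster can be built as follows. This is a sketch: it assumes UCX and UCX-Py are installed, and "enp1s0f0" is a placeholder for the machine's Ethernet interface.

from dask.distributed import Client

# Sketch: DGX cluster over UCX with NVLink and InfiniBand enabled.
cluster = DGX(
    protocol="ucx",
    interface="enp1s0f0",  # placeholder; use the actual Ethernet interface
    enable_tcp_over_ucx=True,
    enable_infiniband=True,
    enable_nvlink=True,
)
client = Client(cluster)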
Example 26
def test_spec_close_clusters(loop):
    workers = {0: {"cls": Worker}}
    cluster = SpecCluster(workers=workers, scheduler=scheduler, loop=loop)
    assert cluster in SpecCluster._instances
    close_clusters()
    assert cluster.status == Status.closed
Example 27
def DGX(interface="ib",
        dashboard_address=":8787",
        threads_per_worker=1,
        silence_logs=True,
        CUDA_VISIBLE_DEVICES=None,
        **kwargs):
    """ A Local Cluster for a DGX 1 machine

    NVIDIA's DGX-1 machine has a complex architecture mapping CPUs, GPUs, and
    network hardware.  This function creates a local cluster that tries to
    respect this hardware as much as possible.

    It creates one Dask worker process per GPU, and assigns each worker process
    the correct CPU cores and Network interface cards to maximize performance.

    That being said, things aren't perfect.  Today a DGX has very high
    performance between certain sets of GPUs and not others.  A Dask DGX
    cluster that uses only certain tightly coupled parts of the computer will
    have significantly higher bandwidth than a deployment on the entire thing.

    Parameters
    ----------
    interface: str
        The interface prefix for the InfiniBand networking cards.  This is
        often "ib" or "bond".  We will add the numeric suffix 0,1,2,3 as
        appropriate.  Defaults to "ib".
    dashboard_address: str
        The address for the scheduler dashboard.  Defaults to ":8787".
    CUDA_VISIBLE_DEVICES: str or list
        A string like ``"0,1,2,3"`` or a list like ``[0, 1, 2, 3]`` restricting
        work to those GPUs

    Examples
    --------
    >>> from dask_cuda import DGX
    >>> from dask.distributed import Client
    >>> cluster = DGX(interface='ib')
    >>> client = Client(cluster)
    """
    if CUDA_VISIBLE_DEVICES is None:
        CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES",
                                              "0,1,2,3,4,5,6,7")
    if isinstance(CUDA_VISIBLE_DEVICES, str):
        CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
    CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
    memory_limit = TOTAL_MEMORY / 8

    spec = {
        i: {
            "cls": Nanny,
            "options": {
                "env": {
                    "CUDA_VISIBLE_DEVICES":
                    cuda_visible_devices(ii, CUDA_VISIBLE_DEVICES),
                    # 'UCX_NET_DEVICES': 'mlx5_%d:1' % (i // 2)
                    "UCX_TLS":
                    "rc,cuda_copy,cuda_ipc",
                },
                "interface": interface + str(i // 2),
                "protocol": "ucx",
                "nthreads": threads_per_worker,
                "data": dict,
                "preload": ["dask_cuda.initialize_context"],
                "dashboard_address": ":0",
                "plugins": [CPUAffinity(affinity[i])],
                "silence_logs": silence_logs,
                "memory_limit": memory_limit,
            },
        }
        for ii, i in enumerate(CUDA_VISIBLE_DEVICES)
    }

    scheduler = {
        "cls": Scheduler,
        "options": {
            "interface": interface + str(CUDA_VISIBLE_DEVICES[0] // 2),
            "protocol": "ucx",
            "dashboard_address": dashboard_address,
        },
    }

    return SpecCluster(workers=spec,
                       scheduler=scheduler,
                       silence_logs=silence_logs,
                       **kwargs)
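Usage of this older variant is similar; a sketch assuming InfiniBand interfaces named ib0..ib3 and a restriction to the first four GPUs:

from dask.distributed import Client

# Sketch: workers on GPUs 0-3, paired two-per-InfiniBand-interface (ib0, ib1).
cluster = DGX(interface="ib", CUDA_VISIBLE_DEVICES="0,1,2,3")
client = Client(cluster)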
Example 28
def test_loop_started():
    cluster = SpecCluster(worker_spec)
Example 29
def test_loop_started():
    with SpecCluster(worker_spec, scheduler=scheduler):
        pass