Example #1
def test_shutdown_unexpected_closed_peer(caplog):
    """
    Test clean server shutdown after unexpected peer close

    This will cause some UCX warnings to be issued, but that is expected.
    The main goal is to assert that the processes exit without errors
    despite a somewhat messy initial state.
    """
    endpoint_error_handling = ucp.get_ucx_version() >= (1, 10, 0)
    if endpoint_error_handling is False and any([
            t.startswith(i) for i in ("rc", "dc", "ud")
            for t in ucp.get_active_transports()
    ]):
        pytest.skip("Endpoint error handling is required when rc, dc or ud"
                    "transport is enabled")

    client_queue = mp.Queue()
    server_queue = mp.Queue()
    p1 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_server,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p1.start()
    p2 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_client,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p2.start()
    p2.join()
    server_queue.put("client is down")
    p1.join()

    assert not p1.exitcode
    assert not p2.exitcode
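The `_test_shutdown_unexpected_closed_peer_server`/`_client` helpers are defined elsewhere in the test module. A minimal sketch of what they might look like (signatures taken from the test above, bodies assumed):

import asyncio

import ucp


def _test_shutdown_unexpected_closed_peer_server(
    client_queue, server_queue, endpoint_error_handling
):
    async def run():
        async def server_node(ep):
            await ep.recv(bytearray(10))

        listener = ucp.create_listener(
            server_node, endpoint_error_handling=endpoint_error_handling
        )
        client_queue.put(listener.port)
        # Block until the parent signals that the client has exited.
        while server_queue.empty():
            await asyncio.sleep(0.1)

    asyncio.get_event_loop().run_until_complete(run())


def _test_shutdown_unexpected_closed_peer_client(
    client_queue, server_queue, endpoint_error_handling
):
    async def run():
        port = client_queue.get()
        ep = await ucp.create_endpoint(
            ucp.get_address(), port, endpoint_error_handling=endpoint_error_handling
        )
        await ep.send(bytearray(10))
        # Return without closing the endpoint: the "unexpected" close.

    asyncio.get_event_loop().run_until_complete(run())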
Example #2
def test_get_ucx_version():
    ucp.reset()
    version = ucp.get_ucx_version()
    assert isinstance(version, tuple)
    assert len(version) == 3
    # Check UCX isn't initialized
    assert ucp.core._ctx is None
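Later examples rely on exactly this property to compute module-level version flags before UCX is initialized (compare the `UCX_110` constant in Example #12). A minimal sketch of that pattern:

import ucp

# Safe at import time: reading the UCX version does not create a UCX context.
UCX_110 = ucp.get_ucx_version() >= (1, 10, 0)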
Example #3
async def test_close_callback(server_close_callback):
    endpoint_error_handling = ucp.get_ucx_version() >= (1, 10, 0)
    closed = [False]

    def _close_callback():
        closed[0] = True

    async def server_node(ep):
        if server_close_callback is True:
            ep.set_close_callback(_close_callback)
        msg = bytearray(10)
        await ep.recv(msg)
        if server_close_callback is False:
            await ep.close()

    async def client_node(port):
        ep = await ucp.create_endpoint(
            ucp.get_address(), port, endpoint_error_handling=endpoint_error_handling
        )
        if server_close_callback is False:
            ep.set_close_callback(_close_callback)
        await ep.send(bytearray(b"0" * 10))
        if server_close_callback is True:
            await ep.close()

    listener = ucp.create_listener(
        server_node, endpoint_error_handling=endpoint_error_handling
    )
    await client_node(listener.port)
    assert closed[0] is True
Example #4
def test_from_worker_address_error(error_type):
    q1 = mp.Queue()
    q2 = mp.Queue()

    server = mp.Process(
        target=_test_from_worker_address_error_server,
        args=(q1, q2, error_type),
    )
    server.start()

    client = mp.Process(
        target=_test_from_worker_address_error_client,
        args=(q1, q2, error_type),
    )
    client.start()

    server.join()
    client.join()

    assert not server.exitcode

    if ucp.get_ucx_version() < (1, 12, 0) and client.exitcode == 1:
        if all(t in error_type for t in ["timeout", "send"]):
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7527 with rc/ud."
            )
        elif all(t in error_type for t in ["timeout", "recv"]):
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7531 with rc/ud."
            )
    assert not client.exitcode
Example #5
def _scrub_ucx_config():
    """Function to scrub dask config options for valid UCX config options"""

    # configuration of UCX can happen in two ways:
    # 1) high level on/off flags which correspond to UCX configuration
    # 2) explicitly defined UCX configuration flags

    # import does not initialize ucp -- this will occur outside this function
    from ucp import get_config, get_ucx_version

    ucx_110 = get_ucx_version() >= (1, 10, 0)

    options = {}

    # If any of the high-level flags are set (i.e. not None), we assume basic
    # TLS settings should be configured for UCX; otherwise we leave UCX to its
    # default configuration.
    if any([
            dask.config.get("ucx.tcp"),
            dask.config.get("ucx.nvlink"),
            dask.config.get("ucx.infiniband"),
    ]):
        if dask.config.get("ucx.rdmacm"):
            tls = "tcp" if ucx_110 else "tcp,rdmacm"
            tls_priority = "rdmacm"
        else:
            tls = "tcp" if ucx_110 else "tcp,sockcm"
            tls_priority = "tcp" if ucx_110 else "sockcm"

        # CUDA COPY can optionally be used with ucx -- we rely on the user
        # to define when messages will include CUDA objects.  Note:
        # defining only the Infiniband flag will not enable cuda_copy
        if any(
            [dask.config.get("ucx.nvlink"),
             dask.config.get("ucx.cuda_copy")]):
            tls = tls + ",cuda_copy"

        if dask.config.get("ucx.infiniband"):
            tls = "rc," + tls
        if dask.config.get("ucx.nvlink"):
            tls = tls + ",cuda_ipc"

        options = {"TLS": tls, "SOCKADDR_TLS_PRIORITY": tls_priority}

        net_devices = dask.config.get("ucx.net-devices")
        if net_devices is not None and net_devices != "":
            options["NET_DEVICES"] = net_devices

    # Any UCX options explicitly defined in the config overwrite the
    # high-level dask.ucx flags above.
    valid_ucx_keys = list(get_config().keys())
    for k, v in dask.config.get("ucx").items():
        if k in valid_ucx_keys:
            options[k] = v
        else:
            logger.debug(
                "Key: %s with value: %s is not a valid UCX configuration option",
                k, v)

    return options
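As an illustration, the following hypothetical Dask settings (key names taken from the lookups above) exercise the high-level branch; a sketch, not part of the original module:

import dask

dask.config.set({
    "ucx.tcp": True,
    "ucx.nvlink": True,
    "ucx.infiniband": False,
    "ucx.rdmacm": False,
    "ucx.cuda_copy": True,
    "ucx.net-devices": None,
})
# On UCX >= 1.10, _scrub_ucx_config() would then return:
# {"TLS": "tcp,cuda_copy,cuda_ipc", "SOCKADDR_TLS_PRIORITY": "tcp"}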
Example #6
def test_send_recv_cu(endpoint_error_handling):
    if endpoint_error_handling is True and ucp.get_ucx_version() < (1, 11, 0):
        pytest.skip(
            "Endpoint error handling support for all transports is only available "
            "in UCX >= 1.11.0")

    base_env = os.environ
    env_client = base_env.copy()
    # grab first two devices
    cvd = get_cuda_devices()[:2]
    # reverse the device order (not the string, which would corrupt
    # multi-digit indices) so the client defaults to the other GPU
    env_client["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, reversed(cvd)))

    port = random.randint(13000, 15500)
    # Serialize the function and send it to both client and server.
    # The server calls it, serializes the result, and sends the serialized
    # values to the client; the client deserializes them and compares
    # against the return value it computes locally.

    func = cloudpickle.dumps(cupy_obj)
    ctx = multiprocessing.get_context("spawn")
    server_process = ctx.Process(name="server",
                                 target=server,
                                 args=[port, func, endpoint_error_handling])
    client_process = ctx.Process(name="client",
                                 target=client,
                                 args=[port, func, endpoint_error_handling])

    server_process.start()
    # cudf will ping the driver for validity of device
    # this will influence device on which a cuda context is created.
    # work around is to update env with new CVD before spawning
    os.environ.update(env_client)
    client_process.start()

    server_process.join()
    client_process.join()

    print("server_process.exitcode:", server_process.exitcode)
    assert server_process.exitcode == 0
    assert client_process.exitcode == -9
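`server`, `client`, and `cupy_obj` are defined elsewhere in this debug-test module. A plausible stand-in for the payload factory (name from the code above, body assumed):

import cupy


def cupy_obj():
    # Any CUDA-resident payload works; the test only round-trips it between
    # server and client and compares the results.
    return cupy.arange(10000, dtype="u1")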
Example #7
def get_ucx_net_devices(cuda_device_index,
                        ucx_net_devices,
                        get_openfabrics=True,
                        get_network=False):
    if ucp.get_ucx_version() >= (1, 11, 0) and ucx_net_devices == "auto":
        return None

    if cuda_device_index is None and (callable(ucx_net_devices)
                                      or ucx_net_devices == "auto"):
        raise ValueError(
            "A CUDA device index must be specified if the "
            "ucx_net_devices variable is either callable or 'auto'")
    elif cuda_device_index is not None:
        dev = int(cuda_device_index)

    net_dev = None
    if callable(ucx_net_devices):
        net_dev = ucx_net_devices(dev)
    elif isinstance(ucx_net_devices, str):
        if ucx_net_devices == "auto":
            # Use TopologicalDistance from ucp to explicitly set the UCX
            # net device to the closest network device.
            from ucp._libs.topological_distance import TopologicalDistance

            net_dev = ""
            td = TopologicalDistance()
            if get_openfabrics:
                ibs = td.get_cuda_distances_from_device_index(
                    dev, "openfabrics")
                if len(ibs) > 0:
                    net_dev += ibs[0]["name"] + ":1"
            if get_network:
                ifnames = td.get_cuda_distances_from_device_index(
                    dev, "network")
                if len(ifnames) > 0:
                    if len(net_dev) > 0:
                        net_dev += ","
                    net_dev += ifnames[0]["name"]
        else:
            net_dev = ucx_net_devices
    return net_dev
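Hypothetical usage, assuming a node where GPU 0's closest InfiniBand HCA is named `mlx5_0` (device names assumed):

# On UCX < 1.11 this might return "mlx5_0:1"; on UCX >= 1.11 "auto" returns
# None and UCX selects the devices itself.
get_ucx_net_devices(cuda_device_index=0, ucx_net_devices="auto")

# With a callable, the GPU-to-device mapping is user-defined:
get_ucx_net_devices(0, lambda dev: "mlx5_%d:1" % (dev // 2))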
Example #8
def test_from_worker_address_error(error_type):
    os.environ["UCX_WARN_UNUSED_ENV_VARS"] = "n"
    # Set low timeouts to ensure tests quickly raise as expected
    os.environ["UCX_KEEPALIVE_INTERVAL"] = "100ms"
    os.environ["UCX_UD_TIMEOUT"] = "100ms"

    q1 = mp.Queue()
    q2 = mp.Queue()

    server = mp.Process(
        target=_test_from_worker_address_error_server,
        args=(q1, q2, error_type),
    )
    server.start()

    client = mp.Process(
        target=_test_from_worker_address_error_client,
        args=(q1, q2, error_type),
    )
    client.start()

    server.join()
    client.join()

    assert not server.exitcode

    if ucp.get_ucx_version() < (1, 12, 0) and client.exitcode == 1:
        if error_type == "timeout_send":
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7527 with rc/ud."
            )
        elif error_type == "timeout_recv":
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7531 with rc/ud."
            )
    assert not client.exitcode
Example #9
                    task = asyncio.wait_for(ep.recv(msg, tag=0,
                                                    force_tag=True),
                                            timeout=3.0)

                    q2.put("ready")

                    remote_disconnected = q1.get()
                    assert remote_disconnected == "disconnected"

                    await task

    asyncio.get_event_loop().run_until_complete(run())


@pytest.mark.skipif(
    ucp.get_ucx_version() < (1, 11, 0),
    reason=
    "Endpoint error handling is unreliable in UCX releases prior to 1.11.0",
)
@pytest.mark.parametrize("error_type",
                         ["unreachable", "timeout_send", "timeout_recv"])
def test_from_worker_address_error(error_type):
    os.environ["UCX_WARN_UNUSED_ENV_VARS"] = "n"
    # Set low timeouts to ensure tests quickly raise as expected
    os.environ["UCX_KEEPALIVE_INTERVAL"] = "100ms"
    os.environ["UCX_UD_TIMEOUT"] = "100ms"

    q1 = mp.Queue()
    q2 = mp.Queue()

    server = mp.Process(
Example #10
try:
    from nvtx import annotate as nvtx_annotate
except ImportError:
    # If the nvtx module is not installed, fall back to a no-op context manager.
    from contextlib import contextmanager

    @contextmanager
    def nvtx_annotate(message=None, color="blue", domain=None):
        yield
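
# Hypothetical usage: with nvtx installed this opens an NVTX range visible in
# profilers such as Nsight Systems; without it, the body simply runs:
#
#     with nvtx_annotate(message="compress", color="green", domain="dask"):
#         ...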


try:
    import ucp

    _ucx_110 = ucp.get_ucx_version() >= (1, 10, 0)
    _ucx_111 = ucp.get_ucx_version() >= (1, 11, 0)
except ImportError:
    _ucx_110 = False
    _ucx_111 = False


class CPUAffinity:
    def __init__(self, cores):
        self.cores = cores

    def setup(self, worker=None):
        os.sched_setaffinity(0, self.cores)
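
# Hypothetical usage (Linux-only, as os.sched_setaffinity is Linux-specific):
#
#     CPUAffinity(cores=[0, 1, 2, 3]).setup()
#
# Dask-CUDA registers this as a worker plugin so each worker process pins
# itself to the CPU cores closest to its assigned GPU.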


class RMMSetup:
Example #11
    assert config["SEG_SIZE"] == options["SEG_SIZE"]


@patch.dict(os.environ, {"UCX_SEG_SIZE": "4M"})
def test_init_options_and_env():
    ucp.reset()
    options = {"SEG_SIZE": "3M"}  # Should be ignored
    ucp.init(options, env_takes_precedence=True)
    config = ucp.get_config()
    assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"]
    # Provided options dict was not modified.
    assert options == {"SEG_SIZE": "3M"}


@pytest.mark.skipif(
    ucp.get_ucx_version() >= (1, 12, 0),
    reason="Beginning with UCX >= 1.12, it's only possible to validate "
    "UCP options but not options from other modules such as UCT. "
    "See https://github.com/openucx/ucx/issues/7519.",
)
def test_init_unknown_option():
    ucp.reset()
    options = {"UNKNOWN_OPTION": "3M"}
    with pytest.raises(ucp.exceptions.UCXConfigError):
        ucp.init(options)


def test_init_invalid_option():
    ucp.reset()
    options = {"SEG_SIZE": "invalid-size"}
    with pytest.raises(ucp.exceptions.UCXConfigError):
        ucp.init(options)
Example #12
import cloudpickle
import numpy as np
import pytest
from debug_utils import get_cuda_devices, set_rmm
from utils import recv, send

from distributed.comm.utils import to_frames
from distributed.protocol import to_serialize

import ucp
from ucp._libs.topological_distance import TopologicalDistance

cupy = pytest.importorskip("cupy")
rmm = pytest.importorskip("rmm")

UCX_110 = ucp.get_ucx_version() >= (1, 10, 0)
TRANSFER_ITERATIONS = 5
EP_ITERATIONS = 3


def get_environment_variables(cuda_device_index):
    env = os.environ.copy()

    env["CUDA_VISIBLE_DEVICES"] = str(cuda_device_index)

    if not UCX_110:
        tls = env.get("UCX_TLS")
        if tls is not None and "rc" in tls:
            td = TopologicalDistance()
            closest_openfabrics = td.get_cuda_distances_from_device_index(
                cuda_device_index, "openfabrics")
Example #13
def init_once():
    global ucp, host_array, device_array
    global ucx_create_endpoint, ucx_create_listener
    global pre_existing_cuda_context, cuda_context_created

    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if dask.config.get("distributed.comm.ucx.create-cuda-context") is True or (
        "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]
    ):
        try:
            import numba.cuda
        except ImportError:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management"
            )

        cuda_visible_device = int(
            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
        )
        pre_existing_cuda_context = has_cuda_context()
        if pre_existing_cuda_context is not False:
            warnings.warn(
                f"A CUDA context for device {pre_existing_cuda_context} already exists on process "
                f"ID {os.getpid()}. This is often the result of a CUDA-enabled library calling a "
                "CUDA runtime function before Dask-CUDA can spawn worker processes. Please make "
                "sure any such function calls don't happen at import time or in the global scope "
                "of a program."
            )

        numba.cuda.current_context()

        cuda_context_created = has_cuda_context()
        if (
            cuda_context_created is not False
            and cuda_context_created != cuda_visible_device
        ):
            warnings.warn(
                f"Worker with process ID {os.getpid()} should have a CUDA context assigned to "
                f"device {cuda_visible_device}, but instead the CUDA context is on device "
                "{cuda_context_created}. This is often the result of a CUDA-enabled library "
                "calling a CUDA runtime function before Dask-CUDA can spawn worker processes. "
                "Please make sure any such function calls don't happen at import time or in "
                "the global scope of a program."
            )

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        host_array = lambda n: numpy.empty((n,), dtype="u1")
    except ImportError:
        host_array = lambda n: bytearray(n)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        device_array = lambda n: rmm.DeviceBuffer(size=n)
    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n,), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array
        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("distributed.rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        reuse_endpoints = dask.config.get("distributed.comm.ucx.reuse-endpoints")
        if (
            reuse_endpoints is None and ucp.get_ucx_version() >= (1, 11, 0)
        ) or reuse_endpoints is False:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
        else:
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener
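
# Net effect of the selection above (summary comment, not original code):
#   reuse-endpoints False, or unset with UCX >= 1.11 -> plain ucp endpoints
#   reuse-endpoints True, or unset with UCX < 1.11   -> EndpointReuse wrappers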