def test_shutdown_unexpected_closed_peer(caplog):
    """
    Test clean server shutdown after unexpected peer close

    This will cause some UCX warnings to be issued, but this is expected.
    The main goal is to assert that the processes exit without errors
    despite a somewhat messy initial state.
    """
    endpoint_error_handling = ucp.get_ucx_version() >= (1, 10, 0)

    # Without endpoint error handling the test cannot run on rc/dc/ud
    # transports, so skip when any of them is active.
    if endpoint_error_handling is False and any(
        t.startswith(("rc", "dc", "ud")) for t in ucp.get_active_transports()
    ):
        pytest.skip(
            "Endpoint error handling is required when rc, dc or ud "
            "transport is enabled"
        )

    client_queue = mp.Queue()
    server_queue = mp.Queue()
    p1 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_server,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p1.start()
    p2 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_client,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p2.start()

    # Wait for the client to go away first, then tell the server about it so
    # that it can proceed with its own shutdown.
    p2.join()
    server_queue.put("client is down")
    p1.join()

    assert not p1.exitcode
    assert not p2.exitcode
def test_get_ucx_version():
    """Check that the UCX version is a 3-tuple and querying it does not
    initialize UCX."""
    ucp.reset()

    ucx_version = ucp.get_ucx_version()
    assert isinstance(ucx_version, tuple)
    assert len(ucx_version) == 3

    # Querying the version must leave the UCX context uninitialized
    assert ucp.core._ctx is None
async def test_close_callback(server_close_callback):
    """Check that an endpoint close callback fires.

    When ``server_close_callback`` is True the callback is registered on the
    server endpoint and the client closes; when it is False the callback is
    registered on the client endpoint and the server closes.
    """
    endpoint_error_handling = ucp.get_ucx_version() >= (1, 10, 0)
    callback_fired = False

    def _on_close():
        nonlocal callback_fired
        callback_fired = True

    async def server_node(ep):
        if server_close_callback is True:
            ep.set_close_callback(_on_close)
        buf = bytearray(10)
        await ep.recv(buf)
        if server_close_callback is False:
            await ep.close()

    async def client_node(port):
        ep = await ucp.create_endpoint(
            ucp.get_address(),
            port,
            endpoint_error_handling=endpoint_error_handling,
        )
        if server_close_callback is False:
            ep.set_close_callback(_on_close)
        payload = bytearray(b"0" * 10)
        await ep.send(payload)
        if server_close_callback is True:
            await ep.close()

    listener = ucp.create_listener(
        server_node, endpoint_error_handling=endpoint_error_handling
    )
    await client_node(listener.port)

    assert callback_fired is True
def test_from_worker_address_error(error_type):
    """Run the worker-address error server/client pair in subprocesses and
    require both to exit cleanly, xfail-ing known timeout failures on
    UCX < 1.12."""
    q1 = mp.Queue()
    q2 = mp.Queue()
    process_args = (q1, q2, error_type)

    server = mp.Process(
        target=_test_from_worker_address_error_server, args=process_args
    )
    server.start()
    client = mp.Process(
        target=_test_from_worker_address_error_client, args=process_args
    )
    client.start()

    server.join()
    client.join()

    assert not server.exitcode

    # Before UCX 1.12 the client may exit with code 1 for the timeout cases;
    # those are expected failures rather than regressions.
    if ucp.get_ucx_version() < (1, 12, 0) and client.exitcode == 1:
        if "timeout" in error_type and "send" in error_type:
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7527 with rc/ud."
            )
        elif "timeout" in error_type and "recv" in error_type:
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7531 with rc/ud."
            )

    assert not client.exitcode
def _scrub_ucx_config():
    """Function to scrub dask config options for valid UCX config options

    Returns
    -------
    dict
        UCX option names mapped to their values. High-level dask ``ucx.*``
        flags are translated first; any key under the ``ucx`` config section
        that is itself a valid UCX option then overrides them.
    """
    # configuration of UCX can happen in two ways:
    # 1) high level on/off flags which correspond to UCX configuration
    # 2) explicitly defined UCX configuration flags

    # import does not initialize ucp -- this will occur outside this function
    from ucp import get_config, get_ucx_version

    ucx_110 = get_ucx_version() >= (1, 10, 0)

    options = {}

    # Read each high level flag once; they are reused below.
    use_tcp = dask.config.get("ucx.tcp")
    use_nvlink = dask.config.get("ucx.nvlink")
    use_infiniband = dask.config.get("ucx.infiniband")

    # if any of the high level flags are set, as long as they are not Null/None,
    # we assume we should configure basic TLS settings for UCX, otherwise we
    # leave UCX to its default configuration
    if use_tcp or use_nvlink or use_infiniband:
        if dask.config.get("ucx.rdmacm"):
            tls = "tcp" if ucx_110 else "tcp,rdmacm"
            tls_priority = "rdmacm"
        else:
            tls = "tcp" if ucx_110 else "tcp,sockcm"
            tls_priority = "tcp" if ucx_110 else "sockcm"

        # CUDA COPY can optionally be used with ucx -- we rely on the user
        # to define when messages will include CUDA objects. Note:
        # defining only the Infiniband flag will not enable cuda_copy
        use_cuda_copy = dask.config.get("ucx.cuda_copy")
        if use_nvlink or use_cuda_copy:
            tls = tls + ",cuda_copy"

        if use_infiniband:
            tls = "rc," + tls
        if use_nvlink:
            tls = tls + ",cuda_ipc"

        options = {"TLS": tls, "SOCKADDR_TLS_PRIORITY": tls_priority}

        net_devices = dask.config.get("ucx.net-devices")
        if net_devices is not None and net_devices != "":
            options["NET_DEVICES"] = net_devices

    # ANY UCX options defined in config will overwrite high level dask.ucx flags
    valid_ucx_keys = set(get_config().keys())
    for k, v in dask.config.get("ucx").items():
        if k in valid_ucx_keys:
            options[k] = v
        else:
            # Pass arguments lazily so the message is only formatted when
            # debug logging is actually enabled.
            logger.debug(
                "Key: %s with value: %s not a valid UCX configuration option",
                k,
                v,
            )

    return options
def test_send_recv_cu(endpoint_error_handling):
    """Send a CuPy object from a server process to a client process, with the
    client seeing the first two CUDA devices in reverse order."""
    if endpoint_error_handling is True and ucp.get_ucx_version() < (1, 11, 0):
        pytest.skip(
            "Endpoint error handling support for all transports is only available "
            "in UCX >= 1.11.0"
        )

    base_env = os.environ
    env_client = base_env.copy()

    # grab first two devices
    cvd = get_cuda_devices()[:2]
    # reverse CVD for other worker. Reverse the device list itself rather than
    # the joined string, so identifiers longer than one character stay intact.
    env_client["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, cvd[::-1]))

    port = random.randint(13000, 15500)
    # serialize function and send to the client and server
    # server will use the return value of the contents,
    # serialize the values, then send serialized values to client.
    # client will compare return values of the deserialized
    # data sent from the server

    func = cloudpickle.dumps(cupy_obj)
    ctx = multiprocessing.get_context("spawn")
    server_process = ctx.Process(
        name="server", target=server, args=[port, func, endpoint_error_handling]
    )
    client_process = ctx.Process(
        name="client", target=client, args=[port, func, endpoint_error_handling]
    )

    server_process.start()
    # cudf will ping the driver for validity of device
    # this will influence device on which a cuda context is created.
    # work around is to update env with new CVD before spawning
    os.environ.update(env_client)
    client_process.start()

    server_process.join()
    client_process.join()

    print("server_process.exitcode:", server_process.exitcode)
    assert server_process.exitcode == 0
    # NOTE(review): the client is expected to exit with -9 (killed by
    # SIGKILL); presumably the client terminates itself -- confirm.
    assert client_process.exitcode == -9
def get_ucx_net_devices(
    cuda_device_index, ucx_net_devices, get_openfabrics=True, get_network=False
):
    """Resolve the UCX net-devices value to use for a CUDA device.

    Parameters
    ----------
    cuda_device_index
        CUDA device index (converted with ``int``), or None.
    ucx_net_devices
        A callable taking the device index, the string ``"auto"``, any other
        string (returned unchanged), or another value (yields None).
    get_openfabrics
        In ``"auto"`` mode, include the closest openfabrics device with a
        ``":1"`` suffix.
    get_network
        In ``"auto"`` mode, include the closest network interface.

    Returns
    -------
    The resolved device string, or None. With UCX >= 1.11 and ``"auto"`` the
    result is always None.

    Raises
    ------
    ValueError
        If ``cuda_device_index`` is None while ``ucx_net_devices`` is callable
        or ``"auto"``.
    """
    if ucp.get_ucx_version() >= (1, 11, 0) and ucx_net_devices == "auto":
        return None

    if cuda_device_index is None:
        if callable(ucx_net_devices) or ucx_net_devices == "auto":
            raise ValueError(
                "A CUDA device index must be specified if the "
                "ucx_net_devices variable is either callable or 'auto'"
            )
    else:
        dev = int(cuda_device_index)

    if callable(ucx_net_devices):
        return ucx_net_devices(dev)
    if not isinstance(ucx_net_devices, str):
        return None
    if ucx_net_devices != "auto":
        return ucx_net_devices

    # If TopologicalDistance from ucp is available, we set the UCX
    # net device to the closest network device explicitly.
    from ucp._libs.topological_distance import TopologicalDistance

    topology = TopologicalDistance()
    selected = []
    if get_openfabrics:
        ibs = topology.get_cuda_distances_from_device_index(dev, "openfabrics")
        if len(ibs) > 0:
            selected.append(ibs[0]["name"] + ":1")
    if get_network:
        ifnames = topology.get_cuda_distances_from_device_index(dev, "network")
        if len(ifnames) > 0:
            selected.append(ifnames[0]["name"])
    return ",".join(selected)
def test_from_worker_address_error(error_type):
    """Run the worker-address error server/client pair in subprocesses with
    short UCX timeouts and require both to exit cleanly, xfail-ing known
    timeout failures on UCX < 1.12."""
    os.environ.update(
        {
            "UCX_WARN_UNUSED_ENV_VARS": "n",
            # Set low timeouts to ensure tests quickly raise as expected
            "UCX_KEEPALIVE_INTERVAL": "100ms",
            "UCX_UD_TIMEOUT": "100ms",
        }
    )

    q1 = mp.Queue()
    q2 = mp.Queue()
    process_args = (q1, q2, error_type)

    server = mp.Process(
        target=_test_from_worker_address_error_server, args=process_args
    )
    server.start()
    client = mp.Process(
        target=_test_from_worker_address_error_client, args=process_args
    )
    client.start()

    server.join()
    client.join()

    assert not server.exitcode

    # Before UCX 1.12 the client may exit with code 1 for the timeout cases;
    # those are expected failures rather than regressions.
    client_failed_on_old_ucx = (
        ucp.get_ucx_version() < (1, 12, 0) and client.exitcode == 1
    )
    if client_failed_on_old_ucx:
        if error_type == "timeout_send":
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7527 with rc/ud."
            )
        elif error_type == "timeout_recv":
            pytest.xfail(
                "Requires https://github.com/openucx/ucx/pull/7531 with rc/ud."
            )

    assert not client.exitcode
task = asyncio.wait_for(ep.recv(msg, tag=0, force_tag=True), timeout=3.0) q2.put("ready") remote_disconnected = q1.get() assert remote_disconnected == "disconnected" await task asyncio.get_event_loop().run_until_complete(run()) @pytest.mark.skipif( ucp.get_ucx_version() < (1, 11, 0), reason= "Endpoint error handling is unreliable in UCX releases prior to 1.11.0", ) @pytest.mark.parametrize("error_type", ["unreachable", "timeout_send", "timeout_recv"]) def test_from_worker_address_error(error_type): os.environ["UCX_WARN_UNUSED_ENV_VARS"] = "n" # Set low timeouts to ensure tests quickly raise as expected os.environ["UCX_KEEPALIVE_INTERVAL"] = "100ms" os.environ["UCX_UD_TIMEOUT"] = "100ms" q1 = mp.Queue() q2 = mp.Queue() server = mp.Process(
try: from nvtx import annotate as nvtx_annotate except ImportError: # If nvtx module is not installed, `annotate` yields only. from contextlib import contextmanager @contextmanager def nvtx_annotate(message=None, color="blue", domain=None): yield try: import ucp _ucx_110 = ucp.get_ucx_version() >= (1, 10, 0) _ucx_111 = ucp.get_ucx_version() >= (1, 11, 0) except ImportError: _ucx_110 = False _ucx_111 = False class CPUAffinity: def __init__(self, cores): self.cores = cores def setup(self, worker=None): os.sched_setaffinity(0, self.cores) class RMMSetup:
assert config["SEG_SIZE"] == options["SEG_SIZE"] @patch.dict(os.environ, {"UCX_SEG_SIZE": "4M"}) def test_init_options_and_env(): ucp.reset() options = {"SEG_SIZE": "3M"} # Should be ignored ucp.init(options, env_takes_precedence=True) config = ucp.get_config() assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"] # Provided options dict was not modified. assert options == {"SEG_SIZE": "3M"} @pytest.mark.skipif( ucp.get_ucx_version() >= (1, 12, 0), reason="Beginning with UCX >= 1.12, it's only possible to validate " "UCP options but not options from other modules such as UCT. " "See https://github.com/openucx/ucx/issues/7519.", ) def test_init_unknown_option(): ucp.reset() options = {"UNKNOWN_OPTION": "3M"} with pytest.raises(ucp.exceptions.UCXConfigError): ucp.init(options) def test_init_invalid_option(): ucp.reset() options = {"SEG_SIZE": "invalid-size"} with pytest.raises(ucp.exceptions.UCXConfigError):
import cloudpickle import numpy as np import pytest from debug_utils import get_cuda_devices, set_rmm from utils import recv, send from distributed.comm.utils import to_frames from distributed.protocol import to_serialize import ucp from ucp._libs.topological_distance import TopologicalDistance cupy = pytest.importorskip("cupy") rmm = pytest.importorskip("rmm") UCX_110 = ucp.get_ucx_version() >= (1, 10, 0) TRANSFER_ITERATIONS = 5 EP_ITERATIONS = 3 def get_environment_variables(cuda_device_index): env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = str(cuda_device_index) if not UCX_110: tls = env.get("UCX_TLS") if tls is not None and "rc" in tls: td = TopologicalDistance() closest_openfabrics = td.get_cuda_distances_from_device_index( cuda_device_index, "openfabrics")
def init_once():
    """Initialize UCX and the module-level helpers, once per process.

    Returns immediately if the module-level ``ucp`` is already set. Otherwise
    it optionally creates a CUDA context (warning if one already exists or if
    it ends up on an unexpected device), initializes ``ucp`` with the scrubbed
    dask configuration, selects the host/device array allocators, optionally
    reinitializes the RMM pool, and picks the endpoint/listener factories.

    Raises
    ------
    ImportError
        If a CUDA context is required but Numba is not installed.
    """
    global ucp, host_array, device_array
    global ucx_create_endpoint, ucx_create_listener
    global pre_existing_cuda_context, cuda_context_created

    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if dask.config.get("distributed.comm.ucx.create-cuda-context") is True or (
        "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]
    ):
        try:
            import numba.cuda
        except ImportError:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management"
            )

        # The first entry of CUDA_VISIBLE_DEVICES is the device this process
        # is expected to use (defaults to device 0 when unset).
        cuda_visible_device = int(
            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
        )
        pre_existing_cuda_context = has_cuda_context()
        if pre_existing_cuda_context is not False:
            warnings.warn(
                f"A CUDA context for device {pre_existing_cuda_context} already exists on process "
                f"ID {os.getpid()}. This is often the result of a CUDA-enabled library calling a "
                "CUDA runtime function before Dask-CUDA can spawn worker processes. Please make "
                "sure any such function calls don't happen at import time or in the global scope "
                "of a program."
            )

        numba.cuda.current_context()

        cuda_context_created = has_cuda_context()
        if (
            cuda_context_created is not False
            and cuda_context_created != cuda_visible_device
        ):
            # Every fragment containing a placeholder must be an f-string,
            # otherwise the literal "{cuda_context_created}" is emitted.
            warnings.warn(
                f"Worker with process ID {os.getpid()} should have a CUDA context assigned to "
                f"device {cuda_visible_device}, but instead the CUDA context is on device "
                f"{cuda_context_created}. This is often the result of a CUDA-enabled library "
                "calling a CUDA runtime function before Dask-CUDA can spawn worker processes. "
                "Please make sure any such function calls don't happen at import time or in "
                "the global scope of a program."
            )

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        host_array = lambda n: numpy.empty((n,), dtype="u1")
    except ImportError:
        host_array = lambda n: bytearray(n)

    # Find the function, `cuda_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        device_array = lambda n: rmm.DeviceBuffer(size=n)
    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n,), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array

        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("distributed.rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        # NOTE(review): `rmm` is only bound if the import above succeeded;
        # with a pool size configured and RMM missing this line fails.
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        reuse_endpoints = dask.config.get("distributed.comm.ucx.reuse-endpoints")
        # Endpoint reuse is disabled when explicitly turned off, or when left
        # unset on UCX >= 1.11.
        if (
            reuse_endpoints is None and ucp.get_ucx_version() >= (1, 11, 0)
        ) or reuse_endpoints is False:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
        else:
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener