def test_resource_limit(monkeypatch):
    assert parse_memory_limit("250MiB", 1, total_cores=1) == 1024 * 1024 * 250

    new_limit = 1024 * 1024 * 200
    import distributed.worker

    monkeypatch.setattr(distributed.system, "MEMORY_LIMIT", new_limit)
    assert parse_memory_limit("250MiB", 1, total_cores=1) == new_limit
def test_resource_limit():
    assert parse_memory_limit("250MiB", 1, total_cores=1) == 1024 * 1024 * 250

    # get current limit
    resource = pytest.importorskip("resource")
    try:
        hard_limit = resource.getrlimit(resource.RLIMIT_RSS)[1]
    except OSError:
        pytest.skip("resource could not get the RSS limit")

    memory_limit = psutil.virtual_memory().total
    if hard_limit > memory_limit or hard_limit < 0:
        hard_limit = memory_limit

    # decrease memory limit by one byte
    new_limit = hard_limit - 1
    try:
        resource.setrlimit(resource.RLIMIT_RSS, (new_limit, new_limit))
        assert parse_memory_limit(hard_limit, 1, total_cores=1) == new_limit
    except OSError:
        pytest.skip("resource could not set the RSS limit")
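# A minimal, illustrative sketch (an assumption about distributed's semantics, not
# part of the tests above): memory_limit="auto" splits system memory in proportion
# to nthreads / total_cores, which is why the dask-cuda code below passes
# total_cores=n_workers so that each GPU worker gets an equal share of host memory.
def _example_auto_memory_share(n_workers=4):
    from distributed.worker import parse_memory_limit

    # One thread per worker, n_workers "cores": roughly total_memory / n_workers each.
    per_worker = parse_memory_limit("auto", 1, total_cores=n_workers)
    print(f"each of {n_workers} workers would get ~{per_worker} bytes")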
def __init__(
    self,
    n_workers=None,
    threads_per_worker=1,
    processes=True,
    memory_limit="auto",
    device_memory_limit=0.8,
    CUDA_VISIBLE_DEVICES=None,
    data=None,
    local_directory=None,
    protocol=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    enable_rdmacm=False,
    ucx_net_devices=None,
    rmm_pool_size=None,
    rmm_managed_memory=False,
    **kwargs,
):
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    if CUDA_VISIBLE_DEVICES is None:
        CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
    if isinstance(CUDA_VISIBLE_DEVICES, str):
        CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
    CUDA_VISIBLE_DEVICES = list(map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES))
    if n_workers is None:
        n_workers = len(CUDA_VISIBLE_DEVICES)
    self.host_memory_limit = parse_memory_limit(
        memory_limit, threads_per_worker, n_workers
    )
    self.device_memory_limit = parse_device_memory_limit(
        device_memory_limit, device_index=0
    )

    self.rmm_pool_size = rmm_pool_size
    self.rmm_managed_memory = rmm_managed_memory
    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool or managed memory requested but module 'rmm' "
                "is not available. For installation instructions, please "
                "see https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        if self.rmm_pool_size is not None:
            self.rmm_pool_size = parse_bytes(self.rmm_pool_size)
    else:
        if enable_nvlink:
            warnings.warn(
                "When using NVLink we recommend setting a "
                "`rmm_pool_size`. Please see: "
                "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                "#important-notes for more details"
            )

    if not processes:
        raise ValueError(
            "Processes are necessary in order to use multiple GPUs with Dask"
        )

    if data is None:
        data = (
            DeviceHostFile,
            {
                "device_memory_limit": self.device_memory_limit,
                "memory_limit": self.host_memory_limit,
                "local_directory": local_directory
                or dask.config.get("temporary-directory")
                or os.getcwd(),
            },
        )

    if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
        if protocol is None:
            protocol = "ucx"
        elif protocol != "ucx":
            raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")

    if ucx_net_devices == "auto":
        try:
            from ucp._libs.topological_distance import TopologicalDistance  # NOQA
        except ImportError:
            raise ValueError(
                "ucx_net_devices set to 'auto' but UCX-Py is not "
                "installed or it's compiled without hwloc support"
            )
    elif ucx_net_devices == "":
        raise ValueError("ucx_net_devices can not be an empty string")
    self.ucx_net_devices = ucx_net_devices
    self.set_ucx_net_devices = enable_infiniband
    self.host = kwargs.get("host", None)

    initialize(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
        enable_rdmacm=enable_rdmacm,
        net_devices=ucx_net_devices,
        cuda_device_index=0,
    )

    super().__init__(
        n_workers=0,
        threads_per_worker=threads_per_worker,
        memory_limit=self.host_memory_limit,
        processes=True,
        data=data,
        local_directory=local_directory,
        protocol=protocol,
        config={
            "ucx": get_ucx_config(
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_nvlink=enable_nvlink,
                enable_infiniband=enable_infiniband,
                enable_rdmacm=enable_rdmacm,
            )
        },
        **kwargs,
    )

    self.new_spec["options"]["preload"] = self.new_spec["options"].get(
        "preload", []
    ) + ["dask_cuda.initialize"]
    self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
        "preload_argv", []
    ) + ["--create-cuda-context"]

    self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
    self.scale(n_workers)
    self.sync(self._correct_state)
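# A minimal usage sketch (hedged: it assumes the __init__ above belongs to
# dask_cuda.LocalCUDACluster and that a CUDA-capable environment is present).
# The keyword arguments used here all appear in the signature above.
def _example_local_cuda_cluster():
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES="0,1",  # one worker per listed GPU
        memory_limit="auto",         # split host memory evenly across workers
        device_memory_limit=0.8,     # spill device->host at 80% of GPU memory
        rmm_pool_size="12GB",        # optional: pre-allocate an RMM pool per worker
    )
    return Client(cluster)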
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(
    self,
    scheduler=None,
    host=None,
    nthreads=0,
    name=None,
    memory_limit="auto",
    device_memory_limit="auto",
    rmm_pool_size=None,
    rmm_managed_memory=False,
    pid_file=None,
    resources=None,
    dashboard=True,
    dashboard_address=":0",
    local_directory=None,
    scheduler_file=None,
    interface=None,
    death_timeout=None,
    preload=[],
    dashboard_prefix=None,
    security=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    enable_rdmacm=False,
    net_devices=None,
    **kwargs,
):
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface and host:
        raise ValueError("Can not specify both interface and host")

    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        if rmm_pool_size is not None:
            rmm_pool_size = parse_bytes(rmm_pool_size)
    else:
        if enable_nvlink:
            warnings.warn(
                "When using NVLink we recommend setting a "
                "`rmm_pool_size`. Please see: "
                "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                "#important-notes for more details"
            )

    if enable_nvlink and rmm_managed_memory:
        raise ValueError(
            "RMM managed memory and NVLink are currently incompatible."
        )

    # Ensure this parent dask-cuda-worker process uses the same UCX
    # configuration as child worker processes created by it.
    initialize(
        create_cuda_context=False,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    self.nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=_get_interface(interface, host, i, net_devices),
            host=host,
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=security,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={
                CPUAffinity(get_cpu_affinity(i)),
                RMMSetup(rmm_pool_size, rmm_managed_memory),
            },
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": parse_device_memory_limit(
                        device_memory_limit, device_index=i
                    ),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]
def __init__(
    self,
    CUDA_VISIBLE_DEVICES=None,
    n_workers=None,
    threads_per_worker=1,
    memory_limit="auto",
    device_memory_limit=0.8,
    local_directory=None,
    protocol=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    enable_rdmacm=False,
    ucx_net_devices=None,
    rmm_pool_size=None,
    rmm_managed_memory=False,
    rmm_async=False,
    rmm_log_directory=None,
    jit_unspill=None,
    log_spilling=False,
    worker_class=None,
    **kwargs,
):
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    if threads_per_worker < 1:
        raise ValueError("threads_per_worker must be higher than 0.")

    if CUDA_VISIBLE_DEVICES is None:
        CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
    if isinstance(CUDA_VISIBLE_DEVICES, str):
        CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
    CUDA_VISIBLE_DEVICES = list(map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES))
    if n_workers is None:
        n_workers = len(CUDA_VISIBLE_DEVICES)
    if n_workers < 1:
        raise ValueError("Number of workers cannot be less than 1.")
    self.host_memory_limit = parse_memory_limit(
        memory_limit, threads_per_worker, n_workers
    )
    self.device_memory_limit = parse_device_memory_limit(
        device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
    )

    self.rmm_pool_size = rmm_pool_size
    self.rmm_managed_memory = rmm_managed_memory
    self.rmm_async = rmm_async
    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool or managed memory requested but module 'rmm' "
                "is not available. For installation instructions, please "
                "see https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        if rmm_async:
            raise ValueError(
                "RMM pool and managed memory are incompatible with asynchronous "
                "allocator"
            )
        if self.rmm_pool_size is not None:
            self.rmm_pool_size = parse_bytes(self.rmm_pool_size)
    else:
        if enable_nvlink:
            warnings.warn(
                "When using NVLink we recommend setting a "
                "`rmm_pool_size`. Please see: "
                "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                "#important-notes for more details"
            )
    self.rmm_log_directory = rmm_log_directory

    if not kwargs.pop("processes", True):
        raise ValueError(
            "Processes are necessary in order to use multiple GPUs with Dask"
        )

    if jit_unspill is None:
        self.jit_unspill = dask.config.get("jit-unspill", default=False)
    else:
        self.jit_unspill = jit_unspill

    data = kwargs.pop("data", None)
    if data is None:
        if self.jit_unspill:
            data = (
                ProxifyHostFile,
                {
                    "device_memory_limit": self.device_memory_limit,
                },
            )
        else:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit": self.device_memory_limit,
                    "memory_limit": self.host_memory_limit,
                    "local_directory": local_directory
                    or dask.config.get("temporary-directory")
                    or os.getcwd(),
                    "log_spilling": log_spilling,
                },
            )

    if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
        if protocol is None:
            protocol = "ucx"
        elif protocol != "ucx":
            raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")

    if ucx_net_devices == "auto":
        if _ucx_111:
            warnings.warn(
                "Starting with UCX 1.11, `ucx_net_devices='auto'` is deprecated, "
                "it should now be left unspecified for the same behavior. "
                "Please make sure to read the updated UCX Configuration section in "
                "https://docs.rapids.ai/api/dask-cuda/nightly/ucx.html, "
                "where significant performance considerations for InfiniBand with "
                "UCX 1.11 and above are documented.",
            )
        else:
            try:
                from ucp._libs.topological_distance import (  # NOQA
                    TopologicalDistance,
                )
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support"
                )
    elif ucx_net_devices == "":
        raise ValueError("ucx_net_devices can not be an empty string")
    self.ucx_net_devices = ucx_net_devices
    self.set_ucx_net_devices = enable_infiniband
    self.host = kwargs.get("host", None)

    initialize(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_nvlink=enable_nvlink,
        enable_infiniband=enable_infiniband,
        enable_rdmacm=enable_rdmacm,
        net_devices=ucx_net_devices,
        cuda_device_index=0,
    )

    if worker_class is not None:
        from functools import partial

        worker_class = partial(
            LoggedNanny if log_spilling is True else Nanny,
            worker_class=worker_class,
        )

    super().__init__(
        n_workers=0,
        threads_per_worker=threads_per_worker,
        memory_limit=self.host_memory_limit,
        processes=True,
        data=data,
        local_directory=local_directory,
        protocol=protocol,
        worker_class=worker_class,
        config={
            "ucx": get_ucx_config(
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_nvlink=enable_nvlink,
                enable_infiniband=enable_infiniband,
                enable_rdmacm=enable_rdmacm,
            )
        },
        **kwargs,
    )

    self.new_spec["options"]["preload"] = self.new_spec["options"].get(
        "preload", []
    ) + ["dask_cuda.initialize"]
    self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
        "preload_argv", []
    ) + ["--create-cuda-context"]

    self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
    self.scale(n_workers)
    self.sync(self._correct_state)
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(
    self,
    scheduler=None,
    host=None,
    nthreads=1,
    name=None,
    memory_limit="auto",
    device_memory_limit="auto",
    rmm_pool_size=None,
    rmm_maximum_pool_size=None,
    rmm_managed_memory=False,
    rmm_async=False,
    rmm_log_directory=None,
    pid_file=None,
    resources=None,
    dashboard=True,
    dashboard_address=":0",
    local_directory=None,
    shared_filesystem=None,
    scheduler_file=None,
    interface=None,
    preload=[],
    dashboard_prefix=None,
    security=None,
    enable_tcp_over_ucx=None,
    enable_infiniband=None,
    enable_nvlink=None,
    enable_rdmacm=None,
    net_devices=None,
    jit_unspill=None,
    worker_class=None,
    **kwargs,
):
    # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
    # initialization happens before we can set CUDA_VISIBLE_DEVICES
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if nthreads < 1:
        raise ValueError("nthreads must be higher than 0.")

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.pop("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None, **kwargs}

    if (
        not scheduler
        and not scheduler_file
        and dask.config.get("scheduler-address", None) is None
    ):
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if isinstance(scheduler, Cluster):
        scheduler = scheduler.scheduler_address

    if interface and host:
        raise ValueError("Can not specify both interface and host")

    if rmm_pool_size is not None or rmm_managed_memory:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        if rmm_async:
            raise ValueError(
                "RMM pool and managed memory are incompatible with asynchronous "
                "allocator"
            )
        if rmm_pool_size is not None:
            rmm_pool_size = parse_bytes(rmm_pool_size)
            if rmm_maximum_pool_size is not None:
                rmm_maximum_pool_size = parse_bytes(rmm_maximum_pool_size)
    else:
        if enable_nvlink:
            warnings.warn(
                "When using NVLink we recommend setting a "
                "`rmm_pool_size`. Please see: "
                "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                "#important-notes for more details"
            )

    if enable_nvlink and rmm_managed_memory:
        raise ValueError(
            "RMM managed memory and NVLink are currently incompatible."
        )

    if _ucx_111 and net_devices == "auto":
        warnings.warn(
            "Starting with UCX 1.11, `ucx_net_devices='auto'` is deprecated, "
            "it should now be left unspecified for the same behavior. "
            "Please make sure to read the updated UCX Configuration section in "
            "https://docs.rapids.ai/api/dask-cuda/nightly/ucx.html, "
            "where significant performance considerations for InfiniBand with "
            "UCX 1.11 and above are documented.",
        )

    # Ensure this parent dask-cuda-worker process uses the same UCX
    # configuration as child worker processes created by it.
    initialize(
        create_cuda_context=False,
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
        enable_rdmacm=enable_rdmacm,
        net_devices=net_devices,
        cuda_device_index=0,
    )

    if jit_unspill is None:
        self.jit_unspill = dask.config.get("jit-unspill", default=False)
    else:
        self.jit_unspill = jit_unspill

    if self.jit_unspill:
        data = lambda i: (
            ProxifyHostFile,
            {
                "device_memory_limit": parse_device_memory_limit(
                    device_memory_limit, device_index=i
                ),
                "memory_limit": memory_limit,
                "local_directory": local_directory,
                "shared_filesystem": shared_filesystem,
            },
        )
    else:
        data = lambda i: (
            DeviceHostFile,
            {
                "device_memory_limit": parse_device_memory_limit(
                    device_memory_limit, device_index=i
                ),
                "memory_limit": memory_limit,
                "local_directory": local_directory,
            },
        )

    self.nannies = [
        Nanny(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            dashboard=dashboard,
            dashboard_address=dashboard_address,
            http_prefix=dashboard_prefix,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=_get_interface(interface, host, i, net_devices),
            host=host,
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=security,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={
                CPUAffinity(
                    get_cpu_affinity(nvml_device_index(i, cuda_visible_devices(i)))
                ),
                RMMSetup(
                    rmm_pool_size,
                    rmm_maximum_pool_size,
                    rmm_managed_memory,
                    rmm_async,
                    rmm_log_directory,
                ),
            },
            name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
            local_directory=local_directory,
            config={
                "distributed.comm.ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=data(nvml_device_index(i, cuda_visible_devices(i))),
            worker_class=worker_class,
            **kwargs,
        )
        for i in range(nprocs)
    ]
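# A hedged launch sketch: the __init__ variants above back the dask-cuda-worker
# CLI, whose option names are assumed here to follow the parameter names shown
# (e.g. --nthreads, --memory-limit, --rmm-pool-size); values are illustrative.
def _example_launch_cli_worker(scheduler_address="tcp://127.0.0.1:8786"):
    import subprocess

    # Starts one worker process per visible GPU, mirroring the nannies list above.
    return subprocess.Popen(
        [
            "dask-cuda-worker",
            scheduler_address,
            "--nthreads", "1",
            "--memory-limit", "auto",
            "--rmm-pool-size", "12GB",
        ]
    )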
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    local_dir = kwargs.get("local_dir", "dask-worker-space")
    with warn_on_duration(
        "1s",
        "Creating scratch directories is taking a surprisingly long time. "
        "This is often due to running workers on a network file system. "
        "Consider specifying a local-directory to point workers to write "
        "scratch data to a local disk.",
    ):
        _workspace = WorkSpace(os.path.abspath(local_dir))
        _workdir = _workspace.new_work_dir(prefix="worker-")
        local_dir = _workdir.dir_path

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_dir": local_dir,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def __init__(
    self,
    n_workers=None,
    threads_per_worker=1,
    processes=True,
    memory_limit="auto",
    device_memory_limit=None,
    CUDA_VISIBLE_DEVICES=None,
    data=None,
    local_directory=None,
    protocol=None,
    enable_tcp_over_ucx=False,
    enable_infiniband=False,
    enable_nvlink=False,
    enable_rdmacm=False,
    ucx_net_devices=None,
    rmm_pool_size=None,
    **kwargs,
):
    if CUDA_VISIBLE_DEVICES is None:
        CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
    if isinstance(CUDA_VISIBLE_DEVICES, str):
        CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
    CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
    if n_workers is None:
        n_workers = len(CUDA_VISIBLE_DEVICES)
    self.host_memory_limit = parse_memory_limit(
        memory_limit, threads_per_worker, n_workers
    )
    self.device_memory_limit = device_memory_limit

    self.rmm_pool_size = rmm_pool_size
    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        self.rmm_pool_size = parse_bytes(self.rmm_pool_size)

    if not processes:
        raise ValueError(
            "Processes are necessary in order to use multiple GPUs with Dask"
        )

    if self.device_memory_limit is None:
        self.device_memory_limit = get_device_total_memory(0)
    elif isinstance(self.device_memory_limit, str):
        self.device_memory_limit = parse_bytes(self.device_memory_limit)

    if data is None:
        data = (
            DeviceHostFile,
            {
                "device_memory_limit": self.device_memory_limit,
                "memory_limit": self.host_memory_limit,
                "local_directory": local_directory
                or dask.config.get("temporary-directory")
                or os.getcwd(),
            },
        )

    if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
        if protocol is None:
            protocol = "ucx"
        elif protocol != "ucx":
            raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")

    if ucx_net_devices == "auto":
        try:
            from ucp._libs.topological_distance import TopologicalDistance  # noqa
        except ImportError:
            raise ValueError(
                "ucx_net_devices set to 'auto' but UCX-Py is not "
                "installed or it's compiled without hwloc support"
            )
    elif ucx_net_devices == "":
        raise ValueError("ucx_net_devices can not be an empty string")
    self.ucx_net_devices = ucx_net_devices
    self.set_ucx_net_devices = enable_infiniband
    self.host = kwargs.get("host", None)

    super().__init__(
        n_workers=0,
        threads_per_worker=threads_per_worker,
        memory_limit=self.host_memory_limit,
        processes=True,
        data=data,
        local_directory=local_directory,
        protocol=protocol,
        config={
            "ucx": get_ucx_config(
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_nvlink=enable_nvlink,
                enable_infiniband=enable_infiniband,
                enable_rdmacm=enable_rdmacm,
            )
        },
        **kwargs,
    )

    self.new_spec["options"]["preload"] = self.new_spec["options"].get(
        "preload", []
    ) + ["dask_cuda.initialize"]
    self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
        "preload_argv", []
    ) + ["--create-cuda-context"]

    self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
    self.scale(n_workers)
    self.sync(self._correct_state)