def connect(cls, *args, **kwargs):
    """
    Setup slurm cluster
    """

    # Create a slurm cluster with all the various class settings
    cls.local_cluster = cls.cluster_controller_class(
        queue=cls.queue,
        project=cls.project,
        walltime=cls.walltime,
        job_cpu=cls.job_cpu,
        cores=cls.cores,
        processes=cls.processes,
        job_mem=cls.job_mem,
        env_extra=cls.env_extra,
        interface=cls.interface,
        local_directory=cls.local_directory,
        memory=cls.memory,
        job_extra=cls.cluster_controller_options)

    # Hacks for older versions of dask_jobqueue
    try:
        if cls.worker_memory_limit == 0:
            cls.local_cluster._command_template = memory_limit_0(
                cls.local_cluster._command_template)
        if cls.hack_cluster_controller_for_NYU:
            cls.local_cluster.job_header = fix_header_for_nyu(
                cls.local_cluster.job_header, DEFAULT_NYU_HEADER)

    # Hacks for newer versions of dask_jobqueue
    except AttributeError:
        from dask_jobqueue.slurm import SLURMJob

        class SLURMJobMemLimit(SLURMJob):
            def __init__(self, *args, **kwargs):
                super(SLURMJobMemLimit, self).__init__(*args, **kwargs)
                if cls.worker_memory_limit == 0:
                    self._command_template = memory_limit_0(
                        self._command_template)
                if cls.hack_cluster_controller_for_NYU:
                    self.job_header = fix_header_for_nyu(
                        self.job_header, DEFAULT_NYU_HEADER)

        cls.local_cluster.job_cls = SLURMJobMemLimit

    if cls.control_adaptive:
        cls.local_cluster.adapt(minimum=cls.minimum_cores,
                                maximum=cls.maximum_cores,
                                interval=cls.interval,
                                wait_count=cls.wait_count)
    else:
        cls.local_cluster.adapt(minimum=cls.maximum_cores,
                                maximum=cls.maximum_cores,
                                interval=cls.interval,
                                wait_count=cls.wait_count)

    cls.local_cluster.scheduler.allowed_failures = cls.allowed_failures
    cls.client = distributed.Client(cls.local_cluster)

    return True
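# The snippet above (and several below) assume a helper `memory_limit_0` that
# rewrites the generated dask-worker command line so the worker memory nanny
# is disabled. Its real implementation is not shown here; this is a minimal
# sketch under that assumption, and the exact flag handling may differ.

import re

def memory_limit_0(command_template):
    """Force the dask-worker memory limit to 0 (disables the memory nanny)."""
    # Replace any existing --memory-limit value with 0, or append the flag
    if '--memory-limit' in command_template:
        return re.sub(r'--memory-limit[= ]\S+', '--memory-limit 0',
                      command_template)
    return command_template + ' --memory-limit 0'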
def connect(cls, *args, **kwargs):
    """
    Setup slurm cluster
    """

    # Create a slurm cluster with all the various class settings
    cls.local_cluster = cls.cluster_controller_class(
        queue=cls.queue,
        project=cls.project,
        walltime=cls.walltime,
        job_cpu=cls.job_cpu,
        cores=cls.cores,
        processes=cls.processes,
        job_mem=cls.job_mem,
        env_extra=cls.env_extra,
        interface=cls.interface,
        local_directory=cls.local_directory,
        memory=cls.memory,
        job_extra=cls.cluster_controller_options)

    # Deactivate the worker memory nanny
    if cls.worker_memory_limit == 0:
        cls.local_cluster._command_template = memory_limit_0(
            cls.local_cluster._command_template)

    # Rewrite the command headers so that the SLURM controller
    # will work with the NYU prince cluster
    if cls.hack_cluster_controller_for_NYU:
        cls.local_cluster.job_header = fix_header_for_nyu(
            cls.local_cluster.job_header, DEFAULT_NYU_HEADER)

    if cls.control_adaptive:
        cls.local_cluster.adapt(minimum=cls.minimum_cores,
                                maximum=cls.maximum_cores,
                                interval=cls.interval,
                                wait_count=cls.wait_count)
    else:
        cls.local_cluster.adapt(minimum=cls.maximum_cores,
                                maximum=cls.maximum_cores,
                                interval=cls.interval,
                                wait_count=cls.wait_count)

    cls.local_cluster.scheduler.allowed_failures = cls.allowed_failures
    cls.client = distributed.Client(cls.local_cluster)

    return True
def connect(cls, *args, **kwargs):
    """
    Setup slurm cluster
    """

    # Create a slurm cluster with all the various class settings
    cls._local_cluster = cls._cluster_controller_class(
        queue=cls._queue,
        project=cls._project,
        interface=cls._interface,
        walltime=cls._job_time,
        job_cpu=cls._job_n_workers * cls._worker_n_threads,
        cores=cls._job_n_workers * cls._worker_n_threads,
        processes=cls._job_n_workers,
        job_mem=cls._job_mem,
        env_extra=cls._config_env(),
        local_directory=cls._local_directory,
        memory=cls._job_mem,
        job_extra=cls._job_slurm_commands,
        job_cls=SLURMJobNoMemLimit)

    cls.client = distributed.Client(cls._local_cluster, direct_to_workers=True)

    cls._add_local_node_workers(cls._num_local_workers)
    cls._tracker = WorkerTracker()

    utils.Debug.vprint("Dask dashboard: {cl}".format(cl=cls.client.dashboard_link), level=0)

    return True
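# `SLURMJobNoMemLimit` is passed as `job_cls` above but not defined in this
# snippet. A plausible sketch, assuming a dask_jobqueue version where per-job
# behaviour is customised by subclassing SLURMJob (mirroring the
# SLURMJobMemLimit hack shown earlier); the template edit reuses the
# `memory_limit_0` helper sketched above.

from dask_jobqueue.slurm import SLURMJob

class SLURMJobNoMemLimit(SLURMJob):
    """SLURM job whose dask workers run with the memory nanny disabled."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Zero out the worker memory limit in the generated command line
        self._command_template = memory_limit_0(self._command_template)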
def connect(cls, scheduler_uri, *args, client_kwargs: Optional[dict] = None, **kwargs):
    """
    Connect to a remote dask scheduler.

    Parameters
    ----------
    scheduler_uri: str
        Compatible with the :code:`address` parameter of :class:`distributed.Client`.
    client_kwargs: dict or None
        Passed as kwargs to :class:`distributed.Client`.
        :code:`client_kwargs['set_as_default']` is set to :code:`False`
        unless specified otherwise to avoid interference with Dask-based workflows.
        Pass :code:`client_kwargs={'set_as_default': True}` to set the Client as the
        default Dask scheduler and keep it running when the Context closes.
    *args, **kwargs:
        Passed to :class:`DaskJobExecutor`.

    Returns
    -------
    DaskJobExecutor
        the connected JobExecutor
    """
    if client_kwargs is None:
        client_kwargs = {}
    if client_kwargs.get('set_as_default') is None:
        client_kwargs['set_as_default'] = False
    is_local = not client_kwargs['set_as_default']
    client = dd.Client(address=scheduler_uri, **client_kwargs)
    return cls(client=client, is_local=is_local, *args, **kwargs)
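# Hedged usage sketch for `connect` above. The class name `DaskJobExecutor`
# comes from the docstring; the scheduler address is illustrative only.

executor = DaskJobExecutor.connect(
    'tcp://127.0.0.1:8786',
    client_kwargs={'set_as_default': False},  # matches the default applied above
)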
def local_cluster_url():
    """
    Shared dask cluster, can be used repeatedly by different executors.

    This allows numba caching across tests, without sharing the executor,
    for example.
    """
    cluster_port = find_unused_port()
    devices = detect()
    # Only use at most 2 CPUs and 1 GPU
    spec = cluster_spec(cpus=devices['cpus'][:2],
                        cudas=devices['cudas'][:1],
                        has_cupy=devices['has_cupy'])

    cluster_kwargs = {
        'silence_logs': logging.WARN,
        'scheduler': {
            'cls': Scheduler,
            'options': {'port': cluster_port},
        },
    }

    with set_num_threads_env(1, set_numba=False):
        cluster = dd.SpecCluster(workers=spec, **cluster_kwargs)
        client = dd.Client(cluster, set_as_default=False)
        client.wait_for_workers(len(spec))
        client.close()

    yield 'tcp://localhost:%d' % cluster_port

    cluster.close()
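# `local_cluster_url` is a generator, which suggests it backs a pytest
# fixture. A sketch of how it might be declared and consumed; the session
# scope and the test body are assumptions, not part of the original snippet.

import pytest
import dask.distributed as dd

@pytest.fixture(scope='session')
def shared_cluster_url():
    # Delegate setup/teardown to the generator above
    yield from local_cluster_url()

def test_on_shared_cluster(shared_cluster_url):
    with dd.Client(shared_cluster_url, set_as_default=False) as client:
        assert client.scheduler_info()['workers']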
def wrapped(df, *args, **kwargs):
    if cluster is None:
        my_cluster = get_or_create_cluster()
    else:
        my_cluster = cluster
    client = dd.Client(my_cluster)
    try:
        task_queue = dd.Queue()
        result_queue = dd.Queue()
        workers = []
        for _ in range(len(my_cluster.workers)):
            w = client.submit(worker, task_queue, result_queue, args, kwargs)
            workers.append(w)
        with tempfile.TemporaryDirectory(prefix=TMPDIR_PREFIX) as tmpdir:
            LOG.info('pandas parallel tmpdir={}', tmpdir)
            n_tasks = 0
            for df_path in parallel_df.stream_split(df, tmpdir):
                task_queue.put(df_path)
                n_tasks += 1
            # Sentinel marking the end of the task stream
            task_queue.put(_TASK_END)
            range_n_tasks = range(n_tasks)
            if progress_bar:
                range_n_tasks = tqdm.tqdm(range_n_tasks, ncols=80, ascii=True)
            if iterator:
                return _get_result_iterator(range_n_tasks, result_queue)
            else:
                return _get_result_dataframe(range_n_tasks, result_queue)
    finally:
        client.close()
def make_local(cls, spec=None, cluster_kwargs=None, client_kwargs=None):
    """
    Spin up a local dask cluster

    interesting cluster_kwargs:
        threads_per_worker
        n_workers

    Returns
    -------
    DaskJobExecutor
        the connected JobExecutor
    """
    if spec is None:
        spec = cluster_spec(**detect())
    if client_kwargs is None:
        client_kwargs = {}
    if client_kwargs.get('set_as_default') is None:
        client_kwargs['set_as_default'] = False
    if cluster_kwargs is None:
        cluster_kwargs = {}
    if cluster_kwargs.get('silence_logs') is None:
        cluster_kwargs['silence_logs'] = logging.WARN
    cluster = dd.SpecCluster(workers=spec, **cluster_kwargs)
    client = dd.Client(cluster, **client_kwargs)
    return cls(client=client, is_local=True, lt_resources=True)
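# Usage sketch for the spec-based `make_local` above. `DaskJobExecutor` is the
# class named in the docstring, and the explicit spec mirrors the
# `cluster_spec(...)` call in `local_cluster_url` above. Note that
# `cluster_kwargs` here are forwarded to `dd.SpecCluster`, not
# `dd.LocalCluster`, so LocalCluster-only kwargs such as `n_workers` would not
# apply despite the docstring.

devices = detect()
executor = DaskJobExecutor.make_local(
    spec=cluster_spec(cpus=devices['cpus'][:2],    # at most 2 CPU workers
                      cudas=devices['cudas'][:1],  # at most 1 GPU worker
                      has_cupy=devices['has_cupy']),
)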
def main(path, scheduler_uri, stackheight, dtype, scan_size, detector_size, bench,
         which, skip, num_workers, num_nodes, warmup_rounds):
    scan_size = tuple(int(x) for x in scan_size.split(","))
    detector_size = tuple(int(x) for x in detector_size.split(","))
    (a, b) = scan_size
    (f, g) = detector_size

    # We have to make sure that this happens before numpy is loaded.
    if bench == 'libertem':
        for k in ['OMP_NUM_THREADS', 'OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS']:
            os.environ[k] = '1'

    import numpy as np

    # np.bool was removed from recent numpy releases; the builtin bool is equivalent
    roi = np.zeros(shape=scan_size, dtype=bool)
    roi[(a * 5) // 10:(a * 6) // 10, (b * 2) // 10:(b * 3) // 10] = True

    mask = np.zeros(shape=detector_size, dtype=np.float32)
    mask[(f * 5) // 10:(f * 6) // 10, (g * 2) // 10:(g * 3) // 10] = 1

    if bench == 'libertem':
        from LiberTEMBenchmark import LiberTEMBenchmark as Bm
    elif bench == 'numpy':
        from NumpyBenchmark import NumpyBenchmark as Bm
    elif bench == 'dask.distributed':
        import dask.distributed as dd
        # Becomes the default client used by dask.distributed computations
        dd.Client(address=scheduler_uri)
        from DaskBenchmark import DaskBenchmark as Bm
    elif bench == 'dask':
        from DaskBenchmark import DaskBenchmark as Bm

    # Use a distinct name so we don't shadow `b` from scan_size above
    bm = Bm(path, dtype, scan_size, detector_size, warmup_rounds, roi, mask)
    print(json.dumps(bm.bench_all(which, skip)))
def test_imports():
    client = dd.Client("127.0.0.1:8786")
    futures = client.map(remote_fiona_import, range(10))
    versions = client.gather(futures)
    print(f"remote fiona import test complete - found version {versions[0][:-1]}")
def test_parallel(self, testcluster):
    # repeat selected test w/parallel processing engine
    client = dd.Client(testcluster)
    par_tests = ["test_dataselection"]
    for test in par_tests:
        getattr(self, test)()
    client.close()
def client_startup(cluster, n_jobs: int, total_workers: int):
    """
    Start a dask client

    Args:
        cluster: Dask cluster
        n_jobs: number of jobs to submit to the cluster
        total_workers: number of total workers in the cluster

    Returns:
        DaskClient instance, list of worker IDs
    """
    if n_jobs <= 0:
        raise ValueError("n_jobs must be equal to or greater than 1!")
    if isinstance(cluster, (daskD.LocalCluster, KubeCluster)):
        cluster.scale(n_jobs)
    else:
        cluster.scale(jobs=n_jobs)
    # Creating dask Client
    client = daskD.Client(cluster)
    workers = 0
    t0 = time()
    while workers < total_workers:
        workers = len(client.get_worker_logs().keys())
        # If the number of workers is not reached in 5 minutes, raise an exception
        if time() - t0 > 300.0:
            raise SystemError("Dask could not start the requested workers within 5 minutes! "
                              "Try a different n_jobs.")
    WorkerIds = list(client.get_worker_logs().keys())
    return client, WorkerIds
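# Usage sketch for `client_startup` with a local cluster, where one scaled
# "job" equals one worker, so n_jobs == total_workers. Values are illustrative.

import dask.distributed as daskD

cluster = daskD.LocalCluster(n_workers=0, processes=True)
client, worker_ids = client_startup(cluster, n_jobs=4, total_workers=4)
print(len(worker_ids))  # 4, once all workers have registered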
def initialize_dask(n, factor=5, slurm=False):
    if not slurm:
        cores = len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(processes=False,
                                           n_workers=1,
                                           threads_per_worker=1)
    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python': py,
            'cores': 1,
            'memory': '512MB',
            'walltime': '180',
            'processes': 1,
            'job_extra': [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"',
            ],
            'env_extra': [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ],
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
def connect_to_scheduler(self, scheduler_address):
    """
    Start a connection to a running scheduler. This call does not block
    the Python interpreter.

    Args:
        scheduler_address: Address of a running scheduler of the form 'ip-address:port'

    Example:
        >>> connect_to_scheduler('127.0.0.1:8791')

    References:
        - http://distributed.readthedocs.io/en/latest/client.html
        - http://distributed.readthedocs.io/en/latest/api.html#distributed.client.Client
    """
    from dask import distributed

    # Precondition
    _verify_address(scheduler_address)

    # Establish connection
    try:
        self._connection = distributed.Client(scheduler_address)
    except Exception as err:
        raise ConnectionError(
            'Could not connect to scheduler. Did you start it? '
            'Is it still running? Is the address correct?\n{}'.format(err))
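# `_verify_address` is called above but not shown. A minimal sketch of the
# precondition it presumably enforces ('ip-address:port' shape, per the
# docstring); the real check may be stricter.

def _verify_address(scheduler_address):
    host, sep, port = scheduler_address.rpartition(':')
    if not sep or not host or not port.isdigit():
        raise ValueError("scheduler_address must look like 'ip-address:port', "
                         "got {!r}".format(scheduler_address))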
def make_local(cls, cluster_kwargs=None, client_kwargs=None):
    """
    interesting cluster_kwargs:
        threads_per_worker
        n_workers
    """
    cluster = dd.LocalCluster(**(cluster_kwargs or {}))
    client = dd.Client(cluster, **(client_kwargs or {}))
    return cls(client=client, is_local=True)
def client_in_background():
    # A running Dask client can introduce a timing issue between automatic
    # closing of a numpy.memmap object and renaming the underlying file
    with dd.LocalCluster() as cluster:
        client = dd.Client(cluster, set_as_default=False)
        yield
        # Close explicitly to fix "distributed.client - ERROR - Failed to
        # reconnect to scheduler after 10.00 seconds, closing client"
        client.close()
def test_parallel(self, testcluster):
    # collect all tests of current class and repeat them in parallel
    client = dd.Client(testcluster)
    all_tests = [attr for attr in self.__dir__()
                 if (inspect.ismethod(getattr(self, attr)) and attr != "test_parallel")]
    for test in all_tests:
        getattr(self, test)()
        flush_local_cluster(testcluster)
    client.close()
def client(tmpdir_factory, request):
    with tmpdir_factory.mktemp("dask_cluster").as_cwd():
        lc = distributed.LocalCluster(n_workers=request.param, processes=True)
        client = distributed.Client(lc)
        yield client
        client.close()
        lc.close()
def test_pickling():
    def pxfn(fp):
        """Pixel function for recipe. `ds` lives in the closure"""
        return np.ones(fp.shape) * id(ds)

    def slave():
        """Slave process routine. `ds` and `oldid` live in the closure and are
        pickled by cloudpickle in Client.submit"""
        print('slave', dd.get_worker())
        assert id(ds) != oldid, 'this test makes sense if `ds` was pickled'
        assert 'v1' in ds
        assert 'v2' not in ds
        assert 'r1' in ds
        assert 'r2' not in ds
        assert 'r3' in ds
        assert (ds._queued_count, ds._locked_count, ds.v1.activated,
                ds.r1.activated, ds.r3.activated) == (0, 0, False, False, True)
        assert ds.v1.get_data(0)[1] == str(oldid)
        assert (ds.r1.get_data() == oldid).all()
        assert (ds.r3.get_data() == id(ds)).all(), '`slave` and `pxfn` should share the same `ds` obj'
        ds.v1.insert_data((0, 1), ['42'])
        ds.r1.fill(42)
        assert ds.v1.get_data(1)[1] == '42'
        assert (ds.r1.get_data() == 42).all()
        ds.deactivate_all()

    ds = buzz.DataSource(max_activated=2)
    oldid = id(ds)
    fp = buzz.Footprint(
        tl=(1, 1),
        size=(10, 10),
        rsize=(10, 10),
    )
    clust = dd.LocalCluster(n_workers=1, threads_per_worker=1, scheduler_port=0)
    print()
    print(clust)
    cl = dd.Client(clust)
    print(cl)

    with ds.create_vector('v1', '/tmp/v1.shp', **V_META).delete:
        with ds.create_raster('r1', '/tmp/t1.shp', fp, float, 1).delete:
            ds.create_raster('r2', '', fp, float, 1, driver='MEM')
            ds.create_recipe_raster('r3', pxfn, fp, float)
            ds.create_vector('v2', **MEMV_META)

            ds.v1.insert_data((0, 1), [str(oldid)])
            ds.v2.insert_data((0, 1), [str(oldid)])
            ds.r1.fill(oldid)
            ds.r2.fill(oldid)
            ds.deactivate_all()

            cl.submit(slave).result()

            assert ds.v1.get_data(1)[1] == '42'
            assert (ds.r1.get_data() == 42).all()
def connect(cls, scheduler_uri, *args, **kwargs):
    """
    Connect to a remote dask scheduler

    Returns
    -------
    DaskJobExecutor
        the connected JobExecutor
    """
    client = dd.Client(address=scheduler_uri)
    return cls(client=client, is_local=False, *args, **kwargs)
def test_parallel(self, testcluster):
    # repeat selected test w/parallel processing engine
    client = dd.Client(testcluster)
    par_tests = [
        "test_relative_array_padding",
        "test_absolute_nextpow2_array_padding",
        "test_object_padding",
        "test_dataselection",
    ]
    for test in par_tests:
        getattr(self, test)()
    client.close()
def create_client():
    """
    Initializes a `dask.distributed` client for local computing.
    """
    cores = len(os.sched_getaffinity(0))
    nworkers = int(np.ceil(cores / 8))
    cluster = distributed.LocalCluster(n_workers=nworkers,
                                       threads_per_worker=min(cores, 8),
                                       resources={'foo': nworkers})
    print(cluster)
    print(cluster.dashboard_link)
    client = distributed.Client(cluster)
    return client
def __init__(self, scheduler_uri=None, client=None, is_local=False):
    self.is_local = is_local
    if client is not None:
        if scheduler_uri:
            raise ValueError("pass either client or scheduler_uri, not both")
        self.client = client
    else:
        # client is None here, so no conflict is possible; connect ourselves
        self.client = dd.Client(scheduler_uri, processes=False)
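# Hedged sketch of the two mutually exclusive constructor modes above. The
# class name `DaskJobExecutor` is inferred from the related snippets in this
# file; the address is illustrative.

ex_from_uri = DaskJobExecutor(scheduler_uri='tcp://127.0.0.1:8786')
ex_adopted = DaskJobExecutor(client=dd.Client(processes=False))  # wraps an existing client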
def compute_reconstruction_error(self, lams_reg, gnd_truth):
    """Compute the reconstruction error against the ground truth, for each
    regularization weight.

    Parameters
    ----------
    lams_reg : numpy.array_like
        List of regularization weights.
    gnd_truth : numpy.array_like
        Expected reconstruction.

    Returns
    -------
    err_l1 : numpy.array_like
        l1-norm errors for each reconstruction.
    err_l2 : numpy.array_like
        l2-norm errors for each reconstruction.
    """
    if self.verbose:
        print("Computing reconstruction error:")
        print("- Regularization weights range: [%g, %g] in %d steps"
              % (lams_reg[0], lams_reg[-1], len(lams_reg)))
    c = tm.time()

    err_l1 = np.empty((len(lams_reg),))
    err_l2 = np.empty((len(lams_reg),))

    if self.parallel_eval:
        client = dd.Client()
        r = client.map(self.compute_reconstruction_and_loss, lams_reg)

    for ii_l, lam in enumerate(lams_reg):
        if self.parallel_eval:
            _, rec = r[ii_l].result()
        else:
            _, rec = self.compute_reconstruction_and_loss(lam)
        residual = np.abs(gnd_truth - rec)
        err_l1[ii_l] = np.linalg.norm(residual.ravel(), ord=1)
        err_l2[ii_l] = np.linalg.norm(residual.ravel(), ord=2) ** 2

    if self.verbose:
        print("Done in %g seconds.\n" % (tm.time() - c))

    if self.plot_result:
        f, axs = plt.subplots(2, 1, sharex=True)
        axs[0].set_xscale("log", nonpositive="clip")
        axs[0].plot(lams_reg, err_l1, label="Error - l1-norm")
        axs[0].legend()
        axs[1].set_xscale("log", nonpositive="clip")
        axs[1].plot(lams_reg, err_l2, label="Error - l2-norm ^ 2")
        axs[1].legend()
        plt.show(block=False)

    return err_l1, err_l2
def connect(cls, *args, **kwargs):
    """
    Setup local cluster
    """
    kwargs["n_workers"] = kwargs.pop("n_workers", cls.processes)
    kwargs["threads_per_worker"] = kwargs.pop("threads_per_worker", 1)
    kwargs["processes"] = kwargs.pop("processes", True)
    kwargs["local_dir"] = kwargs.pop("local_dir", cls.local_dir)

    cls.local_cluster = distributed.LocalCluster(*args, **kwargs)
    cls.client = distributed.Client(cls.local_cluster)

    return True
def test_corr_parallel(self, testcluster=None):
    ppl.ioff()
    client = dd.Client(testcluster)
    all_tests = [attr for attr in self.__dir__()
                 if (inspect.ismethod(getattr(self, attr)) and 'parallel' not in attr)]
    for test in all_tests:
        test_method = getattr(self, test)
        test_method()
    client.close()
    ppl.ion()
def test_dask_unchunked_input(self):
    p = da.from_array(self.p_def)
    t = da.from_array(self.t_def)
    q = da.from_array(self.q_def)

    # Start dask cluster
    cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
    print(cluster.dashboard_link)
    client = dd.Client(cluster)

    out = map_blocks(relhum, t, q, p).compute()
    assert np.allclose(out, self.rh_gt_2, atol=0.1)

    client.shutdown()
def test_dask_chunked_input(self):
    p = da.from_array(self.p_def, chunks="auto")
    t = da.from_array(self.t_def, chunks="auto")
    q = da.from_array(self.q_def, chunks="auto")

    # Start dask cluster
    cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
    print(cluster.dashboard_link)
    client = dd.Client(cluster)

    out = client.submit(relhum, t, q, p).result()
    assert np.allclose(out, self.rh_gt_2, atol=0.1)

    client.shutdown()
def test_dask_chunked_input(self):
    tk = da.from_array(np.asarray(t_def) + 273.15, chunks="auto")
    rh = da.from_array(rh_def, chunks="auto")

    # Start dask cluster
    cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
    print(cluster.dashboard_link)
    client = dd.Client(cluster)

    out = map_blocks(dewtemp, tk, rh).compute()
    assert np.allclose(out - 273.15, dt_2, atol=0.1)

    cluster.close()
    client.close()
def flush_local_cluster(testcluster, timeout=10):
    """
    Resets a parallel computing client to avoid memory spilling
    """
    if isinstance(testcluster, dd.LocalCluster):
        # client.restart()
        client = dd.get_client()
        client.close()
        time.sleep(1.0)
        client = dd.Client(testcluster)
        waiting = 0
        # Wait until the cluster reports workers again (or the timeout expires)
        while len([w["memory_limit"] for w in testcluster.scheduler_info["workers"].values()]) == 0 \
                and waiting < timeout:
            time.sleep(1.0)
            waiting += 1
    return
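# Usage sketch for `flush_local_cluster`: reset worker state between
# memory-hungry test runs. It assumes the current client was created against
# `cluster` and registered as the default client, since the function
# retrieves it via `dd.get_client()`.

import dask.distributed as dd

cluster = dd.LocalCluster(n_workers=2, threads_per_worker=1)
client = dd.Client(cluster)  # becomes the default client
# ... run a memory-hungry parallel test ...
flush_local_cluster(cluster)  # closes the old client and waits for fresh workers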
def make_local(cls, cluster_kwargs=None, client_kwargs=None):
    """
    Spin up a local dask cluster

    interesting cluster_kwargs:
        threads_per_worker
        n_workers

    Returns
    -------
    DaskJobExecutor
        the connected JobExecutor
    """
    cluster = dd.LocalCluster(**(cluster_kwargs or {}))
    client = dd.Client(cluster, **(client_kwargs or {}))
    return cls(client=client, is_local=True)