import os
import tempfile

import dask
import dask.array as da
from distributed import Client, performance_report

# parse_args, tic, and rechunk are helpers defined elsewhere in this file.


def main(args=None):
    args = parse_args()
    with tempfile.TemporaryDirectory() as directory:
        original = os.path.join(directory, args.original)
        split = os.path.join(directory, args.split)
        final = os.path.join(directory, args.final)

        shape = (args.n_slices,) + args.shape
        chunks = (1,) + args.shape

        # Write a random source array to zarr so it can be rechunked on disk.
        a = da.random.random(shape, chunks=chunks)
        a.to_zarr(original, overwrite=True)

        with Client():
            print("rechunking")
            t0 = tic()
            # performance_report() writes "dask-report.html" by default.
            with performance_report():
                rechunk.rechunk(original, split, final, args.split_chunks)
            t1 = tic()

            took = t1 - t0
            gbs = a.nbytes / 1e9 / took
            print(
                f"Rechunked {dask.utils.format_bytes(a.nbytes)} "
                f"in {took:.2f}s ({gbs:0.2f} GB/s)"
            )
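# Each snippet in this collection wraps the work it wants to profile in
# distributed.performance_report, which records scheduler diagnostics for
# everything computed inside the block and writes them to a standalone HTML
# file. A minimal, self-contained sketch of that pattern (the file name and
# array size below are arbitrary illustrations, not taken from the snippets):
import dask.array as da
from distributed import Client, performance_report

if __name__ == "__main__":
    with Client():  # local cluster with default settings; compute() uses it implicitly
        x = da.random.random((20_000, 20_000), chunks=(2_000, 2_000))
        # Only work computed while this context is open appears in the report.
        with performance_report(filename="dask-report.html"):
            x.dot(x.T).mean().compute()
        print("report written to dask-report.html")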
import json
import logging
import os
import time
from datetime import datetime

import fsspec
from distributed import Client, performance_report

# MultiscaleStorageSpec, prepare_multiscale_storage, and logger are defined
# elsewhere in this module.


def main(config_json: str, dry: bool = False, scheduler: str = ''):
    now_str = datetime.now().strftime('%Y:%m:%d:%H:%M:%S')

    with fsspec.open(config_json) as fh:
        json_blob = json.load(fh)
    spec = MultiscaleStorageSpec(**json_blob)

    if not os.path.exists(spec.logging_dir):
        os.makedirs(spec.logging_dir)
    logger.addHandler(
        logging.FileHandler(filename=os.path.join(
            spec.logging_dir, f'multiscale_generation_{now_str}.log')))
    logger.addHandler(logging.StreamHandler())
    logger.info(f'Loaded MultiscaleStorageSpec: {spec.json(indent=2)}')

    # Build the (lazy) multiscale storage operation; nothing is computed yet.
    store_op = prepare_multiscale_storage(
        source=spec.source.url,
        source_chunks=spec.source.chunks,
        dest=spec.destination.url,
        dest_chunks=spec.destination.chunks,
        dest_access_mode=spec.destination.access_mode,
        downsampling_method=spec.downsampling_spec.method,
        downsampling_factors=spec.downsampling_spec.factors,
        downsampling_levels=spec.downsampling_spec.levels,
        downsampling_chunks=spec.downsampling_spec.chunks)

    if spec.cluster_spec.deployment == 'dask_local':
        from distributed import LocalCluster
        clusterClass = LocalCluster
    elif spec.cluster_spec.deployment == 'dask_lsf':
        from dask_jobqueue import LSFCluster
        from functools import partial
        clusterClass = partial(
            LSFCluster,
            ncpus=spec.cluster_spec.worker.num_cores,
            mem=f'{15 * spec.cluster_spec.worker.num_cores}GB',
            processes=spec.cluster_spec.worker.num_cores)
        clusterClass.__name__ = LSFCluster.__name__

    if not dry:
        logger.info(
            f'Creating an instance of {clusterClass.__name__} and scaling to '
            f'{spec.cluster_spec.worker.num_workers} workers')
        with clusterClass() as clust, Client(clust) as cl, performance_report(
                os.path.join(
                    spec.logging_dir,
                    f'dask_distributed_perf_report_{now_str}.html')):
            cl.cluster.scale(spec.cluster_spec.worker.num_workers)
            logger.info(f'Cluster dashboard url: {cl.cluster.dashboard_link}')
            logger.info(
                f'Begin saving multiscale data to {spec.destination.url}')
            start = time.time()
            futures = cl.compute(store_op)
            results = cl.gather(futures)
            end = time.time()
            logger.info(f'Done saving multiscale data after {end - start}s')
import sys
from contextlib import contextmanager, nullcontext
from typing import Iterator

# Method of a Dask-based executor class; self.address, self.cluster_class,
# self.cluster_kwargs, self.client_kwargs, self.adapt_kwargs, and
# self.performance_report_path are set in the class's __init__.


@contextmanager
def start(self) -> Iterator[None]:
    """
    Context manager for initializing execution.

    Creates a `dask.distributed.Client` and yields it.
    """
    if sys.platform != "win32":
        # Fix for https://github.com/dask/distributed/issues/4168
        import multiprocessing.popen_spawn_posix  # noqa
    from distributed import Client, performance_report

    performance_report_context = (
        performance_report(self.performance_report_path)
        if self.performance_report_path
        else nullcontext()
    )

    try:
        if self.address is not None:
            self.logger.info(
                "Connecting to an existing Dask cluster at %s", self.address
            )
            with Client(self.address, **self.client_kwargs) as client:
                with performance_report_context:
                    self.client = client
                    try:
                        self._pre_start_yield()
                        yield
                    finally:
                        self._post_start_yield()
        else:
            assert callable(self.cluster_class)  # mypy
            assert isinstance(self.cluster_kwargs, dict)  # mypy
            self.logger.info(
                "Creating a new Dask cluster with `%s.%s`...",
                self.cluster_class.__module__,
                self.cluster_class.__qualname__,
            )
            with self.cluster_class(**self.cluster_kwargs) as cluster:
                if getattr(cluster, "dashboard_link", None):
                    self.logger.info(
                        "The Dask dashboard is available at %s",
                        cluster.dashboard_link,
                    )
                if self.adapt_kwargs:
                    cluster.adapt(**self.adapt_kwargs)
                with Client(cluster, **self.client_kwargs) as client:
                    with performance_report_context:
                        self.client = client
                        try:
                            self._pre_start_yield()
                            yield
                        finally:
                            self._post_start_yield()
    finally:
        self.client = None
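# Hypothetical usage of the start() context manager above. The executor class
# name ("DaskExecutor") and its constructor arguments are illustrative
# assumptions; the snippet only shows the start() method itself.
from distributed import LocalCluster

executor = DaskExecutor(
    cluster_class=LocalCluster,
    cluster_kwargs={"n_workers": 2},
    performance_report_path="executor-report.html",
)
with executor.start():
    # While the context is open, executor.client is a live distributed.Client.
    future = executor.client.submit(sum, [1, 2, 3])
    print(future.result())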
# Fragment of a benchmark script: `client`, `main`, and the py-spy profiling
# helper `pyspy_on_scheduler` are defined earlier in the file.
import dask
import distributed

# gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_COLLECTABLE | gc.DEBUG_UNCOLLECTABLE)

# print("Enabling GC debug logging on scheduler")
# client.run_on_scheduler(enable_gc_debug)

print("Here we go!")

dask.config.set({
    # This is key---otherwise we're uploading ~300MiB of graph to the scheduler
    "optimization.fuse.active": False,
    # Handle flaky connections to Coiled
    "distributed.comm.retry.count": 5,
})

test_name = "purepy-shuffle-gc-asyncbatchedsend-nohttp"
# Parenthesized multi-context `with` requires Python 3.10+.
with (
    distributed.performance_report(f"results/{test_name}.html"),
    pyspy_on_scheduler(
        f"results/{test_name}.json",
        subprocesses=True,
        idle=True,
        native=True,
    ),
):
    elapsed = main()

print(f"{elapsed:.1f} sec")

client.shutdown()
client.close()
import os

import dask
import numpy as np
from distributed import Client, performance_report

# read, read_xarray, multiscale, ensure_minimum_chunksize, Multiscales, get,
# and get_cluster, plus the settings chunk_locking, name, tpw, and num_workers,
# are imported or defined elsewhere in this script.

levels = list(range(1, 6))
crop = (slice(8192),) * 3


def reducer(v, **kwargs):
    return np.mean(v, dtype='float32', **kwargs)


source_path = '/nrs/flyem/bench/Z0720_07m_BR.n5/render/Sec30/v1_acquire_trimmed_align___20210413_194018/s0'
target_path = '/nrs/flyem/bench/Z0720_07m_BR.n5/test_dask_down/'

store_chunks = read(source_path, storage_options={'normalize_keys': False}).chunks
read_chunks = (1024,) * 3

data = read_xarray(source_path,
                   storage_options={'normalize_keys': False},
                   chunks=read_chunks,
                   name='test_data')[crop]

# Build the multiscale pyramid lazily and keep only the requested levels.
multi = get(levels, multiscale(data, reducer, (2, 2, 2)))
if not chunk_locking:
    for m in multi:
        m.data = ensure_minimum_chunksize(m.data, store_chunks)

multi_store = Multiscales(name, {f's{l}': m for l, m in zip(levels, multi)})

if __name__ == '__main__':
    with get_cluster(threads_per_worker=tpw) as cluster, Client(cluster) as cl:
        print(cl.cluster.dashboard_link)
        cl.cluster.scale(num_workers)
        cl.wait_for_workers(num_workers)
        with performance_report(filename=os.path.join(target_path,
                                                      f'{name}_report.html')):
            store_group, store_arrays, storage_op = multi_store.store(
                target_path, locking=chunk_locking, client=cl, mode='w')
            result = cl.compute(dask.delayed(storage_op), sync=True)