Code example #1
File: basic.py  Project: TomAugspurger/rechunk
import os
import tempfile
from time import perf_counter as tic  # assumed timer helper for the elapsed-time math below

import dask
import dask.array as da
from distributed import Client, performance_report

import rechunk  # project module providing rechunk.rechunk()


def main(args=None):
    args = parse_args(args)  # CLI parsing helper defined elsewhere in the project

    ctx = directory = tempfile.TemporaryDirectory()

    with ctx:
        original = os.path.join(directory.name, args.original)
        split = os.path.join(directory.name, args.split)
        final = os.path.join(directory.name, args.final)

        shape = (args.n_slices,) + args.shape
        chunks = (1,) + args.shape
        a = da.random.random(shape, chunks=chunks)
        a.to_zarr(original, overwrite=True)

        with Client():
            print("rechunking")
            t0 = tic()

            with performance_report():
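                # With no filename argument, performance_report writes
                # "dask-report.html" to the current working directory on exit.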
                rechunk.rechunk(original, split, final, args.split_chunks)
                t1 = tic()

        took = t1 - t0
        gbs = a.nbytes / 1e9 / took
        print(
            f"Rechunked {dask.utils.format_bytes(a.nbytes)} in {took:.2f}s ({gbs:0.2f} GB/s)"
        )
Code example #2
import json
import logging
import os
import time
from datetime import datetime

import fsspec
from distributed import Client, performance_report

# MultiscaleStorageSpec and prepare_multiscale_storage are project-specific
# helpers imported from elsewhere in this codebase.
logger = logging.getLogger(__name__)


def main(config_json: str, dry: bool = False, scheduler: str = ''):

    now_str = datetime.now().strftime('%Y:%m:%d:%H:%M:%S')

    with fsspec.open(config_json) as fh:
        json_blob = json.load(fh)

    spec = MultiscaleStorageSpec(**json_blob)

    os.makedirs(spec.logging_dir, exist_ok=True)

    logger.addHandler(
        logging.FileHandler(filename=os.path.join(
            spec.logging_dir, f'multiscale_generation_{now_str}.log')))
    logger.addHandler(logging.StreamHandler())
    logger.info(f'Loaded MultiscaleStorageSpec: {spec.json(indent=2)}')

    store_op = prepare_multiscale_storage(
        source=spec.source.url,
        source_chunks=spec.source.chunks,
        dest=spec.destination.url,
        dest_chunks=spec.destination.chunks,
        dest_access_mode=spec.destination.access_mode,
        downsampling_method=spec.downsampling_spec.method,
        downsampling_factors=spec.downsampling_spec.factors,
        downsampling_levels=spec.downsampling_spec.levels,
        downsampling_chunks=spec.downsampling_spec.chunks)
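    # store_op is lazy at this point; nothing is written until client.compute()
    # submits it to the cluster below.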

    if spec.cluster_spec.deployment == 'dask_local':
        from distributed import LocalCluster
        clusterClass = LocalCluster
    elif spec.cluster_spec.deployment == 'dask_lsf':
        from dask_jobqueue import LSFCluster
        from functools import partial
        clusterClass = partial(
            LSFCluster,
            ncpus=spec.cluster_spec.worker.num_cores,
            mem=f'{15 * spec.cluster_spec.worker.num_cores}GB',
            processes=spec.cluster_spec.worker.num_cores)
        clusterClass.__name__ = LSFCluster.__name__
    else:
        raise ValueError(
            f'Unsupported cluster deployment: {spec.cluster_spec.deployment}')

    if not dry:
        logger.info(
            f'Creating an instance of {clusterClass.__name__} and scaling to {spec.cluster_spec.worker.num_workers} workers'
        )
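        # performance_report records scheduler activity for everything inside this
        # with-block and writes a standalone HTML report into logging_dir on exit.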
        with clusterClass() as clust, Client(clust) as cl, performance_report(
                os.path.join(spec.logging_dir,
                             f'dask_distributed_perf_report_{now_str}.html')):
            cl.cluster.scale(spec.cluster_spec.worker.num_workers)
            logger.info(f'Cluster dashboard url: {cl.cluster.dashboard_link}')
            logger.info(
                f'Begin saving multiscale data to {spec.destination.url}')
            start = time.time()
            futures = cl.compute(store_op)
            results = cl.gather(futures)
            end = time.time()
            logger.info(f'Done saving multiscale data after {end - start}s')
Code example #3
File: dask.py  Project: tank0226/prefect
    def start(self) -> Iterator[None]:
        """
        Context manager for initializing execution.

        Creates a `dask.distributed.Client`, stores it on `self.client`, and yields.
        """
        if sys.platform != "win32":
            # Fix for https://github.com/dask/distributed/issues/4168
            import multiprocessing.popen_spawn_posix  # noqa
        from distributed import Client, performance_report

        # Only produce a performance report when a report path was configured;
        # otherwise fall back to a no-op context manager.
        performance_report_context = (
            performance_report(self.performance_report_path)
            if self.performance_report_path
            else nullcontext()
        )

        try:
            if self.address is not None:
                self.logger.info(
                    "Connecting to an existing Dask cluster at %s", self.address
                )
                with Client(self.address, **self.client_kwargs) as client:
                    with performance_report_context:
                        self.client = client
                        try:
                            self._pre_start_yield()
                            yield
                        finally:
                            self._post_start_yield()
            else:
                assert callable(self.cluster_class)  # mypy
                assert isinstance(self.cluster_kwargs, dict)  # mypy
                self.logger.info(
                    "Creating a new Dask cluster with `%s.%s`...",
                    self.cluster_class.__module__,
                    self.cluster_class.__qualname__,
                )
                with self.cluster_class(**self.cluster_kwargs) as cluster:
                    if getattr(cluster, "dashboard_link", None):
                        self.logger.info(
                            "The Dask dashboard is available at %s",
                            cluster.dashboard_link,
                        )
                    if self.adapt_kwargs:
                        cluster.adapt(**self.adapt_kwargs)
                    with Client(cluster, **self.client_kwargs) as client:
                        with performance_report_context:
                            self.client = client
                            try:
                                self._pre_start_yield()
                                yield
                            finally:
                                self._post_start_yield()
        finally:
            self.client = None
Code example #4
    # Optional GC debug logging on the scheduler (left disabled for this run):
    # def enable_gc_debug():
    #     gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_COLLECTABLE | gc.DEBUG_UNCOLLECTABLE)
    #
    # print("Enabling GC debug logging on scheduler")
    # client.run_on_scheduler(enable_gc_debug)

    print("Here we go!")

    dask.config.set({
        # This is key---otherwise we're uploading ~300MiB of graph to the scheduler
        "optimization.fuse.active": False,
        # Handle flaky connections to Coiled
        "distributed.comm.retry.count": 5,
    })

    test_name = "purepy-shuffle-gc-asyncbatchedsend-nohttp"
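    # Capture both a Dask performance report (HTML) and a py-spy profile of the
    # scheduler process for the same benchmark run.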
    with (
            distributed.performance_report(f"results/{test_name}.html"),
            pyspy_on_scheduler(
                f"results/{test_name}.json",
                subprocesses=True,
                idle=True,
                native=True,
            ),
    ):
        elapsed = main()
        print(f"{elapsed:.1f} sec")

    client.shutdown()
    client.close()
Code example #5
import os

import dask
import numpy as np
from distributed import Client, performance_report

# The helpers used below (read, read_xarray, multiscale, get, ensure_minimum_chunksize,
# Multiscales, get_cluster) and the configuration values (name, tpw, num_workers,
# chunk_locking) are defined elsewhere in the project.

levels = list(range(1, 6))
crop = (slice(8192),) * 3

def reducer(v, **kwargs):
    return np.mean(v, dtype='float32', **kwargs)

source_path = '/nrs/flyem/bench/Z0720_07m_BR.n5/render/Sec30/v1_acquire_trimmed_align___20210413_194018/s0'
target_path = '/nrs/flyem/bench/Z0720_07m_BR.n5/test_dask_down/'

store_chunks = read(source_path, storage_options={'normalize_keys': False}).chunks
read_chunks = (1024,) * 3

data = read_xarray(source_path, storage_options={'normalize_keys': False}, chunks=read_chunks, name='test_data')[crop]

multi = get(levels, multiscale(data, reducer, (2,2,2)))

if not chunk_locking:
    for m in multi:
        m.data = ensure_minimum_chunksize(m.data, store_chunks)


multi_store = Multiscales(name, {f's{l}': m for l, m in zip(levels, multi)})

if __name__ == '__main__':
    with get_cluster(threads_per_worker=tpw) as cluster, Client(cluster) as cl:
        print(cl.cluster.dashboard_link)
        cl.cluster.scale(num_workers)
        cl.wait_for_workers(num_workers)
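        # Write an HTML performance report alongside the output data so the run
        # can be inspected afterwards.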
        with performance_report(filename=os.path.join(target_path, f'{name}_report.html')):
            store_group, store_arrays, storage_op = multi_store.store(target_path, locking=chunk_locking, client=cl, mode='w')
            result = cl.compute(dask.delayed(storage_op), sync=True)