def create_local_dask_cluster(spare_mem='3Gb', display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster. Automatically detects whether it is running
    on AWS or the NCI.

    Example use:

        import sys
        sys.path.append("../Scripts")
        from dea_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g. '3Gb'
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    """

    if 'AWS_ACCESS_KEY_ID' in os.environ:

        # Close previous client if any
        client = locals().get('client', None)
        if client is not None:
            client.close()
            del client

        # Configure dashboard link to go over proxy
        dask.config.set({"distributed.dashboard.link":
                         os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"})

        # Start up a local cluster
        client = start_local_dask(mem_safety_margin=spare_mem)

        # Configure GDAL for s3 access
        configure_s3_access(aws_unsigned=True, client=client)

    else:

        # Close previous client if any
        client = locals().get('client', None)
        if client is not None:
            client.close()
            del client

        # Start up a local cluster on NCI
        client = start_local_dask(mem_safety_margin=spare_mem)

    # Show the dask cluster settings
    if display_client:
        display(client)

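The AWS-versus-NCI detection above is simply a check for `AWS_ACCESS_KEY_ID` in the environment. A minimal sketch (not part of the original helper) for confirming which branch will run before calling it:

import os

# The helper takes the AWS (proxy + S3) branch when AWS_ACCESS_KEY_ID is set,
# and the plain NCI branch otherwise.
on_aws = 'AWS_ACCESS_KEY_ID' in os.environ
print("AWS branch" if on_aws else "NCI branch")
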
def _init_dask(self) -> Client:
    cfg = self._cfg
    _log = self._log

    nthreads = cfg.threads
    if nthreads <= 0:
        nthreads = get_max_cpu()

    memory_limit: Union[str, int] = cfg.memory_limit
    if memory_limit == "":
        _1G = 1 << 30
        memory_limit = get_max_mem()
        if memory_limit > 2 * _1G:
            # leave at least a gig extra if total mem more than 2G
            memory_limit -= _1G

    client = start_local_dask(
        threads_per_worker=nthreads, processes=False, memory_limit=memory_limit
    )

    aws_unsigned = self._cfg.aws_unsigned
    # Configure S3 access for both the current process (client=None)
    # and the Dask workers (client=client)
    for c in (None, client):
        configure_s3_access(
            aws_unsigned=aws_unsigned, cloud_defaults=True, client=c
        )

    _log.info(f"Started local Dask {client}")

    return client

def save(xx, location, product_name, verbose):
    client = start_local_dask(
        nanny=False,
        n_workers=1,
        threads_per_worker=8,
        mem_safety_margin="0G",
        processes=False,
    )
    gdal_cfg = {"GDAL_CACHEMAX": 8 * (1 << 30)}

    # Configure S3/GDAL for the current process and for the Dask workers
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, **gdal_cfg)
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client, **gdal_cfg)

    rgba = to_rgba(xx.isel(time=0), clamp=(0, 3000))
    rgba = xr_to_mem(rgba, client)

    if verbose:
        print(f"Writing {location}/{product_name}.tif")

    write_cog(
        rgba,
        f"{location}/{product_name}.tif",
        blocksize=1024,
        compress="zstd",
        zstd_level=4,
        overview_levels=[],
        NUM_THREADS="ALL_CPUS",
        BIGTIFF="YES",
        SPARSE_OK=True,
    )

def test_compute_tasks():
    client = start_local_dask(threads_per_worker=1, dashboard_address=None)

    tasks = (dask.delayed(x) for x in range(100))
    xx = [x for x in compute_tasks(tasks, client)]
    assert xx == [x for x in range(100)]

    client.close()
    del client

def test_pmap():
    client = start_local_dask(threads_per_worker=1, dashboard_address=None)

    xx_it = pmap(str, range(101), client=client)
    xx = [x for x in xx_it]
    assert xx == [str(x) for x in range(101)]

    client.close()
    del client

def create_local_dask_cluster(spare_mem='3Gb', aws_unsigned=True, display_client=True, **kwargs):
    """
    Using the datacube utils function 'start_local_dask', generate
    a local dask cluster.

    Example use:

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g. '3Gb'
    aws_unsigned : Bool, optional
        This parameter determines if credentials for S3 access are required and
        passes them on to processing threads, either local or on the dask cluster.
        Set to True if working with publicly available datasets, and False if
        working with private data, i.e. if loading Landsat C2 provisional data,
        set this to aws_unsigned=False.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    **kwargs :
        Additional keyword arguments that will be passed to start_local_dask().
        E.g. n_workers can be set to be greater than 1.
    """

    # Configure dashboard link to go over proxy
    dask.config.set({"distributed.dashboard.link":
                     os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"})

    # Start up a local cluster
    client = start_local_dask(mem_safety_margin=spare_mem, **kwargs)

    # Configure GDAL for s3 access
    configure_s3_access(aws_unsigned=aws_unsigned, client=client)

    # Show the dask cluster settings
    if display_client:
        display(client)

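As noted in the docstring, extra keyword arguments are forwarded to `start_local_dask`. A brief usage sketch (the values shown are illustrative, not recommendations):

# Start a cluster with two worker processes and two threads each,
# keeping 4Gb free for the notebook itself.
create_local_dask_cluster(spare_mem='4Gb', n_workers=2, threads_per_worker=2)
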
def create_local_dask_cluster(workers=1,
                              threads=None,
                              mem_limit=None,
                              spare_mem="20Gb",
                              display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster.

    Parameters
    ----------
    workers : int
        Number of worker processes to launch
    threads : int, optional
        Number of threads per worker, default is as many as there are CPUs
    mem_limit : String, optional
        Maximum memory to use across all workers
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g. '3Gb'
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    """
    if _HAVE_PROXY:
        # Configure dashboard link to go over proxy
        prefix = os.environ.get("JUPYTERHUB_SERVICE_PREFIX", "/")
        dask.config.set(
            {"distributed.dashboard.link": prefix + "proxy/{port}/status"})

    # Start up a local cluster
    client = start_local_dask(
        n_workers=workers,
        threads_per_worker=threads,
        memory_limit=mem_limit,
        mem_safety_margin=spare_mem,
    )

    # Show the dask cluster settings
    if display_client:
        display(client)

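A hedged usage sketch for this variant (the numbers are placeholders, not tuning advice):

# Four worker processes, two threads each, with an explicit 24Gb memory
# cap across the whole cluster.
create_local_dask_cluster(workers=4, threads=2, mem_limit="24Gb")
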
def __init__(
    self,
    config: Optional[FeaturePathConfig] = None,
    geobox_dict: Optional[Dict] = None,
    client: Optional[Client] = None,
):
    self.config = config if config else FeaturePathConfig()
    self.geobox_dict = geobox_dict
    if not client:
        # No client supplied: start a threads-only local cluster using all
        # CPUs and 90% of the available memory
        nthreads = get_max_cpu()
        memory_limit = get_max_mem()
        client = start_local_dask(
            threads_per_worker=nthreads,
            processes=False,
            memory_limit=int(0.9 * memory_limit),
        )
        configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)
    self.client = client
    setup_logging()
    self._log = logging.getLogger(__name__)

def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")

        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)
            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")

            # Aggressively heavy handed, but we get memory leaks otherwise
            client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    # Use a separately named boto3 S3 client for the upload so it does not
    # shadow the Dask client above
    s3 = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=s3,
    )

# %matplotlib inline
from IPython.display import display, Image
import matplotlib.pyplot as plt

plt.rcParams["axes.facecolor"] = "magenta"  # makes transparent pixels obvious
import numpy as np
import xarray as xr

# %%
from dask.distributed import Client, wait as dask_wait
from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access

if False:
    client = start_local_dask(scheduler_port=11311, threads_per_worker=16)
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)
else:
    client = Client("tcp://127.0.0.1:11311")
    client.restart()

client

# %%
from odc.algo import to_rgba
from odc.ui import to_jpeg_data


def mk_roi(y, x, sz=256):
    return np.s_[y : y + sz, x : x + sz]

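The `mk_roi` helper just builds a 2-D slice tuple for cutting a square window out of an image. A small illustration (assumes only numpy, already imported above):

# np.s_[1024:1280, 2048:2304] -- a 256x256 window whose top-left corner
# is at row 1024, column 2048
roi = mk_roi(1024, 2048)
chip = np.zeros((4096, 4096))[roi]
assert chip.shape == (256, 256)
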
def run_gm(cache_file, tasks, dryrun, verbose, threads, x_chunks, y_chunks, overwrite, public, location):
    """
    Run GeoMedian stats

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4          `/` is also accepted
       x+003/y-004/2019--P1Y   is accepted as well
    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step>]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial
    import dask
    import psutil

    from .io import S3COGSink
    from ._gm import gm_input_data, gm_reduce, gm_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # Stop dask from spilling/pausing/terminating workers on memory pressure
    dask.config.set({'distributed.worker.memory.target': False})
    dask.config.set({'distributed.worker.memory.spill': False})
    dask.config.set({'distributed.worker.memory.pause': False})
    dask.config.set({'distributed.worker.memory.terminate': False})

    # config
    resampling = 'bilinear'
    COG_OPTS = dict(
        compress='deflate',
        predict=2,
        zlevel=6,
        blocksize=800,
        ovr_blocksize=256,  # ovr_blocksize must be powers of 2 for some reason in GDAL
        overview_resampling='bilinear')

    ncpus = psutil.cpu_count()
    # ..
    if threads <= 0:
        threads = ncpus

    rdr = TaskReader(cache_file)
    product = gm_product(location=location)

    if verbose:
        print(repr(rdr))

    def _proc(task):
        NY, NX = task.geobox.shape
        ds_in = gm_input_data(task, resampling=resampling, chunk=(NY // y_chunks, NX))
        tdim = list(ds_in.dims)[0]
        ds_in = ds_in.chunk({tdim: -1, 'x': NX // x_chunks})
        ds = gm_reduce(ds_in,
                       num_threads=ncpus // x_chunks + 2,
                       wk_rows=(NY // y_chunks) // 4,
                       as_array=True)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)

        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))

        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]

        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')

        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, _proc, client, sink,
                                check_exists=not overwrite,
                                chunked_persist=x_chunks,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()

def test_start_local_dask_dashboard_link(monkeypatch):
    monkeypatch.setenv('JUPYTERHUB_SERVICE_PREFIX', 'user/test/')
    client = start_local_dask()
    assert client.dashboard_link.startswith('user/test/proxy/')

def create_local_dask_cluster(spare_mem='3Gb',
                              aws_unsigned=True,
                              display_client=True,
                              start_local_dask_kwargs=None,
                              configure_s3_access_kwargs=None):
    """
    Credit belongs to Digital Earth Africa:
    https://github.com/digitalearthafrica/deafrica-sandbox-notebooks/blob/master/Scripts/deafrica_dask.py

    Using the datacube utils function 'start_local_dask', generate
    a local dask cluster.

    Example use:

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g. '3Gb'
    aws_unsigned : Bool, optional
        This parameter determines if credentials for S3 access are required and
        passes them on to processing threads, either local or on the dask cluster.
        Set to True if working with publicly available datasets, and False if
        working with private data, i.e. if loading Landsat C2 provisional data,
        set this to aws_unsigned=False.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    start_local_dask_kwargs : dict, optional
        Keyword arguments for the function `datacube.utils.dask.start_local_dask`,
        which creates the Dask client. Some settings to configure include the
        number of workers, number of threads per worker, and the memory limit.
    configure_s3_access_kwargs : dict, optional
        Keyword arguments for the function `datacube.utils.rio.configure_s3_access`,
        which configures Dask to access S3.
    """
    start_local_dask_kwargs = {} if start_local_dask_kwargs is None else start_local_dask_kwargs
    configure_s3_access_kwargs = {} if configure_s3_access_kwargs is None else configure_s3_access_kwargs

    # Configure dashboard link to go over proxy
    dask.config.set({
        "distributed.dashboard.link":
        os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"
    })

    # Start up a local cluster
    num_physical_cpu = psutil.cpu_count(logical=False)
    num_logical_cpu = psutil.cpu_count(logical=True)
    num_logical_per_physical = num_logical_cpu / num_physical_cpu
    start_local_dask_kwargs.setdefault('n_workers', num_physical_cpu - 1)
    start_local_dask_kwargs.setdefault(
        'threads_per_worker',
        int(num_logical_per_physical * start_local_dask_kwargs['n_workers']))
    client = start_local_dask(mem_safety_margin=spare_mem, **start_local_dask_kwargs)

    # Configure GDAL for s3 access
    configure_s3_access(aws_unsigned=aws_unsigned, client=client, **configure_s3_access_kwargs)

    return client

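Unlike the earlier variants, this version returns the client, so the caller can shut the cluster down explicitly. A hedged usage sketch (the kwargs shown are illustrative only):

client = create_local_dask_cluster(
    spare_mem='4Gb',
    start_local_dask_kwargs={'n_workers': 2, 'threads_per_worker': 4},
)
# ... run dask-backed load()/compute() calls here ...
client.close()
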
def run_pq(cache_file, tasks, dryrun, verbose, threads, overwrite, public, location):
    """
    Run Pixel Quality stats

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4          `/` is also accepted
       x+003/y-004/2019--P1Y   is accepted as well
    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step>]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial

    from .io import S3COGSink
    from ._pq import pq_input_data, pq_reduce, pq_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # config
    resampling = 'nearest'
    COG_OPTS = dict(compress='deflate',
                    predict=2,
                    zlevel=6,
                    blocksize=800)
    # ..
    rdr = TaskReader(cache_file)
    product = pq_product(location=location)

    if verbose:
        print(repr(rdr))

    def pq_proc(task):
        ds_in = pq_input_data(task, resampling=resampling)
        ds = pq_reduce(ds_in)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)

        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))

        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]

        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')

        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, pq_proc, client, sink,
                                check_exists=not overwrite,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()