Example #1
def create_local_dask_cluster(spare_mem='3Gb', display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster. Automatically detects if on AWS or NCI.
    
    Example use :
        
        import sys
        sys.path.append("../Scripts")
        from dea_dask import create_local_dask_cluster
        
        create_local_dask_cluster(spare_mem='4Gb')
    
    Parameters
    ----------  
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster, e.g. '3Gb'.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    
    """

    if 'AWS_ACCESS_KEY_ID' in os.environ:
        # Running on AWS: configure the dashboard link to go over the JupyterHub proxy
        dask.config.set({
            "distributed.dashboard.link":
                os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"
        })

        # Start up a local cluster
        client = start_local_dask(mem_safety_margin=spare_mem)

        # Configure GDAL for S3 access
        configure_s3_access(aws_unsigned=True, client=client)
    else:
        # Start up a local cluster on NCI
        client = start_local_dask(mem_safety_margin=spare_mem)

    # Show the dask cluster settings
    if display_client:
        display(client)
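
This excerpt omits its module-level imports; judging from the other examples in this collection (see Examples #10 and #11), it relies on roughly the following:

import os

import dask
from IPython.display import display

from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access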
Example #2
    def _init_dask(self) -> Client:
        cfg = self._cfg
        _log = self._log

        nthreads = cfg.threads
        if nthreads <= 0:
            nthreads = get_max_cpu()

        memory_limit: Union[str, int] = cfg.memory_limit
        if memory_limit == "":
            _1G = 1 << 30
            memory_limit = get_max_mem()
            if memory_limit > 2 * _1G:
                # leave at least a gig extra if total mem more than 2G
                memory_limit -= _1G

        client = start_local_dask(
            threads_per_worker=nthreads, processes=False, memory_limit=memory_limit
        )
        aws_unsigned = self._cfg.aws_unsigned
        for c in (None, client):
            configure_s3_access(
                aws_unsigned=aws_unsigned, cloud_defaults=True, client=c
            )
        _log.info(f"Started local Dask {client}")

        return client
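
The loop over (None, client) applies the same S3/GDAL settings twice: once in the main process (client=None) and once on every worker of the Dask cluster. A minimal standalone sketch of the same pattern, using only calls that appear elsewhere in these examples:

from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access

client = start_local_dask(threads_per_worker=4, processes=False, memory_limit="4G")

# Configure GDAL/S3 access both locally (client=None) and on the Dask workers.
for c in (None, client):
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=c)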
Example #3
def save(xx, location, product_name, verbose):
    client = start_local_dask(
        nanny=False,
        n_workers=1,
        threads_per_worker=8,
        mem_safety_margin="0G",
        processes=False,
    )

    # 8 GiB of GDAL raster block cache, passed on to configure_s3_access
    # below as an extra GDAL configuration option.
    gdal_cfg = {"GDAL_CACHEMAX": 8 * (1 << 30)}
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, **gdal_cfg)
    configure_s3_access(aws_unsigned=True,
                        cloud_defaults=True,
                        client=client,
                        **gdal_cfg)

    rgba = to_rgba(xx.isel(time=0), clamp=(0, 3000))
    rgba = xr_to_mem(rgba, client)

    if verbose:
        print(f"Writing {location}/{product_name}.tif")

    write_cog(
        rgba,
        f"{location}/{product_name}.tif",
        blocksize=1024,
        compress="zstd",
        zstd_level=4,
        overview_levels=[],
        NUM_THREADS="ALL_CPUS",
        BIGTIFF="YES",
        SPARSE_OK=True,
    )
def test_compute_tasks():
    client = start_local_dask(threads_per_worker=1, dashboard_address=None)

    tasks = (dask.delayed(x) for x in range(100))
    xx = [x for x in compute_tasks(tasks, client)]
    assert xx == [x for x in range(100)]

    client.close()
    del client
def test_pmap():
    client = start_local_dask(threads_per_worker=1, dashboard_address=None)

    xx_it = pmap(str, range(101), client=client)
    xx = [x for x in xx_it]

    assert xx == [str(x) for x in range(101)]

    client.close()
    del client
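
The two tests above use compute_tasks and pmap without showing their imports; assuming they live alongside start_local_dask in datacube.utils.dask (an assumption, since the import is not shown), a minimal standalone version would look roughly like this:

import dask
from datacube.utils.dask import start_local_dask, compute_tasks, pmap  # import location assumed

client = start_local_dask(threads_per_worker=1, dashboard_address=None)

# compute_tasks evaluates delayed objects on the client and yields their results
squares = list(compute_tasks((dask.delayed(i * i) for i in range(10)), client))

# pmap maps a plain function over an iterable using the client
labels = list(pmap(str, range(10), client=client))

client.close()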
def create_local_dask_cluster(spare_mem='3Gb',
                              aws_unsigned=True,
                              display_client=True,
                              **kwargs):
    """
    Using the datacube utils function 'start_local_dask', generate
    a local dask cluster.
    
    Example use :
        
        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster
        
        create_local_dask_cluster(spare_mem='4Gb')
    
    Parameters
    ----------  
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster, e.g. '3Gb'.
    aws_unsigned : Bool, optional
        Determines whether credentials are required for S3 access and passes
        this setting on to the processing threads, whether local or on the
        dask cluster. Set to True when working with publicly available
        datasets, and False when working with private data, e.g. set
        aws_unsigned=False when loading Landsat C2 provisional data.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    **kwargs:
        Additional keyword arguments that will be passed to start_local_dask().
        E.g. n_workers can be set to be greater than 1.
    """

    # configure dashboard link to go over proxy
    dask.config.set({
        "distributed.dashboard.link":
            os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"
    })

    # start up a local cluster  
    client = start_local_dask(mem_safety_margin=spare_mem, **kwargs)

    # Configure GDAL for S3 access
    configure_s3_access(aws_unsigned=aws_unsigned, client=client)

    # Show the dask cluster settings
    if display_client:
        display(client)
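
A usage sketch for the variant above (values are illustrative): the extra n_workers keyword is forwarded to start_local_dask via **kwargs, and the client is displayed rather than returned.

create_local_dask_cluster(spare_mem='4Gb', aws_unsigned=True, n_workers=2)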
Example #7
def create_local_dask_cluster(workers=1,
                              threads=None,
                              mem_limit=None,
                              spare_mem="20Gb",
                              display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster.

    Parameters
    ----------
    workers : int
        Number of worker processes to launch.
    threads : int, optional
        Number of threads per worker; the default is as many as there are CPUs.
    mem_limit : String, optional
        Maximum memory to use across all workers.
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster, e.g. '3Gb'.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    """

    if _HAVE_PROXY:
        # Configure dashboard link to go over proxy
        prefix = os.environ.get("JUPYTERHUB_SERVICE_PREFIX", "/")
        dask.config.set(
            {"distributed.dashboard.link": prefix + "proxy/{port}/status"})

    # Start up a local cluster
    client = start_local_dask(
        n_workers=workers,
        threads_per_worker=threads,
        memory_limit=mem_limit,
        mem_safety_margin=spare_mem,
    )

    # Show the dask cluster settings
    if display_client:
        display(client)
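
A call sketch for this variant (values are illustrative); like the version above, it displays the client rather than returning it:

create_local_dask_cluster(workers=2, threads=4, spare_mem='4Gb')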
Example #8
    def __init__(
        self,
        config: Optional[FeaturePathConfig] = None,
        geobox_dict: Optional[Dict] = None,
        client: Optional[Client] = None,
    ):
        self.config = config if config else FeaturePathConfig()
        self.geobox_dict = geobox_dict
        if not client:
            nthreads = get_max_cpu()
            memory_limit = get_max_mem()
            client = start_local_dask(
                threads_per_worker=nthreads,
                processes=False,
                memory_limit=int(0.9 * memory_limit),
            )
            configure_s3_access(aws_unsigned=True,
                                cloud_defaults=True,
                                client=client)
        self.client = client

        setup_logging()
        self._log = logging.getLogger(__name__)
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str, ...],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = exists and not overwrite
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")

        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = exists and not overwrite

            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
                # Aggressively heavy handed, but we get memory leaks otherwise
                client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    s3 = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=s3,
    )
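
A hypothetical invocation of create_mosaic; every product name, date and bucket path below is an illustrative placeholder, not a value taken from the source:

from datacube import Datacube

dc = Datacube()
create_mosaic(
    dc=dc,
    product="example_input_product",               # placeholder product name
    out_product="example_output_product",          # placeholder output product
    time=("2019-01-01", "2019-12-31"),
    time_str="2019--P1Y",
    bands=("red", "green", "blue"),
    s3_output_root="s3://example-bucket/mosaics",  # placeholder bucket
    split_bands=True,
    resolution=120,
    overwrite=False,
)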
Example #10
# %matplotlib inline
from IPython.display import display, Image

import matplotlib.pyplot as plt

plt.rcParams["axes.facecolor"] = "magenta"  # makes transparent pixels obvious
import numpy as np
import xarray as xr

# %%
from dask.distributed import Client, wait as dask_wait
from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access

# Manual toggle: set to True to start a fresh local cluster on port 11311,
# otherwise connect to a scheduler that is already running there.
if False:
    client = start_local_dask(scheduler_port=11311, threads_per_worker=16)
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)
else:
    client = Client("tcp://127.0.0.1:11311")

client.restart()
client

# %%
from odc.algo import to_rgba
from odc.ui import to_jpeg_data


def mk_roi(y, x, sz=256):
    return np.s_[y : y + sz, x : x + sz]
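
mk_roi simply builds a 2-D slice. A tiny usage sketch, continuing from the numpy import above (the array is a placeholder standing in for, e.g., the output of to_rgba loaded into memory):

rgba_np = np.zeros((1024, 1024, 4), dtype="uint8")  # placeholder RGBA image
chip = rgba_np[mk_roi(256, 512)]                    # 256x256 window at y=256, x=512
assert chip.shape == (256, 256, 4)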
Example #11
def run_gm(cache_file, tasks, dryrun, verbose, threads, x_chunks, y_chunks,
           overwrite, public, location):
    """
    Run Geomedian stats

    A task can be one of 3 things:

    \b
    1. A comma-separated triplet: period,x,y or x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4          `/` is also accepted
       x+003/y-004/2019--P1Y   is accepted as well
    2. A zero-based index
    3. A slice following Python convention <start>:<stop>[:<step>]
        ::10 -- every tenth task: 0, 10, 20, ...
       1::10 -- every tenth task, skipping the first one: 1, 11, 21, ...
        :100 -- the first 100 tasks

    If no tasks are supplied, the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial
    import dask
    import psutil
    from .io import S3COGSink
    from ._gm import gm_input_data, gm_reduce, gm_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # Disable Dask's automatic worker-memory management thresholds
    # (spilling to disk, pausing and terminating workers).
    dask.config.set({
        'distributed.worker.memory.target': False,
        'distributed.worker.memory.spill': False,
        'distributed.worker.memory.pause': False,
        'distributed.worker.memory.terminate': False,
    })

    # config
    resampling = 'bilinear'
    COG_OPTS = dict(
        compress='deflate',
        predict=2,
        zlevel=6,
        blocksize=800,
        ovr_blocksize=256,  # ovr_blocksize must be a power of 2 in GDAL
        overview_resampling='bilinear')
    ncpus = psutil.cpu_count()
    # ..

    if threads <= 0:
        threads = ncpus

    rdr = TaskReader(cache_file)
    product = gm_product(location=location)

    if verbose:
        print(repr(rdr))

    def _proc(task):
        NY, NX = task.geobox.shape

        ds_in = gm_input_data(task,
                              resampling=resampling,
                              chunk=(NY // y_chunks, NX))
        tdim = list(ds_in.dims)[0]
        ds_in = ds_in.chunk({tdim: -1, 'x': NX // x_chunks})
        ds = gm_reduce(ds_in,
                       num_threads=ncpus // x_chunks + 2,
                       wk_rows=(NY // y_chunks) // 4,
                       as_array=True)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)

        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))

        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]

        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")

        return uri

    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

    if verbose:
        print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

    if verbose and sink._creds:
        creds_rw = sink._creds
        print(
            f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}'
        )

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')

        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite),
                      _tasks)
    else:
        results = process_tasks(_tasks,
                                _proc,
                                client,
                                sink,
                                check_exists=not overwrite,
                                chunked_persist=x_chunks,
                                verbose=verbose)
    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()
Example #12
def test_start_local_dask_dashboard_link(monkeypatch):
    monkeypatch.setenv('JUPYTERHUB_SERVICE_PREFIX', 'user/test/')
    client = start_local_dask()
    assert client.dashboard_link.startswith('user/test/proxy/')
    client.close()
Example #13
def create_local_dask_cluster(spare_mem='3Gb',
                              aws_unsigned=True,
                              display_client=True,
                              start_local_dask_kwargs=None,
                              configure_s3_access_kwargs=None):
    """
    Credit belongs to Digital Earth Africa:
    https://github.com/digitalearthafrica/deafrica-sandbox-notebooks/blob/master/Scripts/deafrica_dask.py
    
    Using the datacube utils function 'start_local_dask', generate
    a local dask cluster.
    
    Example use :
        
        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster
        
        create_local_dask_cluster(spare_mem='4Gb')
    
    Parameters
    ----------  
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster, e.g. '3Gb'.
    aws_unsigned : Bool, optional
        Determines whether credentials are required for S3 access and passes
        this setting on to the processing threads, whether local or on the
        dask cluster. Set to True when working with publicly available
        datasets, and False when working with private data, e.g. set
        aws_unsigned=False when loading Landsat C2 provisional data.
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.
    start_local_dask_kwargs: dict, optional
        Keyword arguments for the function `datacube.utils.dask.start_local_dask`, which
        creates the Dask client.
        Some settings to configure include the number of workers, number of threads per worker, and the memory limit.
    configure_s3_access_kwargs: dict, optional
        Keyword arguments for the function `datacube.utils.rio.configure_s3_access`, which
        configures Dask to access S3.
    """
    start_local_dask_kwargs = {} if start_local_dask_kwargs is None else start_local_dask_kwargs
    configure_s3_access_kwargs = {} if configure_s3_access_kwargs is None else configure_s3_access_kwargs

    # configure dashboard link to go over proxy
    dask.config.set({
        "distributed.dashboard.link":
        os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') +
        "proxy/{port}/status"
    })

    # start up a local cluster
    num_physical_cpu = psutil.cpu_count(logical=False)
    num_logical_cpu = psutil.cpu_count(logical=True)
    num_logical_per_physical = num_logical_cpu / num_physical_cpu
    start_local_dask_kwargs.setdefault('n_workers', num_physical_cpu - 1)
    start_local_dask_kwargs.setdefault(
        'threads_per_worker',
        int(num_logical_per_physical * start_local_dask_kwargs['n_workers']))
    client = start_local_dask(mem_safety_margin=spare_mem,
                              **start_local_dask_kwargs)

    # Configure GDAL for S3 access
    configure_s3_access(aws_unsigned=aws_unsigned,
                        client=client,
                        **configure_s3_access_kwargs)

    return client
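
A usage sketch for this variant (values are illustrative): the two kwargs dicts are forwarded to start_local_dask and configure_s3_access respectively, and here the client is returned to the caller.

client = create_local_dask_cluster(
    spare_mem='4Gb',
    aws_unsigned=True,
    start_local_dask_kwargs={'n_workers': 2, 'threads_per_worker': 4},
    configure_s3_access_kwargs={'cloud_defaults': True},
)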
Example #14
def run_pq(cache_file, tasks, dryrun, verbose, threads, overwrite, public,
           location):
    """
    Run Pixel Quality stats

    A task can be one of 3 things:

    \b
    1. A comma-separated triplet: period,x,y or x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4          `/` is also accepted
       x+003/y-004/2019--P1Y   is accepted as well
    2. A zero-based index
    3. A slice following Python convention <start>:<stop>[:<step>]
        ::10 -- every tenth task: 0, 10, 20, ...
       1::10 -- every tenth task, skipping the first one: 1, 11, 21, ...
        :100 -- the first 100 tasks

    If no tasks are supplied, the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial
    from .io import S3COGSink
    from ._pq import pq_input_data, pq_reduce, pq_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # config
    resampling = 'nearest'
    COG_OPTS = dict(compress='deflate', predict=2, zlevel=6, blocksize=800)
    # ..

    rdr = TaskReader(cache_file)
    product = pq_product(location=location)

    if verbose:
        print(repr(rdr))

    def pq_proc(task):
        ds_in = pq_input_data(task, resampling=resampling)
        ds = pq_reduce(ds_in)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)

        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))

        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]

        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")

        return uri

    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

    if verbose:
        print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

    if verbose and sink._creds:
        creds_rw = sink._creds
        print(
            f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}'
        )

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')

        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite),
                      _tasks)
    else:
        results = process_tasks(_tasks,
                                pq_proc,
                                client,
                                sink,
                                check_exists=not overwrite,
                                verbose=verbose)
    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()