Example 1
@gen_cluster(client=True)  # distributed.utils_test fixture: supplies client, scheduler and two workers
def test_async(c, s, a, b):
    x = create_test_data()
    assert not dask.is_dask_collection(x)
    y = x.chunk({'dim2': 4}) + 10
    assert dask.is_dask_collection(y)
    assert dask.is_dask_collection(y.var1)
    assert dask.is_dask_collection(y.var2)

    z = y.persist()
    assert str(z)

    assert dask.is_dask_collection(z)
    assert dask.is_dask_collection(z.var1)
    assert dask.is_dask_collection(z.var2)
    assert len(y.__dask_graph__()) > len(z.__dask_graph__())

    assert not futures_of(y)
    assert futures_of(z)

    future = c.compute(z)
    w = yield future
    assert not dask.is_dask_collection(w)
    assert_allclose(x + 10, w)

    assert s.task_state
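The same lazy-to-concrete round trip can be reproduced without a distributed cluster; a minimal sketch (plain dask and xarray, not part of the test above):

import numpy as np
import xarray as xr
import dask

ds = xr.Dataset({"var1": ("x", np.arange(8.0))})
lazy = ds.chunk({"x": 4}) + 10
assert dask.is_dask_collection(lazy)        # chunked data is dask-backed
result = lazy.compute()
assert not dask.is_dask_collection(result)  # compute() returns numpy-backed data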
Example 2
def test_get_individual_entries_of_matrices_simulation_example():
    """
    Tests that the function returns a dask task and that the computed task
    returns the expected tuple while using the simulation
    """
    task = get_individual_entries_of_matrices(
        lambda_2=2,
        lambda_1_1=0.1,
        lambda_1_2=0.5,
        mu_1=2,
        mu_2=1,
        num_of_servers_1=2,
        num_of_servers_2=2,
        threshold_1=3,
        threshold_2=5,
        system_capacity_1=4,
        system_capacity_2=6,
        buffer_capacity_1=2,
        buffer_capacity_2=2,
        target=2,
        alpha=0.5,
        use_simulation=True,
        runtime=300,
        num_of_trials=3,
        warm_up_time=5,
        seed_num_1=0,
        seed_num_2=0,
    )

    assert da.is_dask_collection(task)
    values = da.compute(task)
    assert np.allclose(
        values,
        ((3, 5, 0.7613063676529543, -0.0006520260736895711,
          -0.027937014444158834), ),
    )
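For reference, the dask pattern these assertions exercise can be shown with a stand-in function; a minimal sketch that assumes nothing beyond dask itself (the real get_individual_entries_of_matrices is not used here):

import dask

@dask.delayed
def entry(threshold_1, threshold_2):
    # hypothetical stand-in for get_individual_entries_of_matrices
    return (threshold_1, threshold_2, 0.0)

task = entry(3, 5)
assert dask.is_dask_collection(task)   # a Delayed object is a dask collection
print(dask.compute(task))              # ((3, 5, 0.0),)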
Example 3
    def merge(
            self,
            other,
            on=None,
            how="left",
            left_index=False,
            right_index=False,
            suffixes=("_x", "_y"),
    ):
        """Merging two dataframes on the column(s) indicated in *on*.
        """
        if (left_index or right_index or not dask.is_dask_collection(other)
                or self.npartitions == 1 and how in ("inner", "right")
                or other.npartitions == 1 and how in ("inner", "left")):
            return dd.merge(
                self,
                other,
                how=how,
                suffixes=suffixes,
                left_index=left_index,
                right_index=right_index,
            )

        if not on and not left_index and not right_index:
            on = [c for c in self.columns if c in other.columns]
            if not on:
                left_index = right_index = True

        return join_impl.join_frames(
            left=self,
            right=other,
            on=on,
            how=how,
            lsuffix=suffixes[0],
            rsuffix=suffixes[1],
        )
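The first branch above falls back to dd.merge whenever the right-hand frame is not a dask collection; a small illustration of that dispatch with plain dask.dataframe and pandas (no cudf assumed):

import pandas as pd
import dask
import dask.dataframe as dd

left = dd.from_pandas(pd.DataFrame({"key": [1, 2, 3], "a": [10, 20, 30]}), npartitions=2)
right = pd.DataFrame({"key": [2, 3, 4], "b": [200, 300, 400]})

assert not dask.is_dask_collection(right)            # plain pandas frame
merged = dd.merge(left, right, on="key", how="left")
print(merged.compute())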
Example 4
def gap_fill(x: xr.DataArray, fallback: xr.DataArray, nodata=None, attrs=None):
    """Fill missing values in `x` with values from `fallback`.

    `x` and `fallback` are expected to be xarray.DataArray objects with identical shape and dtype.

        out[pix] = x[pix] if x[pix] != x.nodata else fallback[pix]
    """

    if nodata is None:
        nodata = getattr(x, "nodata", None)

    if nodata is None:
        nodata = default_nodata(x.dtype)
    else:
        nodata = x.dtype.type(nodata)

    if attrs is None:
        attrs = x.attrs.copy()

    if dask.is_dask_collection(x):
        data = da.map_blocks(
            _gap_fill_np,
            x.data,
            fallback.data,
            nodata,
            name=randomize("gap_fill"),
            dtype=x.dtype,
        )
    else:
        data = _gap_fill_np(x.data, fallback.data, nodata)

    return xr.DataArray(data,
                        attrs=attrs,
                        dims=x.dims,
                        coords=x.coords,
                        name=x.name)
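A hypothetical call sketch, assuming the module's helpers (_gap_fill_np, default_nodata, randomize) are importable alongside gap_fill; the values are illustrative only:

import numpy as np
import xarray as xr

primary = xr.DataArray(np.array([[1, -999], [3, -999]], dtype="int16"),
                       dims=("y", "x"), attrs={"nodata": -999})
backup = xr.DataArray(np.array([[9, 7], [8, 6]], dtype="int16"),
                      dims=("y", "x"), attrs={"nodata": -999})

filled = gap_fill(primary, backup)   # nodata pixels in `primary` are taken from `backup`
# expected result: [[1, 7], [3, 6]], with `primary`'s attrs and dims preserved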
Example 5
def convert_dask_collection(dc):
    """
    Convert dask collection object into mars.core.Object via remote API

    Parameters
    ----------
    dc: dask collection
        Dask collection object to be converted.

    Returns
    -------
    Object
        Mars Object.
    """
    if not is_dask_collection(dc):
        raise TypeError(
            f"'{type(dc).__name__}' object is not a valid dask collection")

    dc.__dask_graph__().validate()
    dsk = optimize(dc)[0].__dask_graph__()

    first_key = next(iter(dsk.keys()))
    if isinstance(first_key, str):
        key = [first_key]
    elif isinstance(first_key, tuple):
        key = sorted([i for i in dsk.keys() if i[0] == first_key[0]],
                     key=lambda x: x[1])
    else:
        raise ValueError(
            f"Dask collection object seems be broken, with unexpected key type:'{type(first_key).__name__}'"
        )
    res = reduce(mars_dask_get(dsk, [key]))
    if isinstance(dc, Bag):
        return spawn(lambda x: list(x[0][0]), args=(res, ))
    else:
        return res
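A hypothetical usage sketch (it assumes a Mars session has already been created, e.g. with mars.new_session(), which is not shown here):

import dask.bag as db

bag = db.from_sequence(range(10)).map(lambda x: x * 2)
mars_obj = convert_dask_collection(bag)   # mars.core.Object wrapping the computed result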
Example 6
def xr_geomedian_tmad(ds, axis='time', where=None, **kw):
    """
    :param ds: xr.Dataset|xr.DataArray|numpy array
    Other parameters:
    **kw -- passed on to hdstats.nangeomedian_pcm
       maxiters   : int         1000
       eps        : float       0.0001
       num_threads: int| None   None
    """

    import hdstats
    def gm_tmad(arr, **kw):
        """
        arr: a high dimensional numpy array where the last dimension will be reduced. 
    
        returns: a numpy array with one less dimension than input.
        """
        gm = hdstats.nangeomedian_pcm(arr, **kw)
        nt = kw.pop('num_threads', None)
        emad = hdstats.emad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        smad = hdstats.smad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        bcmad = hdstats.bcmad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        return np.concatenate([gm, emad, smad, bcmad], axis=-1)


    def norm_input(ds, axis):
        if isinstance(ds, xr.DataArray):
            xx = ds
            if len(xx.dims) != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            if axis is not None and xx.dims[3] != axis:
                raise ValueError(f"Can only reduce last dimension, expect: y,x,band,{axis}")
            return None, xx, xx.data
        elif isinstance(ds, xr.Dataset):
            xx = reshape_for_geomedian(ds, axis)
            return ds, xx, xx.data
        else:  # assume numpy or similar
            xx_data = ds
            if xx_data.ndim != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            return None, None, xx_data

    kw.setdefault('nocheck', False)
    kw.setdefault('num_threads', 1)
    kw.setdefault('eps', 1e-6)

    ds, xx, xx_data = norm_input(ds, axis)
    is_dask = dask.is_dask_collection(xx_data)

    if where is not None:
        if is_dask:
            raise NotImplementedError("Dask version doesn't support output masking currently")

        if where.shape != xx_data.shape[:2]:
            raise ValueError("Shape for `where` parameter doesn't match")
        set_nan = ~where
    else:
        set_nan = None

    if is_dask:
        if xx_data.shape[-2:] != xx_data.chunksize[-2:]:
            xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1))

        data = da.map_blocks(lambda x: gm_tmad(x, **kw),
                             xx_data,
                             name=randomize('geomedian'),
                             dtype=xx_data.dtype, 
                             chunks=xx_data.chunks[:-2] + (xx_data.chunks[-2][0]+3,),
                             drop_axis=3)
    else:
        data = gm_tmad(xx_data, **kw)

    if set_nan is not None:
        data[set_nan, :] = np.nan

    if xx is None:
        return data

    dims = xx.dims[:-1]
    cc = {k: xx.coords[k] for k in dims}
    cc[dims[-1]] = np.hstack([xx.coords[dims[-1]].values,['edev', 'sdev', 'bcdev']])
    xx_out = xr.DataArray(data, dims=dims, coords=cc)

    if ds is None:
        xx_out.attrs.update(xx.attrs)
        return xx_out

    ds_out = xx_out.to_dataset(dim='band')
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return assign_crs(ds_out, crs=ds.geobox.crs)
Example 7
def geomedian_with_mads(
    src: Union[xr.Dataset, xr.DataArray],
    compute_mads: bool = True,
    compute_count: bool = True,
    out_chunks: Optional[Tuple[int, int, int]] = None,
    reshape_strategy: str = "mem",
    scale: float = 1.0,
    offset: float = 0.0,
    eps: Optional[float] = None,
    maxiters: int = 1000,
    num_threads: int = 1,
    **kw,
) -> xr.Dataset:
    """
    Compute Geomedian on Dask backed Dataset.

    NOTE: Default configuration of this code assumes that entire input can be
    loaded in to RAM on the Dask worker. It also assumes that there is only one
    worker in the cluster, or that entire task will get scheduled on one single
    worker only. See ``reshape_strategy`` parameter.

    :param src: xr.Dataset or a single array in YXBT order, bands can be either
                float or integer with `nodata` values to indicate gaps in data.

    :param compute_mads: Whether to compute smad,emad,bcmad statistics

    :param compute_count: Whether to compute count statistic (number of
                          contributing observations per output pixels)

    :param out_chunks: Advanced option, allows to rechunk output internally,
                       order is ``(ny, nx, nband)``

    :param reshape_strategy: One of ``mem`` (default) or ``yxbt``. This is only
    applicable when supplying Dataset object. It controls how Dataset is
    reshaped into DataArray in the format expected by Geomedian code. If you
    have enough RAM and use single-worker Dask cluster, then use ``mem``, it
    should be the most efficient. If there is not enough RAM to load entire
    input you can try ``yxbt`` mode, but you might still run out of RAM anyway.
    If using multi-worker Dask cluster you have to use ``yxbt`` strategy.

    :param scale, offset: Only used when input contains integer values, actual
                          Geomedian will run on scaled values
                          ``scale*X+offset``. Only affects internal
                          computation, final result is scaled back to the
                          original value range.

    :param eps: Termination criteria passed on to geomedian algorithm

    :param maxiters: Maximum number of iterations done per output pixel

    :param num_threads: Configure internal concurrency of the Geomedian
                        computation. Default is 1 as we assume that Dask will
                        run a bunch of those concurrently.

    :param work_chunks: Default is ``(100, 100)``, only applicable when input
                        is Dataset.
    """
    if not dask.is_dask_collection(src):
        raise ValueError("This method only works on Dask inputs")

    if isinstance(src, xr.DataArray):
        yxbt = src
    else:
        # TODO: better automatic defaults for work_chunks
        ny, nx = kw.get("work_chunks", (100, 100))
        if reshape_strategy == "mem":
            yxbt = yxbt_sink(src, (ny, nx, -1, -1))
        elif reshape_strategy == "yxbt":
            yxbt = reshape_yxbt(src, yx_chunks=(ny, nx))
        else:
            raise ValueError(
                f"Reshape strategy '{reshape_strategy}' not understood use one of: mem or yxbt"
            )

    ny, nx, nb, nt = yxbt.shape
    nodata = yxbt.attrs.get("nodata", None)
    assert yxbt.chunks is not None
    if yxbt.data.numblocks[2:4] != (1, 1):
        raise ValueError(
            "There should be one dask block along time and band dimension")

    n_extras = (3 if compute_mads else 0) + (1 if compute_count else 0)
    chunks = (*yxbt.chunks[:2], (nb + n_extras, ))

    is_float = yxbt.dtype.kind == "f"

    if eps is None:
        eps = 1e-4 if is_float else 0.1 * scale

    op = functools.partial(
        _gm_mads_compute_f32,
        compute_mads=compute_mads,
        compute_count=compute_count,
        nodata=nodata,
        scale=scale,
        offset=offset,
        eps=eps,
        maxiters=maxiters,
        num_threads=num_threads,
    )

    _gm = da.map_blocks(op,
                        yxbt.data,
                        dtype="float32",
                        drop_axis=3,
                        chunks=chunks,
                        name="geomedian")
    if out_chunks is not None:
        _gm = _gm.rechunk(out_chunks)

    gm_data = _gm[:, :, :nb]
    if not is_float:
        gm_data = da.map_blocks(
            lambda x: from_float_np(
                x, yxbt.dtype, nodata, scale=1 / scale, offset=offset / scale),
            gm_data,
            dtype=yxbt.dtype,
        )

    dims = yxbt.dims[:3]
    coords = {k: yxbt.coords[k] for k in dims}
    result = xr.DataArray(data=gm_data,
                          dims=dims,
                          coords=coords,
                          attrs=yxbt.attrs).to_dataset("band")

    for dv in result.data_vars.values():
        dv.attrs.update(yxbt.attrs)

    next_stat = nb
    if compute_mads:
        smad = _gm[:, :, next_stat + 0]
        emad = _gm[:, :, next_stat + 1]
        bcmad = _gm[:, :, next_stat + 2]
        next_stat += 3

        if not is_float:
            emad = emad * (1 / scale)

        result["smad"] = xr.DataArray(data=smad,
                                      dims=dims[:2],
                                      coords=result.coords)
        result["emad"] = xr.DataArray(data=emad,
                                      dims=dims[:2],
                                      coords=result.coords)
        result["bcmad"] = xr.DataArray(data=bcmad,
                                       dims=dims[:2],
                                       coords=result.coords)

    if compute_count:
        count = _gm[:, :, next_stat].astype("uint16")
        next_stat += 1
        result["count"] = xr.DataArray(data=count,
                                       dims=dims[:2],
                                       coords=result.coords)

    return result
Example 8
def to_raster(data,
              filename,
              readxsize=None,
              readysize=None,
              use_dask_store=False,
              separate=False,
              out_block_type='zarr',
              keep_blocks=False,
              verbose=0,
              overwrite=False,
              gdal_cache=512,
              scheduler='mpool',
              n_jobs=1,
              n_workers=None,
              n_threads=None,
              n_chunks=None,
              overviews=False,
              resampling='nearest',
              use_client=False,
              address=None,
              total_memory=48,
              **kwargs):
    """
    Writes a ``dask`` array to a raster file

    Args:
        data (DataArray): The ``xarray.DataArray`` to write.
        filename (str): The output file name to write to.
        readxsize (Optional[int]): The size of column chunks to read. If not given, ``readxsize`` defaults to Dask
            chunk size.
        readysize (Optional[int]): The size of row chunks to read. If not given, ``readysize`` defaults to Dask
            chunk size.
        separate (Optional[bool]): Whether to write blocks as separate files. Otherwise, write to a single file.
        use_dask_store (Optional[bool]): Whether to use ``dask.array.store`` to save with Dask task graphs.
        out_block_type (Optional[str]): The output block type. Choices are ['gtiff', 'zarr'].
            Only used if ``separate`` = ``True``.
        keep_blocks (Optional[bool]): Whether to keep the blocks stored on disk. Only used if ``separate`` = ``True``.
        verbose (Optional[int]): The verbosity level.
        overwrite (Optional[bool]): Whether to overwrite an existing file.
        gdal_cache (Optional[int]): The ``GDAL`` cache size (in MB).
        scheduler (Optional[str]): The ``concurrent.futures`` scheduler to use. Choices are ['processes', 'threads', 'mpool'].

            mpool: process pool of workers using ``multiprocessing.Pool``
            processes: process pool of workers using ``concurrent.futures``
            threads: thread pool of workers using ``concurrent.futures``

        n_jobs (Optional[int]): The total number of parallel jobs.
        n_workers (Optional[int]): The number of processes.
        n_threads (Optional[int]): The number of threads.
        n_chunks (Optional[int]): The chunk size of windows. If not given, equal to ``n_workers`` x 50.
        overviews (Optional[bool or list]): Whether to build overview layers.
        resampling (Optional[str]): The resampling method for overviews when ``overviews`` is ``True`` or a ``list``.
            Choices are ['average', 'bilinear', 'cubic', 'cubic_spline', 'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest'].
        use_client (Optional[bool]): Whether to use a ``dask`` client.
        address (Optional[str]): A cluster address to pass to client. Only used when ``use_client`` = ``True``.
        total_memory (Optional[int]): The total memory (in GB) required when ``use_client`` = ``True``.
        kwargs (Optional[dict]): Additional keyword arguments to pass to ``rasterio.write``.

    Returns:
        ``dask.delayed`` object

    Examples:
        >>> import geowombat as gw
        >>>
        >>> # Use 8 parallel workers
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_jobs=8)
        >>>
        >>> # Use 4 process workers and 2 thread workers
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2)
        >>>
        >>> # Control the window chunks passed to concurrent.futures
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_workers=4, n_threads=2, n_chunks=16)
        >>>
        >>> # Compress the output and build overviews
        >>> with gw.open('input.tif') as ds:
        >>>     gw.to_raster(ds, 'output.tif', n_jobs=8, overviews=True, compress='lzw')
    """

    if MKL_LIB:
        __ = MKL_LIB.MKL_Set_Num_Threads(n_threads)

    pfile = Path(filename)

    if scheduler.lower() == 'mpool':
        pool_executor = multi.Pool
    else:
        pool_executor = (concurrent.futures.ProcessPoolExecutor
                         if scheduler.lower() == 'processes'
                         else concurrent.futures.ThreadPoolExecutor)

    if overwrite:

        if pfile.is_file():
            pfile.unlink()

    if pfile.is_file():
        logger.warning('  The output file already exists.')
        return

    if not is_dask_collection(data.data):
        logger.exception('  The data should be a dask array.')

    if use_client:

        if address:
            cluster_object = _cluster_dummy
        else:
            cluster_object = LocalCluster

        client_object = Client

    else:

        cluster_object = _cluster_dummy
        client_object = _client_dummy

    if isinstance(n_workers, int) and isinstance(n_threads, int):
        n_jobs = n_workers * n_threads
    else:

        n_workers = n_jobs
        n_threads = 1

    mem_per_core = int(total_memory / n_workers)

    if not isinstance(n_chunks, int):
        n_chunks = n_workers * 50

    if not isinstance(readxsize, int):
        readxsize = data.gw.col_chunks

    if not isinstance(readysize, int):
        readysize = data.gw.row_chunks

    chunksize = (data.gw.row_chunks, data.gw.col_chunks)

    # Force tiled outputs with no file sharing
    kwargs['sharing'] = False

    if data.gw.tiled:
        kwargs['tiled'] = True

    if 'compress' in kwargs:

        # Store the compression type because
        #   it is removed in concurrent writing
        compress = True
        compress_type = kwargs['compress']
        del kwargs['compress']

    elif isinstance(data.gw.compress,
                    str) and data.gw.compress.lower() in ['lzw', 'deflate']:

        compress = True
        compress_type = data.gw.compress

    else:
        compress = False

    if 'nodata' not in kwargs:

        if isinstance(data.gw.nodata, int) or isinstance(
                data.gw.nodata, float):
            kwargs['nodata'] = data.gw.nodata

    if 'blockxsize' not in kwargs:
        kwargs['blockxsize'] = data.gw.col_chunks

    if 'blockysize' not in kwargs:
        kwargs['blockysize'] = data.gw.row_chunks

    if 'bigtiff' not in kwargs:
        kwargs['bigtiff'] = data.gw.bigtiff

    if 'driver' not in kwargs:
        kwargs['driver'] = data.gw.driver

    if 'count' not in kwargs:
        kwargs['count'] = data.gw.nbands

    if 'width' not in kwargs:
        kwargs['width'] = data.gw.ncols

    if 'height' not in kwargs:
        kwargs['height'] = data.gw.nrows

    if separate:

        d_name = pfile.parent
        sub_dir = d_name.joinpath('sub_tmp_')
        zarr_file = sub_dir.joinpath('data.zarr').as_posix()

        sub_dir.mkdir(parents=True, exist_ok=True)

        root = zarr.open(zarr_file, mode='w')

    else:

        root = None

        if verbose > 0:
            logger.info('  Creating the file ...\n')

        with rio.open(filename, mode='w', **kwargs) as rio_dst:
            pass

    if verbose > 0:
        logger.info('  Writing data to file ...\n')

    with rio.Env(GDAL_CACHEMAX=gdal_cache):

        if not use_dask_store:

            windows = get_window_offsets(data.gw.nrows,
                                         data.gw.ncols,
                                         readysize,
                                         readxsize,
                                         return_as='list')

            n_windows = len(windows)

            # Iterate over the windows in chunks
            for wchunk in range(0, n_windows, n_chunks):

                window_slice = windows[wchunk:wchunk + n_chunks]
                n_windows_slice = len(window_slice)

                if verbose > 0:

                    logger.info('  Windows {:,d}--{:,d} of {:,d} ...'.format(
                        wchunk + 1, wchunk + n_windows_slice, n_windows))

                if len(data.shape) == 2:
                    data_gen = ((data[w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)
                elif len(data.shape) == 3:
                    data_gen = ((data[:, w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)
                else:
                    data_gen = ((data[:, :, w.row_off:w.row_off + w.height,
                                      w.col_off:w.col_off + w.width], filename,
                                 w, n_threads, separate, chunksize, root)
                                for w in window_slice)

                with pool_executor(n_workers) as executor:

                    if scheduler == 'mpool':

                        for zarr_file in tqdm(executor.imap_unordered(
                                _write_xarray, data_gen),
                                              total=n_windows_slice):
                            pass

                    else:

                        for zarr_file in tqdm(executor.map(
                                _write_xarray, data_gen),
                                              total=n_windows_slice):
                            pass

            # if overviews:
            #
            #     if not isinstance(overviews, list):
            #         overviews = [2, 4, 8, 16]
            #
            #     if resampling not in ['average', 'bilinear', 'cubic', 'cubic_spline',
            #                           'gauss', 'lanczos', 'max', 'med', 'min', 'mode', 'nearest']:
            #
            #         logger.warning("  The resampling method is not supported by rasterio. Setting to 'nearest'")
            #
            #         resampling = 'nearest'
            #
            #     if verbose > 0:
            #         logger.info('  Building pyramid overviews ...')
            #
            #     rio_dst.build_overviews(overviews, getattr(Resampling, resampling))
            #     rio_dst.update_tags(ns='overviews', resampling=resampling)

        else:

            with cluster_object(
                    n_workers=n_workers,
                    threads_per_worker=n_threads,
                    scheduler_port=0,
                    processes=False,
                    memory_limit='{:d}GB'.format(mem_per_core)) as cluster:

                cluster_address = address if address else cluster

                with client_object(address=cluster_address) as client:

                    with WriteDaskArray(filename,
                                        overwrite=overwrite,
                                        separate=separate,
                                        out_block_type=out_block_type,
                                        keep_blocks=keep_blocks,
                                        gdal_cache=gdal_cache,
                                        **kwargs) as dst:

                        # Store the data and return a lazy evaluator
                        res = da.store(da.squeeze(data.data),
                                       dst,
                                       lock=False,
                                       compute=False)

                        if verbose > 0:
                            logger.info('  Writing data to file ...')

                        # Send the data to file
                        #
                        # *Note that the progress bar will
                        #   not work with a client.
                        if use_client:
                            res.compute(num_workers=n_jobs)
                        else:

                            with ProgressBar():
                                res.compute(num_workers=n_jobs)

                        if verbose > 0:
                            logger.info('  Finished writing data to file.')

                        out_block_type = dst.out_block_type
                        keep_blocks = dst.keep_blocks
                        zarr_file = dst.zarr_file
                        sub_dir = dst.sub_dir

        if compress:

            if verbose > 0:
                logger.info('  Compressing output file ...')

            if separate:

                group_keys = list(root.group_keys())
                n_groups = len(group_keys)

                if out_block_type.lower() == 'zarr':
                    # root = zarr.open(zarr_file, mode='r')
                    open_file = zarr_file
                else:

                    outfiles = sorted(
                        fnmatch.filter(os.listdir(sub_dir), '*.tif'))
                    outfiles = [os.path.join(sub_dir, fn) for fn in outfiles]

                    # data_gen = ((fn, None, 'gtiff') for fn in outfiles)

                kwargs['compress'] = compress_type

                n_windows = len(group_keys)

                # Compress into one file
                with rio.open(filename, mode='w', **kwargs) as dst_:

                    # Iterate over the windows in chunks
                    for wchunk in range(0, n_groups, n_chunks):

                        group_keys_slice = group_keys[wchunk:wchunk + n_chunks]
                        n_windows_slice = len(group_keys_slice)

                        if verbose > 0:

                            logger.info(
                                '  Windows {:,d}--{:,d} of {:,d} ...'.format(
                                    wchunk + 1, wchunk + n_windows_slice,
                                    n_windows))

                        ################################################
                        data_gen = ((open_file, group, 'zarr')
                                    for group in group_keys_slice)

                        # for f in tqdm(executor.map(_compressor, data_gen), total=n_windows_slice):
                        #     pass
                        #
                        # futures = [executor.submit(_compress_dummy, iter_[0], iter_[1], None) for iter_ in data_gen]
                        #
                        # for f in tqdm(concurrent.futures.as_completed(futures), total=n_windows_slice):
                        #
                        #     out_window, out_block = f.result()
                        #
                        #     dst_.write(np.squeeze(out_block),
                        #                window=out_window,
                        #                indexes=out_indexes_)
                        ################################################

                        # data_gen = ((root, group, 'zarr') for group in group_keys_slice)

                        # for f, g, t in tqdm(data_gen, total=n_windows_slice):
                        #
                        #     out_window, out_indexes, out_block = _block_read_func(f, g, t)

                        # executor.map(_block_write_func, data_gen)

                        with concurrent.futures.ProcessPoolExecutor(
                                max_workers=n_workers) as executor:

                            # Submit all of the tasks as futures
                            futures = [
                                executor.submit(_block_read_func, f, g, t)
                                for f, g, t in data_gen
                            ]

                            for f in tqdm(
                                    concurrent.futures.as_completed(futures),
                                    total=n_windows_slice):

                                out_window, out_indexes, out_block = f.result()

                                dst_.write(out_block,
                                           window=out_window,
                                           indexes=out_indexes)

                        futures = None

                if not keep_blocks:
                    shutil.rmtree(sub_dir)

            else:

                p = Path(filename)

                d_name = p.parent
                f_base, f_ext = os.path.splitext(p.name)

                ld = string.ascii_letters + string.digits
                rstr = ''.join(random.choice(ld) for i in range(0, 9))

                temp_file = d_name.joinpath('{}_temp_{}{}'.format(
                    f_base, rstr, f_ext))

                compress_raster(filename,
                                temp_file.as_posix(),
                                n_jobs=n_jobs,
                                gdal_cache=gdal_cache,
                                compress=compress_type)

                temp_file.rename(filename)

            if verbose > 0:
                logger.info('  Finished compressing')

    if verbose > 0:
        logger.info('\nFinished writing the data.')
Example 9
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result
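The compute() branch exists because, for dask-backed input, even the sample count is lazy; a quick illustration with plain dask.dataframe:

import pandas as pd
import dask
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
n_rows = ddf.shape[0]                   # a lazy dask Scalar, not an int
assert dask.is_dask_collection(n_rows)
assert n_rows.compute() == 10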
Example 10
 def fit(self, X, y=None):
     if not dask.is_dask_collection(X):
         raise TypeError(_TYPE_MSG.format(type(X)))
     self._fit(X)
     return self
Example 11
def _train(
    client,
    params,
    data,
    labels,
    dmatrix_kwargs={},
    evals_result=None,
    **kwargs
):
    """
    Asynchronous version of train

    See Also
    --------
    train
    """
    # Break apart Dask.array/dataframe into chunks/parts
    data_parts = data.to_delayed()
    label_parts = labels.to_delayed()
    if isinstance(data_parts, np.ndarray):
        assert data_parts.shape[1] == 1
        data_parts = data_parts.flatten().tolist()
    if isinstance(label_parts, np.ndarray):
        assert label_parts.ndim == 1 or label_parts.shape[1] == 1
        label_parts = label_parts.flatten().tolist()

    # Arrange parts into pairs.  This enforces co-locality
    parts = list(map(delayed, zip(data_parts, label_parts)))
    parts = client.compute(parts)  # Start computation in the background
    yield wait(parts)

    for part in parts:
        if part.status == "error":
            yield part  # trigger error locally

    if kwargs.get("eval_set"):
        if any(
            is_dask_collection(e)
            for evals in kwargs.get("eval_set")
            for e in evals
        ):
            raise TypeError(
                "Evaluation set must not contain dask collections."
            )

    # Because XGBoost-python doesn't yet allow iterative training, we need to
    # find the locations of all chunks and map them to particular Dask workers
    key_to_part_dict = dict([(part.key, part) for part in parts])
    who_has = yield client.scheduler.who_has(
        keys=[part.key for part in parts]
    )
    worker_map = defaultdict(list)
    for key, workers in who_has.items():
        worker_map[first(workers)].append(key_to_part_dict[key])

    ncores = yield client.scheduler.ncores()  # Number of cores per worker

    # Start the XGBoost tracker on the Dask scheduler
    env = yield client._run_on_scheduler(start_tracker,
                                         None,
                                         len(worker_map))

    # Tell each worker to train on the chunks/parts that it has locally
    futures = [
        client.submit(
            train_part,
            env,
            assoc(params, "nthread", ncores[worker]),
            list_of_parts,
            workers=worker,
            dmatrix_kwargs=dmatrix_kwargs,
            **kwargs
        )
        for worker, list_of_parts in worker_map.items()
    ]

    # Get the results, only one will be non-None
    results = yield client._gather(futures)
    result, _evals_result = [v for v in results if v.count(None) != len(v)][0]

    if evals_result is not None:
        evals_result.update(_evals_result)

    num_class = params.get("num_class")
    if num_class:
        result.set_attr(num_class=str(num_class))
    raise gen.Return(result)
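The to_delayed() calls above turn each chunk of a dask collection into a separate Delayed object that can be shipped to a worker; a small look at that step with plain dask.array and no XGBoost involved:

import dask.array as da

x = da.ones((6, 2), chunks=(3, 2))
parts = x.to_delayed()          # numpy object array of Delayed, one per block
print(parts.shape)              # (2, 1)
print(parts.flatten().tolist())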
Example 12
 def __dask_layers__(self):
     return sum([
         v.__dask_layers__()
         for v in self._data_vars.values() if dask.is_dask_collection(v)
     ], ())
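For context, every dask collection exposes its graph layer names through the same hook this method concatenates across data variables; with a plain dask array, for example:

import dask.array as da

x = da.ones((4, 4), chunks=2)
print(x.__dask_layers__())   # a one-element tuple holding the collection's layer name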
Example 13
def _load_into_memory(res):
    """Compute if res is lazy data."""
    if dask.is_dask_collection(res):
        res = res.compute()
    return res
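Using the helper defined above: lazy input is computed, concrete input passes through unchanged.

import numpy as np
import dask.array as da

print(type(_load_into_memory(da.arange(5))))   # numpy.ndarray (computed)
print(type(_load_into_memory(np.arange(5))))   # numpy.ndarray (returned as-is)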
Example 14
 def test_df_inverse_transform(self):
     mask = ["3", "4"]
     a = dpp.MinMaxScaler(columns=mask)
     result = a.inverse_transform(a.fit_transform(df2))
     assert dask.is_dask_collection(result)
     assert_eq_df(result, df2)
Example 15
def xr_reproject_array(
    src: xr.DataArray,
    geobox: GeoBox,
    resampling: str = "nearest",
    chunks: Optional[Tuple[int, int]] = None,
    dst_nodata: Optional[NodataType] = None,
) -> xr.DataArray:
    """
    Reproject DataArray to a given GeoBox

    :param src       : Input src[(time,) y,x (, band)]
    :param geobox    : GeoBox of the destination
    :param resampling: Resampling strategy as a string: nearest, bilinear, average, mode ...
    :param chunks    : In Y,X dimensions only, default is to use input chunk size
    :param dst_nodata: nodata marker for dst image (default is to use src.nodata)
    """
    src_nodata = getattr(src, "nodata", None)
    if dst_nodata is None:
        dst_nodata = src_nodata

    src_geobox = src.geobox
    assert src_geobox is not None

    yx_dims = spatial_dims(src)
    axis = tuple(src.dims).index(yx_dims[0])

    src_dims = tuple(src.dims)
    dst_dims = src_dims[:axis] + geobox.dims + src_dims[axis + 2:]

    coords = geobox.xr_coords(with_crs=True)

    # copy non-spatial coords from src to dst
    src_non_spatial_dims = src_dims[:axis] + src_dims[axis + 2:]
    for dim in src_non_spatial_dims:
        if dim not in coords:
            coords[dim] = src.coords[dim]

    attrs = {}
    if dst_nodata is not None:
        attrs["nodata"] = dst_nodata

    if is_dask_collection(src):
        data = dask_reproject(
            src.data,
            src_geobox,
            geobox,
            resampling=resampling,
            chunks=chunks,
            src_nodata=src_nodata,
            dst_nodata=dst_nodata,
            axis=axis,
        )
    else:
        data = _reproject_block_impl(
            src.data,
            src_geobox,
            geobox,
            resampling=resampling,
            src_nodata=src_nodata,
            dst_nodata=dst_nodata,
            axis=axis,
        )

    return xr.DataArray(data,
                        name=src.name,
                        coords=coords,
                        dims=dst_dims,
                        attrs=attrs)
Example 16
 def __dask_keys__(self):
     return [
         v.__dask_keys__() for v in self._data_vars.values()
         if dask.is_dask_collection(v)
     ]
Example 17
def xr_phenology(
    da,
    stats=[
        "SOS",
        "POS",
        "EOS",
        "Trough",
        "vSOS",
        "vPOS",
        "vEOS",
        "LOS",
        "AOS",
        "ROG",
        "ROS",
    ],
    method_sos="median",
    method_eos="median",
    complete='fast_complete',
    smoothing=None,
    show_progress=True,
):
    """
    Obtain land surface phenology metrics from an
    xarray.DataArray containing a timeseries of a 
    vegetation index like NDVI.

    last modified June 2020

    Parameters
    ----------
    da :  xarray.DataArray
        DataArray should contain a 2D or 3D time series of a
        vegetation index like NDVI, EVI
    stats : list
        list of phenological statistics to return. Regardless of
        the metrics returned, all statistics are calculated
        due to inter-dependencies between metrics.
        Options include:
            SOS = DOY of start of season
            POS = DOY of peak of season
            EOS = DOY of end of season
            vSOS = Value at start of season
            vPOS = Value at peak of season
            vEOS = Value at end of season
            Trough = Minimum value of season
            LOS = Length of season (DOY)
            AOS = Amplitude of season (in value units)
            ROG = Rate of greening
            ROS = Rate of senescence
    method_sos : str 
        If 'first' then vSOS is estimated as the first positive 
        slope on the greening side of the curve. If 'median',
        then vSOS is estimated as the median value of the positive
        slopes on the greening side of the curve.
    method_eos : str
        If 'last' then vEOS is estimated as the last negative slope
        on the senescing side of the curve. If 'median', then vEOS is
        estimated as the 'median' value of the negative slopes on the
        senescing side of the curve.
    complete : str
        If 'fast_complete', the timeseries will be completed (gap filled) using
        fast_completion(); if 'linear', the time series will be completed using
        da.interpolate_na(method='linear')
    smoothing : str
        If 'wiener', the timeseries will be smoothed using the
        scipy.signal.wiener filter with a window size of 3.  If 'rolling_mean', 
        then timeseries is smoothed using a rolling mean with a window size of 3.
        If set to 'linear', will be smoothed using da.resample(time='1W').interpolate('linear')

    Outputs
    -------
        xarray.Dataset containing variables for the selected 
        phenology statistics 

    """
    # Check inputs before running calculations
    if dask.is_dask_collection(da):
        if version.parse(xr.__version__) < version.parse('0.16.0'):
            raise TypeError(
                "Dask arrays are not supported by this function when using "
                "xarray < 0.16; run da.compute() before passing the DataArray.")
        stats_dtype = {
            "SOS": np.int16,
            "POS": np.int16,
            "EOS": np.int16,
            "Trough": np.float32,
            "vSOS": np.float32,
            "vPOS": np.float32,
            "vEOS": np.float32,
            "LOS": np.int16,
            "AOS": np.float32,
            "ROG": np.float32,
            "ROS": np.float32,
        }
        da_template = da.isel(time=0).drop('time')
        template = xr.Dataset({
            var_name: da_template.astype(var_dtype)
            for var_name, var_dtype in stats_dtype.items() if var_name in stats
        })
        da_all_time = da.chunk({'time': -1})

        lazy_phenology = da_all_time.map_blocks(xr_phenology,
                                                kwargs=dict(
                                                    stats=stats,
                                                    method_sos=method_sos,
                                                    method_eos=method_eos,
                                                    complete=complete,
                                                    smoothing=smoothing,
                                                ),
                                                template=xr.Dataset(template))

        try:
            crs = da.geobox.crs
            lazy_phenology = assign_crs(lazy_phenology, str(crs))
        except:
            pass

        return lazy_phenology

    if method_sos not in ("median", "first"):
        raise ValueError("method_sos should be either 'median' or 'first'")

    if method_eos not in ("median", "last"):
        raise ValueError("method_eos should be either 'median' or 'last'")

    # If stats supplied is not a list, convert to list.
    stats = stats if isinstance(stats, list) else [stats]

    #try to grab the crs info
    try:
        crs = da.geobox.crs
    except:
        pass

    # complete timeseries
    if complete is not None:

        if complete == 'fast_complete':

            if len(da.shape) == 1:
                print(
                    "fast_complete does not operate on 1D timeseries, using 'linear' instead"
                )
                da = da.interpolate_na(dim='time', method='linear')

            else:
                print("Completing using fast_complete...")
                da = fast_completion(da)

        if complete == 'linear':
            print("Completing using linear interp...")
            da = da.interpolate_na(dim='time', method='linear')

    if smoothing is not None:

        if smoothing == "wiener":
            if len(da.shape) == 1:
                print(
                    "wiener method does not operate on 1D timeseries, using 'rolling_mean' instead"
                )
                da = da.rolling(time=3, min_periods=1).mean()

            else:
                print("   Smoothing with wiener filter...")
                da = smooth(da)

        if smoothing == "rolling_mean":
            print("   Smoothing with rolling mean...")
            da = da.rolling(time=3, min_periods=1).mean()

        if smoothing == 'linear':
            print("    Smoothing using linear interpolation...")
            da = da.resample(time='1W').interpolate('linear')

    # remove any remaining all-NaN pixels
    mask = da.isnull().all("time")
    da = da.where(~mask, other=0)

    # calculate the statistics
    print("      Phenology...")
    vpos = _vpos(da)
    pos = _pos(da)
    trough = _trough(da)
    aos = _aos(vpos, trough)
    vsos = _vsos(da, pos, method_sos=method_sos)
    sos = _sos(vsos)
    veos = _veos(da, pos, method_eos=method_eos)
    eos = _eos(veos)
    los = _los(da, eos, sos)
    rog = _rog(vpos, vsos, pos, sos)
    ros = _ros(veos, vpos, eos, pos)

    # Dictionary containing the statistics
    stats_dict = {
        "SOS": sos.astype(np.int16),
        "EOS": eos.astype(np.int16),
        "vSOS": vsos.astype(np.float32),
        "vPOS": vpos.astype(np.float32),
        "Trough": trough.astype(np.float32),
        "POS": pos.astype(np.int16),
        "vEOS": veos.astype(np.float32),
        "LOS": los.astype(np.int16),
        "AOS": aos.astype(np.float32),
        "ROG": rog.astype(np.float32),
        "ROS": ros.astype(np.float32),
    }

    # initialise dataset with first statistic
    ds = stats_dict[stats[0]].to_dataset(name=stats[0])

    # add the other stats to the dataset
    for stat in stats[1:]:
        print("         " + stat)
        stats_keep = stats_dict.get(stat)
        ds[stat] = stats_dict[stat]

    try:
        ds = assign_crs(ds, str(crs))
    except:
        pass

    return ds.drop('time')
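The dask branch above leans on xarray's map_blocks-with-template mechanism; a minimal, generic sketch of that pattern (not the phenology code itself), assuming xarray >= 0.16:

import numpy as np
import pandas as pd
import xarray as xr
import dask

da_ts = xr.DataArray(
    np.random.rand(6, 4, 4),
    dims=("time", "y", "x"),
    coords={"time": pd.date_range("2020-01-01", periods=6)},
).chunk({"time": -1, "y": 2, "x": 2})

template = da_ts.isel(time=0).drop_vars("time")                      # lazy 2-D template
peak = da_ts.map_blocks(lambda x: x.max("time"), template=template)
assert dask.is_dask_collection(peak)                                 # result is still lazy
print(peak.compute().shape)                                          # (4, 4)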
Example 18
def return_inits_and_verif_dates(
    forecast: xr.Dataset,
    verif: xr.Dataset,
    alignment: str,
    reference: Optional[Union[str, List[str]]] = None,
    hist: Optional[xr.Dataset] = None,
) -> returnType:
    """Return initializations and verification dates per a given alignment strategy.

    Args:
        forecast (``xarray`` object): Prediction ensemble with ``init`` dim renamed to
            ``time`` and containing ``lead`` dim.
        verif (``xarray`` object): Verification data with ``time`` dim.
        alignment (str): Strategy for initialization-verification alignment.
            * 'same_inits': Use a common set of initializations that verify
               across all leads. This ensures that there is no bias in the result due
               to the state of the system for the given initializations.
            * 'same_verifs': Use a common verification window across all leads. This
               ensures that there is no bias in the result due to the observational
               period being verified against.
            * 'maximize': Use all available initializations at each lead that verify
               against the observations provided. This changes both the set of
               initializations and the verification window used at each lead.

    Return:
        inits (dict): Keys are the lead time integer, values are an ``xr.DataArray`` of
            initialization dates.
        verif_dates (dict): Keys are the lead time integer, values are an
            ``xr.CFTimeIndex`` of verification dates.
    """
    if isinstance(reference, str):
        reference = [reference]
    elif reference is None:
        reference = []

    is_in_list(alignment, VALID_ALIGNMENTS, "alignment")
    units = forecast["lead"].attrs["units"]
    leads = forecast["lead"].values

    # `init` renamed to `time` in compute functions.
    all_inits = forecast["time"]
    all_verifs = verif["time"]

    # If aligning reference='uninitialized', need to account for potential differences
    # in its temporal coverage. Note that the reference='uninitialized' only aligns
    # verification dates and doesn't need to care about inits.
    if hist is not None:
        all_verifs = np.sort(list(set(all_verifs.data) & set(hist["time"].data)))
        all_verifs = xr.DataArray(all_verifs, dims=["time"], coords=[all_verifs])

    # Construct list of `n` offset over all leads.
    n, freq = get_multiple_lead_cftime_shift_args(units, leads)

    if "valid_time" not in forecast.coords:  # old: create init_lead_matrix
        init_lead_matrix = _construct_init_lead_matrix(forecast, n, freq, leads)
    else:  # new: use valid_time(init, lead)
        init_lead_matrix = forecast["valid_time"].drop_vars("valid_time").rename(None)
    if dask.is_dask_collection(init_lead_matrix):
        init_lead_matrix = init_lead_matrix.compute()

    # A union between `inits` and observations in the verification data is required
    # for persistence, since the persistence forecast is based off a common set of
    # initializations.
    if "persistence" in reference:
        union_with_verifs = _isin(all_inits, all_verifs)
        init_lead_matrix = init_lead_matrix.where(union_with_verifs, drop=True)
    valid_inits = init_lead_matrix["time"]

    if "same_init" in alignment:
        return _same_inits_alignment(
            init_lead_matrix, valid_inits, all_verifs, leads, n, freq
        )
    elif "same_verif" in alignment:
        return _same_verifs_alignment(
            init_lead_matrix, valid_inits, all_verifs, leads, n, freq
        )
    elif alignment == "maximize":
        return _maximize_alignment(init_lead_matrix, all_verifs, leads)
    else:
        raise ValueError
Example 19
def temporal_statistics(da, stats):
    """
    Obtain generic temporal statistics using the hdstats temporal library:
    https://github.com/daleroberts/hdstats/blob/master/hdstats/ts.pyx
    
    last modified June 2020
    
    Parameters
    ----------
    da :  xarray.DataArray
        DataArray should contain a 3D time series.
    stats : list
        list of temporal statistics to calculate.
        Options include:
            'discordance' = 
            'f_std' = std of discrete fourier transform coefficients, returns
                      three layers: f_std_n1, f_std_n2, f_std_n3
            'f_mean' = mean of discrete fourier transform coefficients, returns
                       three layers: f_mean_n1, f_mean_n2, f_mean_n3
            'f_median' = median of discrete fourier transform coefficients, returns
                         three layers: f_median_n1, f_median_n2, f_median_n3
            'mean_change' = mean of discrete difference along time dimension
            'median_change' = median of discrete difference along time dimension
            'abs_change' = mean of absolute discrete difference along time dimension
            'complexity' = 
            'central_diff' = 
            'num_peaks' : The number of peaks in the timeseries, defined with a local
                          window of size 10.  NOTE: This statistic is very slow
    Outputs
    -------
        xarray.Dataset containing variables for the selected 
        temporal statistics
        
    """

    # if dask arrays then map the blocks
    if dask.is_dask_collection(da):
        if version.parse(xr.__version__) < version.parse("0.16.0"):
            raise TypeError(
                "Dask arrays are only supported by this function when using "
                "xarray v0.16 or later; run da.compute() before passing the DataArray.")

        # create a template that matches the final datasets dims & vars
        arr = da.isel(time=0).drop("time")

        # deal with the case where fourier is first in the list
        if stats[0] in ("f_std", "f_median", "f_mean"):
            template = xr.zeros_like(arr).to_dataset(name=stats[0] + "_n1")
            template[stats[0] + "_n2"] = xr.zeros_like(arr)
            template[stats[0] + "_n3"] = xr.zeros_like(arr)

            for stat in stats[1:]:
                if stat in ("f_std", "f_median", "f_mean"):
                    template[stat + "_n1"] = xr.zeros_like(arr)
                    template[stat + "_n2"] = xr.zeros_like(arr)
                    template[stat + "_n3"] = xr.zeros_like(arr)
                else:
                    template[stat] = xr.zeros_like(arr)
        else:
            template = xr.zeros_like(arr).to_dataset(name=stats[0])

            for stat in stats:
                if stat in ("f_std", "f_median", "f_mean"):
                    template[stat + "_n1"] = xr.zeros_like(arr)
                    template[stat + "_n2"] = xr.zeros_like(arr)
                    template[stat + "_n3"] = xr.zeros_like(arr)
                else:
                    template[stat] = xr.zeros_like(arr)
        try:
            template = template.drop('spatial_ref')
        except:
            pass

        # ensure the time chunk is set to -1
        da_all_time = da.chunk({"time": -1})

        # apply function across chunks
        lazy_ds = da_all_time.map_blocks(temporal_statistics,
                                         kwargs={"stats": stats},
                                         template=template)

        try:
            crs = da.geobox.crs
            lazy_ds = assign_crs(lazy_ds, str(crs))
        except:
            pass

        return lazy_ds

    # If stats supplied is not a list, convert to list.
    stats = stats if isinstance(stats, list) else [stats]

    # grab all the attributes of the xarray
    x, y, time, attrs = da.x, da.y, da.time, da.attrs

    # deal with any all-NaN pixels by filling with 0's
    mask = da.isnull().all("time")
    da = da.where(~mask, other=0)

    # complete timeseries
    print("Completing...")
    da = fast_completion(da)

    # ensure dim order is correct for functions
    da = da.transpose("y", "x", "time").values

    stats_dict = {
        "discordance": lambda da: hdstats.discordance(da, n=10),
        "f_std": lambda da: hdstats.fourier_std(da, n=3, step=5),
        "f_mean": lambda da: hdstats.fourier_mean(da, n=3, step=5),
        "f_median": lambda da: hdstats.fourier_median(da, n=3, step=5),
        "mean_change": lambda da: hdstats.mean_change(da),
        "median_change": lambda da: hdstats.median_change(da),
        "abs_change": lambda da: hdstats.mean_abs_change(da),
        "complexity": lambda da: hdstats.complexity(da),
        "central_diff": lambda da: hdstats.mean_central_diff(da),
        "num_peaks": lambda da: hdstats.number_peaks(da, 10),
    }

    print("   Statistics:")
    # if one of the fourier functions is first (or only)
    # stat in the list then we need to deal with this
    if stats[0] in ("f_std", "f_median", "f_mean"):
        print("      " + stats[0])
        stat_func = stats_dict.get(str(stats[0]))
        zz = stat_func(da)
        n1 = zz[:, :, 0]
        n2 = zz[:, :, 1]
        n3 = zz[:, :, 2]

        # initialise dataset with first statistic
        ds = xr.DataArray(n1,
                          attrs=attrs,
                          coords={
                              "x": x,
                              "y": y
                          },
                          dims=["y", "x"]).to_dataset(name=stats[0] + "_n1")

        # add other datasets
        for i, j in zip([n2, n3], ["n2", "n3"]):
            ds[stats[0] + "_" + j] = xr.DataArray(i,
                                                  attrs=attrs,
                                                  coords={
                                                      "x": x,
                                                      "y": y
                                                  },
                                                  dims=["y", "x"])
    else:
        # simpler if first function isn't fourier transform
        first_func = stats_dict.get(str(stats[0]))
        print("      " + stats[0])
        ds = first_func(da)

        # convert back to xarray dataset
        ds = xr.DataArray(ds,
                          attrs=attrs,
                          coords={
                              "x": x,
                              "y": y
                          },
                          dims=["y", "x"]).to_dataset(name=stats[0])

    # loop through the other functions
    for stat in stats[1:]:
        print("      " + stat)

        # handle the fourier transform examples
        if stat in ("f_std", "f_median", "f_mean"):
            stat_func = stats_dict.get(str(stat))
            zz = stat_func(da)
            n1 = zz[:, :, 0]
            n2 = zz[:, :, 1]
            n3 = zz[:, :, 2]

            for i, j in zip([n1, n2, n3], ["n1", "n2", "n3"]):
                ds[stat + "_" + j] = xr.DataArray(i,
                                                  attrs=attrs,
                                                  coords={
                                                      "x": x,
                                                      "y": y
                                                  },
                                                  dims=["y", "x"])

        else:
            # Select a stats function from the dictionary
            # and add to the dataset
            stat_func = stats_dict.get(str(stat))
            ds[stat] = xr.DataArray(stat_func(da),
                                    attrs=attrs,
                                    coords={
                                        "x": x,
                                        "y": y
                                    },
                                    dims=["y", "x"])

    # try to add back the geobox
    try:
        crs = da.geobox.crs
        ds = assign_crs(ds, str(crs))
    except:
        pass

    return ds
Example 20
 def test_inverse_transform(self):
     a = dpp.StandardScaler()
     result = a.inverse_transform(a.fit_transform(X))
     assert dask.is_dask_collection(result)
     assert_eq_ar(result, X)
Example 21
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Mapping[str, Any] = None,
) -> T_DSorDA:
    """Apply a function to each chunk of a DataArray or Dataset. This function is
    experimental and its signature may change.

    Parameters
    ----------
    func: callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter. The function will receive a subset of 'obj' (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(obj_subset, *args, **kwargs)``.

        The function will be first run on mocked-up data, that looks like 'obj' but
        has sizes 0, to determine properties of the returned object such as dtype,
        variable names, new dimensions and new indexes (if any).

        This function must return either a single DataArray or a single Dataset.

        This function cannot change size of existing dimensions, or add new chunked
        dimensions.
    obj: DataArray, Dataset
        Passed to the function as its first argument, one dask chunk at a time.
    args: Sequence
        Passed verbatim to func after unpacking, after the sliced obj. xarray objects,
        if any, will not be split by chunks. Passing dask collections is not allowed.
    kwargs: Mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        split by chunks. Passing dask collections is not allowed.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when one needs to manipulate a whole xarray object
    within each chunk. In the more common case where one can work on numpy arrays, it is
    recommended to use apply_ufunc.

    If none of the variables in obj is backed by dask, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks,
    xarray.DataArray.map_blocks

    Examples
    --------

    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     # Necessary workaround to xarray's check with zero dimensions
    ...     # https://github.com/pydata/xarray/issues/3575
    ...     if sum(da.shape) == 0:
    ...         return da
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)), dims="time", coords=[time]
    ... ).chunk()
    >>> xr.map_blocks(calculate_anomaly, array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> xr.map_blocks(
    ...     calculate_anomaly, array, kwargs={"groupby_type": "time.year"},
    ... )
    <xarray.DataArray (time: 24)>
    array([ 0.15361741, -0.25671244, -0.31600032,  0.008463  ,  0.1766172 ,
           -0.11974531,  0.43791243,  0.14197797, -0.06191987, -0.15073425,
           -0.19967375,  0.18619794, -0.05100474, -0.42989909, -0.09153273,
            0.24841842, -0.30708526, -0.31412523,  0.04197439,  0.0422506 ,
            0.14482397,  0.35985481,  0.23487834,  0.12144652])
    Coordinates:
        * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
    """

    def _wrapper(func, obj, to_array, args, kwargs):
        if to_array:
            obj = dataset_to_dataarray(obj)

        result = func(obj, *args, **kwargs)

        for name, index in result.indexes.items():
            if name in obj.indexes:
                if len(index) != len(obj.indexes[name]):
                    raise ValueError(
                        "Length of the %r dimension has changed. This is not allowed."
                        % name
                    )

        return make_dict(result)

    if not isinstance(args, Sequence):
        raise TypeError("args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in list(args) + list(kwargs.values()):
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in args or kwargs yet. Please compute or "
                "load values before passing to map_blocks."
            )

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    if isinstance(obj, DataArray):
        # only using _to_temp_dataset would break
        # func = lambda x: x.to_dataset()
        # since that relies on preserving name.
        if obj.name is None:
            dataset = obj._to_temp_dataset()
        else:
            dataset = obj.to_dataset()
        input_is_array = True
    else:
        dataset = obj
        input_is_array = False

    input_chunks = dataset.chunks

    template: Union[DataArray, Dataset] = infer_template(func, obj, *args, **kwargs)
    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}"
        )

    template_indexes = set(template.indexes)
    dataset_indexes = set(dataset.indexes)
    preserved_indexes = template_indexes & dataset_indexes
    new_indexes = template_indexes - dataset_indexes
    indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes}
    indexes.update({k: template.indexes[k] for k in new_indexes})

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(
        dask.utils.funcname(func), dask.base.tokenize(dataset, args, kwargs)
    )

    # map dims to list of chunk indexes
    ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()}
    # mapping from chunk index to slice bounds
    chunk_index_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items()
    }

    # iterate over all possible chunk combinations
    for v in itertools.product(*ichunk.values()):
        chunk_index_dict = dict(zip(dataset.dims, v))

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index_dict[dim]]

                chunk_variable_task = (f"{gname}-{chunk[0]}",) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array with possibly chunked dimensions
                # index into variable appropriately
                subsetter = {}
                for dim in variable.dims:
                    if dim in chunk_index_dict:
                        which_chunk = chunk_index_dict[dim]
                        subsetter[dim] = slice(
                            chunk_index_bounds[dim][which_chunk],
                            chunk_index_bounds[dim][which_chunk + 1],
                        )

                subset = variable.isel(subsetter)
                chunk_variable_task = (
                    "{}-{}".format(gname, dask.base.tokenize(subset)),
                ) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        from_wrapper = (gname,) + v
        graph[from_wrapper] = (
            _wrapper,
            func,
            (Dataset, (dict, data_vars), (dict, coords), dataset.attrs),
            input_is_array,
            args,
            kwargs,
        )

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{gname}-{name}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l,)
            for dim in variable.dims:
                if dim in chunk_index_dict:
                    key += (chunk_index_dict[dim],)
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    key += (0,)

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(gname, graph, dependencies=[dataset])

    for gname_l, layer in new_layers.items():
        # This adds in the getitems for each variable in the dataset.
        hlg.dependencies[gname_l] = {gname}
        hlg.layers[gname_l] = layer

    result = Dataset(coords=indexes, attrs=template.attrs)
    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in input_chunks:
                var_chunks.append(input_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]),))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim],))

        data = dask.array.Array(
            hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype
        )
        result[name] = (dims, data, template[name].attrs)

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore
    return result  # type: ignore
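
# A small check of the fast path above: when `obj` has no dask-backed
# variables, xr.map_blocks simply calls func(obj, *args, **kwargs), while a
# chunked input builds a lazy dask-backed result instead. This is only a
# usage sketch, not part of the module above.
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(6.0), dims="t", name="a")
eager = xr.map_blocks(lambda x: x + 1, arr)                  # numpy in -> computed eagerly
lazy = xr.map_blocks(lambda x: x + 1, arr.chunk({"t": 3}))   # dask in -> lazy graph
print(type(eager.data).__name__, type(lazy.data).__name__)   # ndarray vs Array
print(eager.values.tolist())
print(lazy.compute().values.tolist())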
Esempio n. 22
0
    def fit(self, X, y=None):
        if not dask.is_dask_collection(X):
            raise TypeError(_TYPE_MSG.format(type(X)))
        self._fit(X)
        self.n_features_in_ = X.shape[1]
        return self
Esempio n. 23
0
    def transform(self, X):
        """Transform a sequence of documents to a document-term matrix.

        Transformation is done in parallel, and correctly handles dask
        collections.

        Parameters
        ----------
        X : dask.Bag of raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : dask.array.Array, shape = (n_samples, self.n_features)
            Document-term matrix. Each block of the array is a scipy sparse
            matrix.

        Notes
        -----
        The returned dask Array is composed of scipy sparse matrices. If you need
        to compute on the result immediately, you may need to convert the individual
        blocks to ndarrays or pydata/sparse matrices.

        >>> import sparse
        >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

        See the :doc:`examples/text-vectorization` for more.
        """
        transformer = super(HashingVectorizer, self).transform

        msg = "'X' should be a 1-dimensional array with length 'num_samples'."

        if not dask.is_dask_collection(X):
            return transformer(X)

        if isinstance(X, db.Bag):
            bag2 = X.map_partitions(transformer)
            objs = bag2.to_delayed()
            arrs = [
                da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
                for obj in objs
            ]
            result = da.concatenate(arrs, axis=0)
        elif isinstance(X, dd.Series):
            result = X.map_partitions(transformer)
        elif isinstance(X, da.Array):
            # dask.Array
            chunks = ((np.nan, ) * X.numblocks[0], (self.n_features, ))
            if X.ndim == 1:
                result = X.map_blocks(transformer,
                                      dtype="f8",
                                      chunks=chunks,
                                      new_axis=1)
            else:
                raise ValueError(msg)
        else:
            raise ValueError(msg)

        return result
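
# Hedged usage sketch for the transform above: feeding a dask.bag of raw
# documents through a HashingVectorizer-style transformer yields a dask array
# with one scipy-sparse block per bag partition and unknown row counts (hence
# the NaN chunk sizes). The dask_ml import location is an assumption about
# where this class lives; adjust it if the actual module differs.
import dask.bag as db
from dask_ml.feature_extraction.text import HashingVectorizer  # assumed module path

docs = db.from_sequence(
    ["the quick brown fox", "jumps over", "the lazy dog"], npartitions=2
)
vec = HashingVectorizer(n_features=2 ** 10)
X = vec.transform(docs)        # lazy dask array of scipy sparse blocks
print(X.shape, X.numblocks)    # ((nan, 1024), (2, 1)) -- row counts unknown until computed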
Esempio n. 24
0
    def dispatch(self, hyper_model, X, y, X_val, y_val, max_trails, dataset_id,
                 trail_store, **fit_kwargs):
        assert not any(dask.is_dask_collection(i) for i in (X, y, X_val, y_val)), \
            f'{self.__class__.__name__} does not support running trails with dask collections.'

        experiment = time.strftime('%Y%m%d%H%M%S')
        experiment_model_root = f'{_model_root}/{experiment}'
        os.makedirs(experiment_model_root, exist_ok=True)

        queue_size = int(config('search_queue', '1'))
        worker_count = int(config('search_executors', '3'))
        retry_limit = int(config('search_retry', '1000'))

        failed_counter = Counter()
        success_counter = Counter()

        def on_trail_start(trail_item):
            trail_item.start_at = time.time()
            if logger.is_info_enabled():
                msg = f'Start trail {trail_item.trail_no}, space_id={trail_item.space_id}' \
                      + f',model_file={trail_item.model_file}'
                logger.info(msg)
            for callback in hyper_model.callbacks:
                # callback.on_build_estimator(hyper_model, space_sample, estimator, trail_no) #fixme
                callback.on_trail_begin(hyper_model, trail_item.space_sample,
                                        trail_item.trail_no)

        def on_trail_done(trail_item):
            trail_item.done_at = time.time()

            if trail_item.reward != 0 and not math.isnan(
                    trail_item.reward):  # success
                improved = hyper_model.history.append(trail_item)
                for callback in hyper_model.callbacks:
                    callback.on_trail_end(hyper_model, trail_item.space_sample,
                                          trail_item.trail_no,
                                          trail_item.reward, improved,
                                          trail_item.elapsed)
                success_counter()
            else:
                for callback in hyper_model.callbacks:
                    callback.on_trail_error(hyper_model,
                                            trail_item.space_sample,
                                            trail_item.trail_no)
                failed_counter()

            if logger.is_info_enabled():
                elapsed = '%.3f' % (trail_item.done_at - trail_item.start_at)
                msg = f'Trail {trail_item.trail_no} done with reward={trail_item.reward}, ' \
                      f'elapsed {elapsed} seconds\n' \
                      f'----------------------------------------------------------------\n' \
                      f'space signatures: \n{hyper_model.history.get_space_signatures()}\n' \
                      f'----------------------------------------------------------------'
                logger.info(msg)
            if trail_store is not None:
                trail_store.put(dataset_id, trail_item)

        pool = DaskExecutorPool(worker_count, queue_size, on_trail_start,
                                on_trail_done, hyper_model._run_trial, X, y,
                                X_val, y_val, fit_kwargs)
        pool.start()

        trail_no = 1
        retry_counter = 0

        while trail_no <= max_trails and pool.running:
            if pool.qsize >= queue_size:
                time.sleep(0.1)
                continue

            space_sample = hyper_model.searcher.sample()
            if hyper_model.history.is_existed(space_sample):
                if retry_counter >= retry_limit:
                    logger.info(
                        f'Unable to take a valid sample; retry limit {retry_limit} exceeded.'
                    )
                    break
                trail = hyper_model.history.get_trail(space_sample)
                for callback in hyper_model.callbacks:
                    callback.on_skip_trail(hyper_model, space_sample, trail_no,
                                           'trail_existed', trail.reward,
                                           False, trail.elapsed)
                retry_counter += 1
                continue

            try:
                if trail_store is not None:
                    trail = trail_store.get(dataset_id, space_sample)
                    if trail is not None:
                        reward = trail.reward
                        elapsed = trail.elapsed
                        trail = Trail(space_sample, trail_no, reward, elapsed)
                        improved = hyper_model.history.append(trail)
                        hyper_model.searcher.update_result(
                            space_sample, reward)
                        for callback in hyper_model.callbacks:
                            callback.on_skip_trail(hyper_model, space_sample,
                                                   trail_no, 'hit_trail_store',
                                                   reward, improved, elapsed)
                        trail_no += 1
                        continue

                model_file = '%s/%05d_%s.pkl' % (
                    experiment_model_root, trail_no, space_sample.space_id)

                item = DaskTrailItem(space_sample,
                                     trail_no,
                                     model_file=model_file)
                pool.push(item)

                if logger.is_info_enabled():
                    logger.info(
                        f'Found trail {trail_no}, queue size: {pool.qsize}')
            except EarlyStoppingError:
                pool.stop()
                break
            except KeyboardInterrupt:
                pool.stop()
                pool.interrupted = True
                print('KeyboardInterrupt')
                break
            except Exception as e:
                import traceback
                msg = f'{">" * 20} Search trail {trail_no} failed! {"<" * 20}\n' \
                      + f'{e.__class__.__name__}: {e}\n' \
                      + traceback.format_exc() \
                      + '*' * 50
                logger.error(msg)
            finally:
                trail_no += 1
                retry_counter = 0

        # wait trails
        if pool.running:
            logger.info('Search done, waiting for remaining trail tasks.')
        pool.push(None)  # mark end
        pool.join()

        if logger.is_info_enabled():
            logger.info(
                f'Search and all trails done, {success_counter.value} success, '
                f'{failed_counter.value} failed.')

        return trail_no
Esempio n. 25
0
def test_gap_fill():
    a = np.zeros((5,), dtype="uint8")
    b = np.empty_like(a)
    b[:] = 33

    a[0] = 11
    ab = _gap_fill_np(a, b, 0)
    assert ab.dtype == a.dtype
    assert ab.tolist() == [11, 33, 33, 33, 33]

    xa = xr.DataArray(
        a,
        name="test_a",
        dims=("t",),
        attrs={"p1": 1, "nodata": 0},
        coords=dict(t=np.arange(a.shape[0])),
    )
    xb = xa + 0
    xb.data[:] = b
    xab = gap_fill(xa, xb)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.data.tolist() == [11, 33, 33, 33, 33]

    xa.attrs["nodata"] = 11
    assert gap_fill(xa, xb).data.tolist() == [33, 0, 0, 0, 0]

    a = np.zeros((5,), dtype="float32")
    a[1:] = np.nan
    b = np.empty_like(a)
    b[:] = 33
    ab = _gap_fill_np(a, b, np.nan)

    assert ab.dtype == a.dtype
    assert ab.tolist() == [0, 33, 33, 33, 33]

    xa = xr.DataArray(
        a,
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(a.shape[0])),
    )
    xb = xa + 0
    xb.data[:] = b
    xab = gap_fill(xa, xb)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.data.tolist() == [0, 33, 33, 33, 33]

    xa = xr.DataArray(
        da.from_array(a),
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(a.shape[0])),
    )

    xb = xr.DataArray(
        da.from_array(b),
        name="test_a",
        dims=("t",),
        attrs={"p1": 1},
        coords=dict(t=np.arange(b.shape[0])),
    )

    assert dask.is_dask_collection(xa)
    assert dask.is_dask_collection(xb)
    xab = gap_fill(xa, xb)

    assert dask.is_dask_collection(xab)
    assert xab.name == xa.name
    assert xab.attrs == xa.attrs
    assert xab.compute().values.tolist() == [0, 33, 33, 33, 33]
Esempio n. 26
0
def int_geomedian(ds, scale=1, offset=0, wk_rows=-1, as_array=False, **kw):
    """ds -- xr.Dataset (possibly dask) with dims: (time, y, x) for each band

        the time dimension is removed from the output

    :param ds: Dataset with int data variables
    :param scale: Normalize data for running computation (output is scaled back to original values)
    :param offset: ``(x*scale + offset)``
    :param wk_rows: reduce memory requirements by processing that many rows of a chunk at a time
    :param as_array: If set to True return DataArray with band dimension instead of Dataset
    :param kw: Passed on to hdstats (eps=1e-4, num_threads=1, maxiters=10_000, nocheck=True)

    """
    band_names = [dv.name for dv in ds.data_vars.values()]
    xx, *_ = ds.data_vars.values()
    nodata = getattr(xx, "nodata", None)

    is_dask = dask.is_dask_collection(xx)
    if is_dask:
        if xx.data.chunksize[0] != xx.shape[0]:
            ds = ds.chunk(chunks={xx.dims[0]: -1})
            xx, *_ = ds.data_vars.values()

    nt, ny, nx = xx.shape
    bands = [dv.data for dv in ds.data_vars.values()]
    band = bands[0]
    nb = len(bands)
    dtype = band.dtype

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-4)
    kw.setdefault("maxiters", 10_000)

    if is_dask:
        chunks = ((nb, ), *xx.chunks[1:])

        data = da.map_blocks(
            int_geomedian_np,
            *bands,
            nodata=nodata,
            scale=scale,
            offset=offset,
            wk_rows=wk_rows,
            **kw,
            name=randomize("geomedian"),
            dtype=dtype,
            chunks=chunks,
            drop_axis=[0],  # time is dropped
            new_axis=[0],
        )  # band is added on the left
    else:
        data = int_geomedian_np(*bands,
                                nodata=nodata,
                                scale=scale,
                                offset=offset,
                                wk_rows=wk_rows,
                                **kw)

    dims = ("band", *xx.dims[1:])
    cc = {k: xx.coords[k] for k in dims[1:]}
    cc["band"] = band_names

    da_out = xr.DataArray(data, dims=dims, coords=cc)

    if as_array:
        if nodata is not None:
            da_out.attrs["nodata"] = nodata
        return da_out

    ds_out = da_out.to_dataset(dim="band")
    ds_out.attrs.update(ds.attrs)
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
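
# Hedged usage sketch for int_geomedian above: a small dask-backed Dataset of
# int16 bands with dims (time, y, x) reduced to one composite layer per band.
# The 1/10_000 scale mirrors the common convention for surface reflectance
# stored as integers -- an assumption for the example, not a requirement.
# Computing the result requires the hdstats package to be installed.
import numpy as np
import dask.array as da
import xarray as xr

ny = nx = 8
coords = {"time": np.arange(3), "y": np.arange(ny), "x": np.arange(nx)}
bands = {
    name: (("time", "y", "x"),
           da.random.randint(1, 10_000, size=(3, ny, nx), chunks=(1, 4, 4)).astype("int16"))
    for name in ("red", "green", "blue")
}
ds = xr.Dataset(bands, coords=coords)

gm = int_geomedian(ds, scale=1 / 10_000, offset=0)  # lazy: one (y, x) layer per band
print(gm)                                           # gm.compute() needs hdstats installed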
Esempio n. 27
0
def assert_chunk(actual, chunk_bool):
    """check that actual is chunked when chunk_bool==True."""
    if chunk_bool:
        assert is_dask_collection(actual)
    else:
        assert not is_dask_collection(actual)
Esempio n. 28
0
def xr_geomedian(ds, axis="time", where=None, **kw):
    """

    :param ds: xr.Dataset|xr.DataArray|numpy array

    Other parameters:
    **kwargs -- passed on to pcm.gnmpcm
       maxiters   : int         1000
       eps        : float       0.0001
       num_threads: int| None   None
    """
    from hdstats import nangeomedian_pcm

    def norm_input(ds, axis):
        if isinstance(ds, xr.DataArray):
            xx = ds
            if len(xx.dims) != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            if axis is not None and xx.dims[3] != axis:
                raise ValueError(
                    f"Can only reduce last dimension, expect: y,x,band,{axis}")
            return None, xx, xx.data
        elif isinstance(ds, xr.Dataset):
            xx = reshape_for_geomedian(ds, axis)
            return ds, xx, xx.data
        else:  # assume numpy or similar
            xx_data = ds
            if xx_data.ndim != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            return None, None, xx_data

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-6)

    ds, xx, xx_data = norm_input(ds, axis)
    is_dask = dask.is_dask_collection(xx_data)

    if where is not None:
        if is_dask:
            raise NotImplementedError(
                "Dask version doesn't support output masking currently")

        if where.shape != xx_data.shape[:2]:
            raise ValueError("Shape for `where` parameter doesn't match")
        set_nan = ~where
    else:
        set_nan = None

    if is_dask:
        if xx_data.shape[-2:] != xx_data.chunksize[-2:]:
            xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1))

        data = da.map_blocks(
            lambda x: nangeomedian_pcm(x, **kw),
            xx_data,
            name=randomize("geomedian"),
            dtype=xx_data.dtype,
            drop_axis=3,
        )
    else:
        data = nangeomedian_pcm(xx_data, **kw)

    if set_nan is not None:
        data[set_nan, :] = np.nan

    if xx is None:
        return data

    dims = xx.dims[:-1]
    cc = {k: xx.coords[k] for k in dims}
    xx_out = xr.DataArray(data, dims=dims, coords=cc)

    if ds is None:
        xx_out.attrs.update(xx.attrs)
        return xx_out

    ds_out = xx_out.to_dataset(dim="band")
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
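
# Hedged usage sketch for xr_geomedian above: the simplest input is a plain
# (y, x, band, time) float array, reduced over the last (time) axis. hdstats
# must be installed for this to run; sizes and values here are arbitrary.
import numpy as np

yxbt = np.random.rand(16, 16, 3, 5).astype("float32")  # y, x, band, time
gm = xr_geomedian(yxbt)                                 # plain ndarray in, ndarray out
print(gm.shape)                                         # expected (16, 16, 3)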
Esempio n. 29
0
def reshape_yxbt(
    xx: xr.Dataset,
    name: str = "reshape_yxbt",
    yx_chunks: Union[int, Tuple[int, int]] = -1,
) -> xr.DataArray:
    """
    Reshape Dask-backed ``xr.Dataset[Time,Y,X]`` into
    ``xr.DataArray[Y,X,Band,Time]``. On the output DataArray there is
    exactly one chunk along both Time and Band dimensions.

    :param xx: Dataset with 3 dimensional bands, dimension order (time, y, x)

    :param name: Dask name of the output operation

    :param yx_chunks: If supplied, subdivide the YX chunks of the input into
                      smaller sections; this can only make YX chunks smaller,
                      not bigger. Every output chunk depends on exactly one
                      input chunk, so output chunks might not be regular: for
                      example, if input chunk sizes are 10 and yx_chunks=3,
                      you'll get chunks sized 3,3,3,1,3,3,3,1... (example only,
                      never use chunks that small)

    .. note::

       Chunks along first dimension ought to be of size 1 exactly (default for
       time dimension when using dc.load).
    """
    if isinstance(yx_chunks, int):
        yx_chunks = (yx_chunks, yx_chunks)

    if not is_dask_collection(xx):
        raise ValueError("Currently this code works only on Dask inputs")

    if not all(dv.data.numblocks[0] == dv.data.shape[0]
               for dv in xx.data_vars.values()):
        raise ValueError(
            "All input bands should have chunk=1 for the first dimension")

    name0 = name
    name = randomize(name)

    blocks, _ = _get_chunks_for_all_bands(xx)
    b0, *_ = xx.data_vars.values()

    attrs = dict(b0.attrs)
    nb = len(xx.data_vars.values())
    nt, ny, nx = b0.shape

    deps = [dv.data for dv in xx.data_vars.values()]
    shape = (ny, nx, nb, nt)
    dtype = b0.dtype
    dims = b0.dims[1:] + ("band", b0.dims[0])

    maxy, maxx = yx_chunks
    ychunks, xchunks = b0.data.chunks[1:3]
    _yy = list(_split_chunks(ychunks, maxy))
    _xx = list(_split_chunks(xchunks, maxx))
    ychunks = tuple(roi.stop - roi.start for _, _, roi in _yy)
    xchunks = tuple(roi.stop - roi.start for _, _, roi in _xx)

    chunks = [ychunks, xchunks, (nb, ), (nt, )]

    dsk = {}
    for iy, iy_src, y_roi in _yy:
        for ix, ix_src, x_roi in _xx:
            crop_yx = (y_roi, x_roi)
            _blocks = blocks[:, :, iy_src, ix_src].tolist()
            dsk[(name, iy, ix, 0, 0)] = (
                functools.partial(_reshape_yxbt_impl, crop_yx=crop_yx),
                _blocks,
            )

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps)
    data = da.Array(dsk, name, chunks=chunks, dtype=dtype, shape=shape)

    coords: Dict[Hashable, Any] = {k: c for k, c in xx.coords.items()}
    coords["band"] = list(xx.data_vars)

    return xr.DataArray(data=data,
                        dims=dims,
                        coords=coords,
                        name=name0,
                        attrs=attrs)
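
# A tiny illustration of the yx_chunks note in the docstring above: splitting
# existing chunk sizes into pieces no larger than max_sz can only shrink
# chunks, and a chunk of 10 split with max_sz=3 becomes 3, 3, 3, 1. This
# helper is a stand-in for illustration only, not the module's _split_chunks.
def _demo_split(chunks, max_sz):
    out = []
    for c in chunks:
        while c > max_sz:
            out.append(max_sz)
            c -= max_sz
        out.append(c)
    return tuple(out)


print(_demo_split((10, 10), 3))  # (3, 3, 3, 1, 3, 3, 3, 1)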
Esempio n. 30
0
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Mapping[str, Any] = None,
    template: Union[DataArray, Dataset] = None,
) -> T_DSorDA:
    """Apply a function to each block of a DataArray or Dataset.

    .. warning::
        This function is experimental and its signature may change.

    Parameters
    ----------
    func : callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter ``obj``. The function will receive a subset or 'block' of ``obj`` (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(subset_obj, *subset_args, **kwargs)``.

        This function must return either a single DataArray or a single Dataset.

        This function cannot add a new chunked dimension.
    obj : DataArray, Dataset
        Passed to the function as its first argument, one block at a time.
    args : sequence
        Passed to func after unpacking and subsetting any xarray objects by blocks.
        xarray objects in args must be aligned with obj, otherwise an error is raised.
    kwargs : mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        subset to blocks. Passing dask collections in kwargs is not allowed.
    template : DataArray or Dataset, optional
        xarray object representing the final result after compute is called. If not provided,
        the function will be first run on mocked-up data, that looks like ``obj`` but
        has sizes 0, to determine properties of the returned object such as dtype,
        variable names, attributes, new dimensions and new indexes (if any).
        ``template`` must be provided if the function changes the size of existing dimensions.
        When provided, ``attrs`` on variables in `template` are copied over to the result. Any
        ``attrs`` set by ``func`` will be ignored.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when ``func`` needs to manipulate a whole xarray object
    subset to each block. In the more common case where ``func`` can work on numpy arrays, it is
    recommended to use ``apply_ufunc``.

    If none of the variables in ``obj`` is backed by dask arrays, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks
    xarray.DataArray.map_blocks

    Examples
    --------
    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    ...
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"])
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)),
    ...     dims=["time"],
    ...     coords={"time": time, "month": month},
    ... ).chunk()
    >>> array.map_blocks(calculate_anomaly, template=array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> array.map_blocks(
    ...     calculate_anomaly,
    ...     kwargs={"groupby_type": "time.year"},
    ...     template=array,
    ... )  # doctest: +ELLIPSIS
    <xarray.DataArray (time: 24)>
    dask.array<calculate_anomaly-...-<this, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray>
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 dask.array<chunksize=(24,), meta=np.ndarray>
    """
    def _wrapper(
        func: Callable,
        args: List,
        kwargs: dict,
        arg_is_array: Iterable[bool],
        expected: dict,
    ):
        """
        Wrapper function that receives datasets in args; converts to dataarrays when necessary;
        passes these to the user function `func` and checks returned objects for expected shapes/sizes/etc.
        """

        converted_args = [
            dataset_to_dataarray(arg) if is_array else arg
            for is_array, arg in zip(arg_is_array, args)
        ]

        result = func(*converted_args, **kwargs)

        # check all dims are present
        missing_dimensions = set(expected["shapes"]) - set(result.sizes)
        if missing_dimensions:
            raise ValueError(
                f"Dimensions {missing_dimensions} missing on returned object.")

        # check that index lengths and values are as expected
        for name, index in result.indexes.items():
            if name in expected["shapes"]:
                if len(index) != expected["shapes"][name]:
                    raise ValueError(
                        f"Received dimension {name!r} of length {len(index)}. Expected length {expected['shapes'][name]}."
                    )
            if name in expected["indexes"]:
                expected_index = expected["indexes"][name]
                if not index.equals(expected_index):
                    raise ValueError(
                        f"Expected index {name!r} to be {expected_index!r}. Received {index!r} instead."
                    )

        # check that all expected variables were returned
        check_result_variables(result, expected, "coords")
        if isinstance(result, Dataset):
            check_result_variables(result, expected, "data_vars")

        return make_dict(result)

    if template is not None and not isinstance(template, (DataArray, Dataset)):
        raise TypeError(
            f"template must be a DataArray or Dataset. Received {type(template).__name__} instead."
        )
    if not isinstance(args, Sequence):
        raise TypeError(
            "args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in kwargs.values():
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in kwargs yet. Please compute or "
                "load values before passing to map_blocks.")

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    all_args = [obj] + list(args)
    is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args]
    is_array = [isinstance(arg, DataArray) for arg in all_args]

    # there should be a better way to group this. partition?
    xarray_indices, xarray_objs = unzip(
        (index, arg) for index, arg in enumerate(all_args) if is_xarray[index])
    others = [(index, arg) for index, arg in enumerate(all_args)
              if not is_xarray[index]]

    # all xarray objects must be aligned. This is consistent with apply_ufunc.
    aligned = align(*xarray_objs, join="exact")
    xarray_objs = tuple(
        dataarray_to_dataset(arg) if is_da else arg
        for is_da, arg in zip(is_array, aligned))

    _, npargs = unzip(
        sorted(list(zip(xarray_indices, xarray_objs)) + others,
               key=lambda x: x[0]))

    # check that chunk sizes are compatible
    input_chunks = dict(npargs[0].chunks)
    input_indexes = dict(npargs[0].indexes)
    for arg in xarray_objs[1:]:
        assert_chunks_compatible(npargs[0], arg)
        input_chunks.update(arg.chunks)
        input_indexes.update(arg.indexes)

    if template is None:
        # infer template by providing zero-shaped arrays
        template = infer_template(func, aligned[0], *args, **kwargs)
        template_indexes = set(template.indexes)
        preserved_indexes = template_indexes & set(input_indexes)
        new_indexes = template_indexes - set(input_indexes)
        indexes = {dim: input_indexes[dim] for dim in preserved_indexes}
        indexes.update({k: template.indexes[k] for k in new_indexes})
        output_chunks = {
            dim: input_chunks[dim]
            for dim in template.dims if dim in input_chunks
        }

    else:
        # template xarray object has been provided with proper sizes and chunk shapes
        indexes = dict(template.indexes)
        if isinstance(template, DataArray):
            output_chunks = dict(zip(template.dims,
                                     template.chunks))  # type: ignore
        else:
            output_chunks = dict(template.chunks)

    for dim in output_chunks:
        if dim in input_chunks and len(input_chunks[dim]) != len(
                output_chunks[dim]):
            raise ValueError(
                "map_blocks requires that one block of the input maps to one block of output. "
                f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. "
                f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or "
                "fix the provided template.")

    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}")

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any,
                                      Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(dask.utils.funcname(func),
                           dask.base.tokenize(npargs[0], args, kwargs))

    # map dims to list of chunk indexes
    ichunk = {
        dim: range(len(chunks_v))
        for dim, chunks_v in input_chunks.items()
    }
    # mapping from chunk index to slice bounds
    input_chunk_bounds = {
        dim: np.cumsum((0, ) + chunks_v)
        for dim, chunks_v in input_chunks.items()
    }
    output_chunk_bounds = {
        dim: np.cumsum((0, ) + chunks_v)
        for dim, chunks_v in output_chunks.items()
    }

    def subset_dataset_to_block(graph: dict, gname: str, dataset: Dataset,
                                input_chunk_bounds, chunk_index):
        """
        Creates a task that subsets an xarray dataset to a block determined by chunk_index.
        Block extents are determined by input_chunk_bounds.
        Also subtasks that subset the constituent variables of a dataset.
        """

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        chunk_tuple = tuple(chunk_index.values())
        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index[dim]]

                chunk_variable_task = (
                    f"{gname}-{name}-{chunk[0]}", ) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array possibly with dimensions chunked on other variables
                # index into variable appropriately
                subsetter = {
                    dim: _get_chunk_slicer(dim, chunk_index,
                                           input_chunk_bounds)
                    for dim in variable.dims
                }
                subset = variable.isel(subsetter)
                chunk_variable_task = ("{}-{}".format(
                    gname, dask.base.tokenize(subset)), ) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs)

    # iterate over all possible chunk combinations
    for chunk_tuple in itertools.product(*ichunk.values()):
        # mapping from dimension name to chunk index
        chunk_index = dict(zip(ichunk.keys(), chunk_tuple))

        blocked_args = [
            subset_dataset_to_block(graph, gname, arg, input_chunk_bounds,
                                    chunk_index) if isxr else arg
            for isxr, arg in zip(is_xarray, npargs)
        ]

        # expected["shapes", "coords", "data_vars", "indexes"] are used to
        # raise nice error messages in _wrapper
        expected = {}
        # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
        # even if length of dimension is changed by the applied function
        expected["shapes"] = {
            k: output_chunks[k][v]
            for k, v in chunk_index.items() if k in output_chunks
        }
        expected["data_vars"] = set(template.data_vars.keys())  # type: ignore
        expected["coords"] = set(template.coords.keys())  # type: ignore
        expected["indexes"] = {
            dim: indexes[dim][_get_chunk_slicer(dim, chunk_index,
                                                output_chunk_bounds)]
            for dim in indexes
        }

        from_wrapper = (gname, ) + chunk_tuple
        graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array,
                               expected)

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{gname}-{name}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l, )
            for dim in variable.dims:
                if dim in chunk_index:
                    key += (chunk_index[dim], )
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    # output can have new dimensions with exactly one chunk
                    key += (0, )

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(
        gname,
        graph,
        dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)],
    )

    for gname_l, layer in new_layers.items():
        # This adds in the getitems for each variable in the dataset.
        hlg.dependencies[gname_l] = {gname}
        hlg.layers[gname_l] = layer

    result = Dataset(coords=indexes, attrs=template.attrs)
    for index in result.indexes:
        result[index].attrs = template[index].attrs
        result[index].encoding = template[index].encoding

    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in output_chunks:
                var_chunks.append(output_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]), ))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim], ))

        data = dask.array.Array(hlg,
                                name=gname_l,
                                chunks=var_chunks,
                                dtype=template[name].dtype)
        result[name] = (dims, data, template[name].attrs)
        result[name].encoding = template[name].encoding

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore
    return result  # type: ignore
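
# Hedged sketch of when `template` matters for the map_blocks above: if the
# applied function changes the length of an existing dimension, the zero-size
# mock run cannot infer the output, so a template with the right sizes and
# chunks must be supplied (one input block still maps to one output block).
# Sizes here are arbitrary.
import numpy as np
import xarray as xr

arr = xr.DataArray(
    np.arange(24.0), dims="time", coords={"time": np.arange(24)}, name="a"
).chunk({"time": 24})


def every_second(da):
    return da.isel(time=slice(0, None, 2))  # halves the time dimension


template = arr.isel(time=slice(0, None, 2)).chunk({"time": 12})
out = xr.map_blocks(every_second, arr, template=template)
print(out.compute().sizes)  # expected: Frozen({'time': 12})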
Esempio n. 31
0
    def test_transform_dtypes(self, array):
        result = dpp.LabelEncoder().fit_transform(array)
        assert result.dtype == np.intp
        if dask.is_dask_collection(array):
            assert result.dtype == result.compute().dtype