Example #1
def _prepare_dask(result, riods, filename, chunks):
    """
    Prepare the data for dask computations
    """
    from dask.base import tokenize

    # augment the token with the file modification time
    try:
        mtime = os.path.getmtime(filename)
    except OSError:
        # the filename is probably an s3 bucket rather than a regular file
        mtime = None

    if chunks in (True, "auto"):
        import dask
        from dask.array.core import normalize_chunks

        if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
            msg = ("Automatic chunking requires dask.__version__ >= 0.18.0 . "
                   "You currently have version %s" % dask.__version__)
            raise NotImplementedError(msg)
        block_shape = (1, ) + riods.block_shapes[0]
        chunks = normalize_chunks(
            chunks=(1, "auto", "auto"),
            shape=(riods.count, riods.height, riods.width),
            dtype=riods.dtypes[0],
            previous_chunks=tuple((c, ) for c in block_shape),
        )
    token = tokenize(filename, mtime, chunks)
    name_prefix = "open_rasterio-%s" % token
    return result.chunk(chunks, name_prefix=name_prefix, token=token)
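For reference, a minimal sketch (raster shape and tile size invented here) of what the auto-chunking branch above asks for: passing the file's native block shape via previous_chunks lets normalize_chunks pick "auto" chunks that line up with the storage tiles.

from dask.array.core import normalize_chunks
import numpy as np

# Hypothetical 3-band raster stored in 256x256 tiles
block_shape = (1, 256, 256)
chunks = normalize_chunks(
    chunks=(1, "auto", "auto"),
    shape=(3, 10980, 10980),
    dtype=np.uint16,
    previous_chunks=tuple((c, ) for c in block_shape),
)
# The y/x chunk edges typically come out as multiples of the 256-pixel
# tiles, sized to fit dask's array.chunk-size configuration.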
Example #2
def precomputed_to_dask(store_path: str,
                        key: str,
                        chunks: Union[Sequence[int], str],
                        channel: int = 0):
    tsa = access_precomputed(store_path, key,
                             mode='r')[ts.d["channel"][channel]]
    shape = tuple(tsa.shape)
    dtype = tsa.dtype.numpy_dtype
    if chunks == "auto":
        chunks = tsa.spec().to_json()["scale_metadata"]["chunk_size"]
    _chunks = normalize_chunks(chunks, shape)

    def chunk_loader(store_path, key, block_info=None):
        idx = tuple(
            slice(*idcs) for idcs in block_info[None]["array-location"])
        tsa = access_precomputed(store_path, key,
                                 mode='r')[ts.d["channel"][channel]]
        result = tsa[idx].read().result()
        return result

    arr = map_blocks(chunk_loader,
                     store_path,
                     key,
                     chunks=_chunks,
                     dtype=dtype)
    return arr
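The chunk_loader above relies on dask passing each task the location of the output block it is producing via block_info; a self-contained toy version of the same pattern (a plain numpy array stands in for the tensorstore volume):

import dask.array as da
import numpy as np

source = np.arange(100).reshape(10, 10)  # stand-in for the remote volume

def loader(block_info=None):
    # block_info[None] describes the output block being produced
    idx = tuple(slice(*loc) for loc in block_info[None]["array-location"])
    return source[idx]

arr = da.map_blocks(loader, chunks=((5, 5), (5, 5)), dtype=source.dtype)
assert (arr.compute() == source).all()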
Example #3
def create_array(ds_group, column, schema, coordinate=False):
    codec = numcodecs.Pickle() if schema.dtype == np.object else None

    zchunks = zarr_chunks(column, schema.dims, schema.chunks)

    array = ds_group.require_dataset(column,
                                     schema.shape,
                                     chunks=zchunks,
                                     dtype=schema.dtype,
                                     object_codec=codec,
                                     exact=True)

    if zchunks is not None:
        # Expand zarr chunks to full dask resolution
        # For comparison purposes
        zchunks = normalize_chunks(array.chunks, schema.shape)

        if zchunks != schema.chunks:
            raise ValueError(
                f"zarr chunks {zchunks} "
                f"don't match dask chunks {schema.chunks}. "
                f"This can cause data corruption as described in "
                f"https://zarr.readthedocs.io/en/stable/tutorial.html"
                f"#parallel-computing-and-synchronization")

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(schema.type),
    }
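A short illustration (shapes invented) of the expansion in the comparison step above: zarr reports a single uniform chunk size per dimension, and normalize_chunks expands it to dask's explicit per-dimension tuples so it can be compared with schema.chunks.

from dask.array.core import normalize_chunks

zarr_chunks = (1000, 16)   # as reported by a zarr array's .chunks
shape = (2500, 16)
assert normalize_chunks(zarr_chunks, shape) == ((1000, 1000, 500), (16, ))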
Example #4
def tri(N, M=None, k=0, dtype=float, chunks="auto", *, like=None):
    if not _numpy_120 and like is not None:
        raise RuntimeError("The use of ``like`` required NumPy >= 1.20")

    _min_int = np.lib.twodim_base._min_int

    if M is None:
        M = N

    chunks = normalize_chunks(chunks, shape=(N, M), dtype=dtype)

    m = greater_equal(
        arange(N, chunks=chunks[0][0], dtype=_min_int(0, N),
               like=like).reshape(1, N).T,
        arange(-k,
               M - k,
               chunks=chunks[1][0],
               dtype=_min_int(-k, M - k),
               like=like),
    )

    # Avoid making a copy if the requested type is already bool
    m = m.astype(dtype, copy=False)

    return m
Example #5
    def _new_chunks(self, in_arr, rows_per_scan):
        """Determine a good scan-based chunk size."""
        if len(in_arr.shape) != 2:
            raise ValueError("Can only rechunk 2D arrays for EWA resampling.")
        if xr is not None and isinstance(in_arr, xr.DataArray):
            # get the dask or numpy array underneath
            in_arr = in_arr.data

        # assume (y, x)
        num_cols = in_arr.shape[1]
        prev_chunks = getattr(in_arr, 'chunks',
                              tuple((x, ) for x in in_arr.shape))
        num_row_chunks = prev_chunks[0][0]
        if num_row_chunks % rows_per_scan == 0:
            row_chunks = num_row_chunks
        else:
            row_chunks = 'auto'
        # what do dask's settings give us for full width chunks
        auto_chunks = normalize_chunks({
            0: row_chunks,
            1: num_cols
        },
                                       shape=in_arr.shape,
                                       dtype=in_arr.dtype,
                                       previous_chunks=prev_chunks)
        # let's make them scan-aligned
        chunk_rows = max(math.floor(auto_chunks[0][0] / rows_per_scan),
                         1) * rows_per_scan
        return {0: chunk_rows, 1: num_cols}
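The last two lines above round the "auto" row chunk down to a whole number of scans; a toy version of just that step (function name made up):

import math

def scan_aligned_rows(auto_rows: int, rows_per_scan: int) -> int:
    # round down to a multiple of rows_per_scan, but never below one scan
    return max(math.floor(auto_rows / rows_per_scan), 1) * rows_per_scan

assert scan_aligned_rows(1003, 10) == 1000
assert scan_aligned_rows(7, 10) == 10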
Example #6
def read(filename, shape, chunks):
    from dask.highlevelgraph import HighLevelGraph
    from dask.array.core import normalize_chunks, Array
    from itertools import product
    from ...tunable import delayed
    from numpy import prod, dtype
    import xmltodict

    records = scan_file(filename)
    records = {r["lime_type"]: r for r in records}

    data_record = records["ildg-binary-data"]
    data_offset = data_record["pos"]

    info = xmltodict.parse(records["ildg-format"]["data"])["ildgFormat"]
    dtype = dtype("complex%d" % (int(info["precision"]) * 2))

    assert data_record["data_length"] == prod(shape) * dtype.itemsize

    normal_chunks = normalize_chunks(chunks, shape=shape)
    chunks_id = list(product(*[range(len(bd)) for bd in normal_chunks]))

    reads = [
        delayed(read_chunk)(filename, shape, dtype, data_offset, chunks,
                            chunk_id) for chunk_id in chunks_id
    ]

    keys = [(filename, *chunk_id) for chunk_id in chunks_id]
    vals = [read.key for read in reads]
    dsk = dict(zip(keys, vals))

    graph = HighLevelGraph.from_collections(filename, dsk, dependencies=reads)

    return Array(graph, filename, normal_chunks, dtype=dtype)
Example #7
def _parse_wrap_args(func, args, kwargs, shape):
    if isinstance(shape, np.ndarray):
        shape = shape.tolist()

    if not isinstance(shape, (tuple, list)):
        shape = (shape, )

    name = kwargs.pop("name", None)
    chunks = kwargs.pop("chunks", "auto")

    dtype = kwargs.pop("dtype", None)
    if dtype is None:
        dtype = func(shape, *args, **kwargs).dtype
    dtype = np.dtype(dtype)

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    name = name or funcname(func) + "-" + tokenize(func, shape, chunks, dtype,
                                                   args, kwargs)

    return {
        "shape": shape,
        "dtype": dtype,
        "kwargs": kwargs,
        "chunks": chunks,
        "name": name,
    }
Example #8
def partition_chunking(partition, fragment_rows, chunks):
    partition_rows = sum(fragment_rows)

    if chunks is None:
        # Default to natural chunking determined from individual
        # parquet files in the dataset
        row_chunks = tuple(fragment_rows)
    else:
        try:
            partition_chunks = chunks[partition]
        except IndexError:
            partition_chunks = chunks[-1]

        # We only handle row chunking at present,
        # warn the user
        unhandled_dims = set(partition_chunks.keys()) - {"row"}

        if len(unhandled_dims) != 0:
            warnings.warn(
                f"{unhandled_dims} chunking dimensions are "
                f"currently ignored for arrow", UserWarning)

        # Get any user specified row chunking, defaulting to
        row_chunks = partition_chunks.get("row", fragment_rows)

        if isinstance(row_chunks, list):
            row_chunks = tuple(row_chunks)

        row_chunks = normalize_chunks(row_chunks, (partition_rows, ))[0]

    intervals = np.cumsum([0] + fragment_rows)
    chunk_intervals = np.cumsum((0, ) + row_chunks)
    ranges = defaultdict(list)
    it = zip(chunk_intervals, chunk_intervals[1:])

    for c, (lower, upper) in enumerate(it):

        si = np.searchsorted(intervals, lower, side='right') - 1
        ei = np.searchsorted(intervals, upper, side='left')

        if si == ei:
            raise ValueError("si == ei, arrays may have zero chunks")

        for s in range(si, ei):
            e = s + 1

            if lower <= intervals[s]:
                start = 0
            else:
                start = lower - intervals[s]

            if upper >= intervals[e]:
                end = intervals[e] - intervals[s]
            else:
                end = upper - intervals[s]

            ranges[c].append((s, (start, end)))

    return ranges
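A small check (row counts invented) of the row re-chunking above: an integer row chunk request is normalized against the partition's total rows, and the last chunk absorbs the remainder.

from dask.array.core import normalize_chunks

partition_rows = 25000
row_chunks = normalize_chunks(10000, (partition_rows, ))[0]
assert row_chunks == (10000, 10000, 5000)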
Example #9
def mrc_to_dask(urlpath: Pathlike, chunks: Union[str, Sequence[int]],
                **kwargs):
    """
    Generate a dask array backed by a memory-mapped .mrc file.
    """
    with access_mrc(urlpath, mode="r") as mem:
        shape, dtype = mrc_shape_dtype_inference(mem)

    if chunks == "auto":
        _chunks = normalize_chunks((1, *(-1, ) * (len(shape) - 1)),
                                   shape,
                                   dtype=dtype)
    else:
        _chunks = normalize_chunks(chunks, shape, dtype=dtype)

    arr = da.map_blocks(mrc_chunk_loader, urlpath, chunks=_chunks, dtype=dtype)
    return arr
Example #10
    def compute(self,
                data,
                cache_id=None,
                rows_per_scan=None,
                chunks=None,
                fill_value=None,
                weight_count=10000,
                weight_min=0.01,
                weight_distance_max=1.0,
                weight_delta_max=1.0,
                weight_sum_min=-1.0,
                maximum_weight_mode=None,
                **kwargs):
        """Resample the data according to the precomputed X/Y coordinates."""
        # not used in this step
        kwargs.pop("persist", None)
        data_in, xr_obj = self._get_input_tuples(data)
        rows_per_scan = self._get_rows_per_scan(rows_per_scan)
        data_in = tuple(self._convert_to_dask(data_in, rows_per_scan))
        out_chunks = normalize_chunks(chunks or 'auto',
                                      shape=self.target_geo_def.shape,
                                      dtype=data.dtype)
        fornav_kwargs = kwargs.copy()
        maximum_weight_mode = self._handle_mwm(data, maximum_weight_mode)
        fornav_kwargs.update(
            dict(
                weight_count=weight_count,
                weight_min=weight_min,
                weight_distance_max=weight_distance_max,
                weight_delta_max=weight_delta_max,
                weight_sum_min=weight_sum_min,
                maximum_weight_mode=maximum_weight_mode,
                rows_per_scan=rows_per_scan,
            ))

        # determine a fill value if they didn't tell us what they have as a
        # fill value in the numpy arrays
        if fill_value is None:
            fill_value = self._get_default_fill(data_in[0])

        data_out = []
        for data_subarr in data_in:
            res = self._run_fornav_single(data_subarr, out_chunks,
                                          self.target_geo_def, fill_value,
                                          **fornav_kwargs)
            data_out.append(res)
        if data.ndim == 2:
            out = data_out[0]
        else:
            out = da.concatenate([arr[None, ...] for arr in data_out], axis=0)

        if xr_obj is not None:
            dims = [d for d in xr_obj.dims if d not in ('y', 'x')] + ['y', 'x']
            out = xr.DataArray(out, attrs=xr_obj.attrs.copy(), dims=dims)
            out = update_resampled_coords(xr_obj, out, self.target_geo_def)
        if isinstance(data, np.ndarray):
            return out.compute()
        return out
Example #11
def test_rfftfreq(n, d, c):
    c = [ci for ci in c(n) if ci != 0]

    r1 = np.fft.rfftfreq(n, d)
    r2 = da.fft.rfftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
Example #12
def test_fftfreq(n, d, c):
    c = c(n)

    r1 = np.fft.fftfreq(n, d)
    r2 = da.fft.fftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
Example #13
def test_rfftfreq(n, d, c):
    c = [ci for ci in c(n) if ci != 0]

    r1 = np.fft.rfftfreq(n, d)
    r2 = da.fft.rfftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
Example #14
def test_fftfreq(n, d, c):
    c = c(n)

    r1 = np.fft.fftfreq(n, d)
    r2 = da.fft.fftfreq(n, d, chunks=c)

    assert normalize_chunks(c, r2.shape) == r2.chunks

    assert_eq(r1, r2)
Example #15
def test_dataset(ms, select_cols, group_cols, index_cols, shapes, chunks):
    """ Test dataset creation """
    datasets = read_datasets(ms,
                             select_cols,
                             group_cols,
                             index_cols,
                             chunks=chunks)
    # (1) Read-only TableProxy
    # (2) Read-only TAQL TableProxy
    assert_liveness(2, 1)

    chans = shapes['chan']
    corrs = shapes['corr']

    # Expected output chunks
    echunks = {
        'chan': normalize_chunks(chunks.get('chan', chans),
                                 shape=(chans, ))[0],
        'corr': normalize_chunks(chunks.get('corr', corrs), shape=(corrs, ))[0]
    }

    for ds in datasets:
        compute_dict = {}

        for k, v in ds.data_vars.items():
            compute_dict[k] = v.data
            assert v.dtype == v.data.dtype

        res = dask.compute(compute_dict)[0]

        assert res['DATA'].shape[1:] == (chans, corrs)
        assert 'STATE_ID' in res
        assert 'TIME' in res

        chunks = ds.chunks
        assert chunks["chan"] == echunks['chan']
        assert chunks["corr"] == echunks['corr']

        dims = ds.dims
        dims.pop('row')  # row changes
        assert dims == {"chan": shapes['chan'], "corr": shapes['corr']}

    del ds, datasets, compute_dict, v
    assert_liveness(0, 0)
Example #16
def get_group_chunks(group):

    group_chunks = {}

    for array in group.values():
        array_chunks = normalize_chunks(array.chunks, array.shape)
        array_dims = decode_attr(array.attrs[DASKMS_ATTR_KEY])["dims"]
        group_chunks.update(dict(zip(array_dims, array_chunks)))

    return group_chunks
Example #17
def indices(dimensions, dtype=int, chunks="auto"):
    """
    Implements NumPy's ``indices`` for Dask Arrays.

    Generates a grid of indices covering the dimensions provided.

    The final array has the shape ``(len(dimensions), *dimensions)``. The
    chunks are used to specify the chunking for axis 1 up to
    ``len(dimensions)``. The 0th axis always has chunks of length 1.

    Parameters
    ----------
    dimensions : sequence of ints
        The shape of the index grid.
    dtype : dtype, optional
        Type to use for the array. Default is ``int``.
    chunks : sequence of ints, str
        The size of each block.  Must be one of the following forms:

        - A blocksize like (500, 1000)
        - A size in bytes, like "100 MiB" which will choose a uniform
          block-like shape
        - The word "auto" which acts like the above, but uses a configuration
          value ``array.chunk-size`` for the chunk size

        Note that the last block will have fewer samples if ``len(array) % chunks != 0``.

    Returns
    -------
    grid : dask array
    """
    dimensions = tuple(dimensions)
    dtype = np.dtype(dtype)
    chunks = normalize_chunks(chunks, shape=dimensions, dtype=dtype)

    if len(dimensions) != len(chunks):
        raise ValueError("Need same number of chunks as dimensions.")

    xi = []
    for i in range(len(dimensions)):
        xi.append(arange(dimensions[i], dtype=dtype, chunks=(chunks[i], )))

    grid = []
    if all(dimensions):
        grid = meshgrid(*xi, indexing="ij")

    if grid:
        grid = stack(grid)
    else:
        grid = empty((len(dimensions), ) + dimensions,
                     dtype=dtype,
                     chunks=(1, ) + chunks)

    return grid
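The chunk forms listed in the docstring all pass straight through normalize_chunks; a quick sketch with arbitrary shapes (byte-size strings are resolved against dask's chunk-size machinery):

from dask.array.core import normalize_chunks
import numpy as np

print(normalize_chunks((500, 1000), shape=(1000, 2000)))  # explicit blocksize
print(normalize_chunks("100 MiB", shape=(10000, 10000), dtype=np.float64))  # byte budget
print(normalize_chunks("auto", shape=(10000, 10000), dtype=np.float64))  # array.chunk-size config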
Example #18
def test_indices_dimensions_chunks():
    chunks = ((1, 4, 2, 3), (5, 5))
    darr = da.indices((10, 10), chunks=chunks)
    assert darr.chunks == ((1, 1), ) + chunks

    with dask.config.set({"array.chunk-size": "50 MiB"}):
        shape = (10000, 10000)
        expected = normalize_chunks("auto", shape=shape, dtype=int)
        result = da.indices(shape, chunks="auto")
        # indices prepends a dimension
        actual = result.chunks[1:]
        assert expected == actual
Example #19
def fromfunction(func, chunks="auto", shape=None, dtype=None, **kwargs):
    dtype = dtype or float
    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    inds = tuple(range(len(shape)))

    arrs = [arange(s, dtype=dtype, chunks=c) for s, c in zip(shape, chunks)]
    arrs = meshgrid(*arrs, indexing="ij")

    args = sum(zip(arrs, itertools.repeat(inds)), ())

    res = blockwise(func, inds, *args, token="fromfunction", **kwargs)

    return res
Example #20
def linspace(start, stop, num=50, chunks=None, dtype=None, endpoint=True):
    """
    Return `num` evenly spaced values over the closed interval [`start`,
    `stop`].

    TODO: implement the `endpoint`, `retstep`, and `dtype` keyword args

    Parameters
    ----------
    start : scalar
        The starting value of the sequence.
    stop : scalar
        The last value of the sequence.
    num : int, optional
        Number of samples to include in the returned dask array, including the
        endpoints.
    chunks :  int
        The number of samples on each block. Note that the last block will have
        fewer samples if `num % blocksize != 0`

    Returns
    -------
    samples : dask array

    """
    num = int(num)
    if not endpoint:
        num = num + 1
    if chunks is None:
        raise ValueError("Must supply a chunks= keyword argument")

    chunks = normalize_chunks(chunks, (num, ))

    range_ = stop - start

    space = float(range_) / (num - 1)

    name = 'linspace-' + tokenize((start, stop, num, chunks, dtype, endpoint))

    dsk = {}
    blockstart = start

    for i, bs in enumerate(chunks[0]):
        blockstop = blockstart + ((bs - 1) * space)
        task = (partial(np.linspace, dtype=dtype), blockstart, blockstop, bs)
        blockstart = blockstart + (space * bs)
        dsk[(name, i)] = task

    return Array(dsk, name, chunks, dtype=dtype)
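A small check (arbitrary numbers) of the chunking rule described in the docstring: the last block picks up the remainder when num % blocksize != 0.

from dask.array.core import normalize_chunks

assert normalize_chunks(30, (100, )) == ((30, 30, 30, 10), )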
Example #21
def mrc_to_dask(fname: Pathlike, chunks: tuple):
    """
    Generate a dask array backed by a memory-mapped .mrc file
    """
    with access_mrc(fname, mode="r") as mem:
        shape, dtype = mrc_shape_dtype_inference(mem)

    chunks_ = normalize_chunks(chunks, shape)

    def chunk_loader(fname, block_info=None):
        idx = tuple(slice(*idcs) for idcs in block_info[None]["array-location"])
        result = np.array(access_mrc(fname, mode="r").data[idx]).astype(dtype)
        return result

    arr = da.map_blocks(chunk_loader, fname, chunks=chunks_, dtype=dtype)

    return arr
Example #22
def row_ordering(taql_proxy, index_cols, chunks):
    nrows = taql_proxy.nrows().result()
    chunks = normalize_chunks(chunks['row'], shape=(nrows, ))
    token = dask.base.tokenize(taql_proxy, index_cols, chunks, nrows)
    name = 'rows-' + token
    layers = {}
    start = 0

    for i, c in enumerate(chunks[0]):
        layers[(name, i)] = (_sorted_rows, taql_proxy, start, c)
        start += c

    graph = HighLevelGraph.from_collections(name, layers, [])
    rows = da.Array(graph, name, chunks=chunks, dtype=np.object)
    row_runs = rows.map_blocks(row_run_factory,
                               sort_dir="read",
                               dtype=np.object)

    return rows, row_runs
Example #23
def _group_ordering_arrays(taql_proxy, index_cols, group, group_nrows,
                           group_row_chunks):
    """
    Returns
    -------
    sorted_rows : :class:`dask.array.Array`
        Sorted table rows chunked on ``group_row_chunks``.
    row_runs : :class:`dask.array.Array`.
        Array containing (row_run, resort) tuples.
        Should not be directly computed.
        Chunked on ``group_row_chunks``.
    """
    token = dask.base.tokenize(taql_proxy, group, group_nrows)
    name = 'group-rows-' + token
    chunks = ((group_nrows, ), )
    layers = {(name, 0): (_sorted_group_rows, taql_proxy, group, index_cols)}

    graph = HighLevelGraph.from_collections(name, layers, [])
    group_rows = da.Array(graph, name, chunks, dtype=np.int32)
    group_rows = cached_array(group_rows)

    try:
        shape = (group_nrows, )
        group_row_chunks = normalize_chunks(group_row_chunks, shape=shape)
    except ValueError as e:
        raise GroupChunkingError("%s\n"
                                 "Unable to match chunks '%s' "
                                 "with shape '%s' for group '%d'. "
                                 "This can occur if too few chunk "
                                 "dictionaries have been supplied for "
                                 "the number of groups "
                                 "and an earlier group's chunking strategy "
                                 "is applied to a later one." %
                                 (str(e), group_row_chunks, shape, group))

    group_rows = group_rows.rechunk(group_row_chunks)
    row_runs = group_rows.map_blocks(row_run_factory,
                                     sort_dir="read",
                                     dtype=np.object)

    row_runs = cached_array(row_runs)

    return group_rows, row_runs
Example #24
def sobol(size: Union[int, Tuple[int, int]],
          d0: int = 0,
          chunks: Chunks2D = None) -> Union[np.ndarray, da.Array]:
    """Sobol points generator based on Gray code order

    :param size:
        number of samples (cannot be greater than :math:`2^{32}`) to extract
        from a single dimension, or tuple (samples, dimensions).
        To guarantee uniform distribution, the number of samples should
        always be :math:`2^{n} - 1`.
    :param int d0:
        first dimension. This can be used as a functional equivalent of a
        random seed. dimensions + d0 can't be greater than
        :func:`max_dimensions()` - 1.
    :param chunks:
        If None, return a numpy array.

        If set, return a dask array with the given chunk size.
        It can be anything accepted by dask (a positive integer, a
        tuple of two ints, or a tuple of two tuples of ints) for the output
        shape (see result below). e.g. either ``(16384, 50)`` or
        ``((16384, 16383),  (50, 50, 50))`` could be used together with
        ``size=(32767, 150)``.

        .. note::
           The algorithm is not efficient if there are multiple chunks on axis
           0. However, if you do need them, it is typically better to require
           them here than re-chunking afterwards, particularly if (most of) the
           subsequent algorithm is embarrassingly parallel.
    :returns:
        If size is an int, a 1-dimensional array of samples.
        If size is a tuple, a 2-dimensional array POINTS, where
        ``POINTS[i, j]`` is the ith sample of the jth dimension.
        Each dimension is a uniform (0, 1) distribution.
    :rtype:
        If chunks is not None, :class:`dask.array.Array`; else
        :class:`numpy.ndarray`
    """
    if isinstance(size, int):
        samples = size
        dimensions = 1
    else:
        samples, dimensions = size

    if not 0 < samples < 2**32:
        raise ValueError("samples must be between 1 and 2^32")
    if not 0 < dimensions + d0 <= max_dimensions():
        raise ValueError("(dimensions + d0) must be between 1 and %d" %
                         max_dimensions())

    if chunks is None:
        res = _sobol_kernel(samples, dimensions, 0, d0)
        if isinstance(size, int):
            res = res[:, 0]
        return res

    # dask-specific code
    chunks = cast(NormalizedChunks2D,
                  normalize_chunks(chunks, shape=(samples, dimensions)))
    name = "sobol-%d-%d-%d" % (samples, dimensions, d0)
    dsk = {}

    offset_i = 0
    for i, size_i in enumerate(chunks[0]):
        offset_j = 0
        for j, size_j in enumerate(chunks[1]):
            dsk[name, i,
                j] = (_sobol_kernel, size_i, size_j, offset_i, d0 + offset_j)
            offset_j += size_j
        offset_i += size_i

    res = da.Array(dsk, name=name, dtype=float, chunks=chunks)
    if isinstance(size, int):
        res = res[:, 0]
    return res
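The two chunk specs mentioned in the docstring normalize to the same thing; a quick check with the sizes used there:

from dask.array.core import normalize_chunks

assert normalize_chunks((16384, 50), shape=(32767, 150)) == (
    (16384, 16383),
    (50, 50, 50),
)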
Example #25
def open_rasterio(
    filename,
    parse_coordinates=None,
    chunks=None,
    cache=None,
    lock=None,
    masked=False,
    **open_kwargs
):
    """Open a file with rasterio (experimental).

    This should work with any file that rasterio can open (most often:
    geoTIFF). The x and y coordinates are generated automatically from the
    file's geoinformation, shifted to the center of each pixel (see
    `"PixelIsArea" Raster Space
    <http://web.archive.org/web/20160326194152/http://remotesensing.org/geotiff/spec/geotiff2.5.html#2.5.2>`_
    for more information).

    You can generate 2D coordinates from the file's attributes with::

        from affine import Affine
        da = xr.open_rasterio('path_to_file.tif')
        transform = Affine.from_gdal(*da.attrs['transform'])
        nx, ny = da.sizes['x'], da.sizes['y']
        x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform


    Parameters
    ----------
    filename : str, rasterio.DatasetReader, or rasterio.WarpedVRT
        Path to the file to open. Or already open rasterio dataset.
    parse_coordinates : bool, optional
        Whether to parse the x and y coordinates out of the file's
        ``transform`` attribute or not. The default is to automatically
        parse the coordinates only if they are rectilinear (1D).
        It can be useful to set ``parse_coordinates=False``
        if your files are very large or if you don't need the coordinates.
    chunks : int, tuple or dict, optional
        Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
        ``{'x': 5, 'y': 5}``. If chunks is provided, it is used to load the new
        DataArray into a dask array. Chunks can also be set to
        ``True`` or ``"auto"`` to choose sensible chunk sizes according to
        ``dask.config.get("array.chunk-size").
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a global lock is
        used to avoid issues with concurrent access to the same file when using
        dask's multithreaded backend.
    masked : bool, optional
        If True, read the mask and set values to NaN. Defaults to False.
    **open_kwargs: kwargs, optional
        Optional keyword arguments to pass into rasterio.open().

    Returns
    -------
    data : DataArray
        The newly created DataArray.
    """
    parse_coordinates = True if parse_coordinates is None else parse_coordinates

    import rasterio
    from rasterio.vrt import WarpedVRT

    vrt_params = None
    if isinstance(filename, rasterio.io.DatasetReader):
        filename = filename.name
    elif isinstance(filename, rasterio.vrt.WarpedVRT):
        vrt = filename
        filename = vrt.src_dataset.name
        vrt_params = dict(
            crs=vrt.crs.to_string(),
            resampling=vrt.resampling,
            src_nodata=vrt.src_nodata,
            dst_nodata=vrt.dst_nodata,
            tolerance=vrt.tolerance,
            transform=vrt.transform,
            width=vrt.width,
            height=vrt.height,
            warp_extras=vrt.warp_extras,
        )

    if lock is None:
        lock = RASTERIO_LOCK

    # ensure default for sharing is False
    # ref https://github.com/mapbox/rasterio/issues/1504
    open_kwargs["sharing"] = open_kwargs.get("sharing", False)
    manager = CachingFileManager(
        rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs
    )
    riods = manager.acquire()

    # open the subdatasets if they exist
    if riods.subdatasets:
        data_arrays = {}
        for iii, subdataset in enumerate(riods.subdatasets):
            rioda = open_rasterio(
                subdataset,
                parse_coordinates=iii == 0 and parse_coordinates,
                chunks=chunks,
                cache=cache,
                lock=lock,
                masked=masked,
            )
            data_arrays[rioda.name] = rioda
        return Dataset(data_arrays)

    if vrt_params is not None:
        riods = WarpedVRT(riods, **vrt_params)

    if cache is None:
        cache = chunks is None

    coords = OrderedDict()

    # Get bands
    if riods.count < 1:
        raise ValueError("Unknown dims")
    coords["band"] = np.asarray(riods.indexes)

    # Get coordinates
    if LooseVersion(rasterio.__version__) < LooseVersion("1.0"):
        transform = riods.affine
    else:
        transform = riods.transform

    if transform.is_rectilinear and parse_coordinates:
        # 1d coordinates
        coords.update(affine_to_coords(riods.transform, riods.width, riods.height))
    elif parse_coordinates:
        # 2d coordinates
        warnings.warn(
            "The file coordinates' transformation isn't "
            "rectilinear: xarray won't parse the coordinates "
            "in this case. Set `parse_coordinates=False` to "
            "suppress this warning.",
            RuntimeWarning,
            stacklevel=3,
        )

    # Attributes
    attrs = _parse_tags(riods.tags(1))
    encoding = dict()
    # Affine transformation matrix (always available)
    # This describes coefficients mapping pixel coordinates to CRS
    # For serialization store as tuple of 6 floats, the last row being
    # always (0, 0, 1) per definition (see
    # https://github.com/sgillies/affine)
    attrs["transform"] = tuple(transform)[:6]
    if hasattr(riods, "nodata") and riods.nodata is not None:
        # The nodata values for the raster bands
        if masked:
            encoding["_FillValue"] = riods.nodata
        else:
            attrs["_FillValue"] = riods.nodata
    if hasattr(riods, "scales"):
        # The scale values for the raster bands
        attrs["scales"] = riods.scales
    if hasattr(riods, "offsets"):
        # The offset values for the raster bands
        attrs["offsets"] = riods.offsets
    if hasattr(riods, "descriptions") and any(riods.descriptions):
        # Descriptions for each dataset band
        attrs["descriptions"] = riods.descriptions
    if hasattr(riods, "units") and any(riods.units):
        # A list of units string for each dataset band
        attrs["units"] = riods.units

    # Parse extra metadata from tags, if supported
    parsers = {"ENVI": _parse_envi}

    driver = riods.driver
    if driver in parsers:
        meta = parsers[driver](riods.tags(ns=driver))

        for k, v in meta.items():
            # Add values as coordinates if they match the band count,
            # as attributes otherwise
            if isinstance(v, (list, np.ndarray)) and len(v) == riods.count:
                coords[k] = ("band", np.asarray(v))
            else:
                attrs[k] = v

    data = indexing.LazilyOuterIndexedArray(
        RasterioArrayWrapper(manager, lock, vrt_params, masked=masked)
    )

    # this lets you write arrays loaded with rasterio
    data = indexing.CopyOnWriteArray(data)
    if cache and chunks is None:
        data = indexing.MemoryCachedArray(data)

    da_name = attrs.pop("NETCDF_VARNAME", None)
    result = DataArray(
        data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs, name=da_name
    )
    result.encoding = encoding

    if hasattr(riods, "crs") and riods.crs:
        result.rio.write_crs(riods.crs, inplace=True)

    if chunks is not None:
        from dask.base import tokenize

        # augment the token with the file modification time
        try:
            mtime = os.path.getmtime(filename)
        except OSError:
            # the filename is probably an s3 bucket rather than a regular file
            mtime = None

        if chunks in (True, "auto"):
            from dask.array.core import normalize_chunks
            import dask

            if LooseVersion(dask.__version__) < LooseVersion("0.18.0"):
                msg = (
                    "Automatic chunking requires dask.__version__ >= 0.18.0 . "
                    "You currently have version %s" % dask.__version__
                )
                raise NotImplementedError(msg)
            block_shape = (1,) + riods.block_shapes[0]
            chunks = normalize_chunks(
                chunks=(1, "auto", "auto"),
                shape=(riods.count, riods.height, riods.width),
                dtype=riods.dtypes[0],
                previous_chunks=tuple((c,) for c in block_shape),
            )
        token = tokenize(filename, mtime, chunks)
        name_prefix = "open_rasterio-%s" % token
        result = result.chunk(chunks, name_prefix=name_prefix, token=token)

    # Make the file closeable
    result._file_obj = manager

    return result
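A hypothetical usage sketch following the docstring's own example (file path made up): passing chunks makes the returned DataArray dask-backed, with the chunk sizes run through the normalization shown above.

import xarray as xr

rds = xr.open_rasterio("path_to_file.tif", chunks={"x": 512, "y": 512})
print(rds.chunks)  # per-dimension dask chunking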
Example #26
        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
            dependencies = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a)
                a = a.rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dependencies.append(a)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dependencies.append(p)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size, dtype=np.float64)
            if not replace and len(chunks[0]) > 1:
                err_msg = ("replace=False is not currently supported for "
                           "dask.array.choice with multi-chunk output "
                           "arrays")
                raise NotImplementedError(err_msg)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = "da.random.choice-%s" % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=dependencies)
            return Array(graph, name, chunks, dtype=dtype)
Example #27
def groupby_agg(
    array: dask.array.Array,
    by: dask.array.Array,
    agg: Aggregation,
    expected_groups: Optional[Union[Sequence, np.ndarray]],
    axis: Optional[Sequence] = None,
    split_out: int = 1,
    fill_value: Any = None,
) -> Tuple[dask.array.Array, Union[np.ndarray, dask.array.Array]]:

    # I think _tree_reduce expects this
    assert isinstance(axis, Sequence)
    assert all(ax >= 0 for ax in axis)

    inds = tuple(range(array.ndim))
    name = f"groupby_{agg.name}"
    token = dask.base.tokenize(array, by, agg, expected_groups, axis,
                               split_out)

    # This is necessary for argreductions.
    # We need to rechunk before zipping up with the index
    # let's always do it anyway
    _, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim:])

    # preprocess the array
    if agg.preprocess:
        array = agg.preprocess(array, axis=axis)

    # apply reduction on chunk
    applied = dask.array.blockwise(
        partial(
            _get_chunk_reduction(agg.reduction_type),
            func=agg.chunk,  # type: ignore
            axis=axis,
            # with the current implementation we want reindexing at the blockwise step
            # only reindex to groups present at combine stage
            expected_groups=expected_groups if split_out > 1 else None,
            fill_value=agg.fill_value,
        ),
        inds,
        array,
        inds,
        by,
        inds[-by.ndim:],
        concatenate=False,
        dtype=array.dtype,
        meta=array._meta,
        align_arrays=False,
        token=f"{name}-chunk-{token}",
    )

    if split_out > 1:
        if expected_groups is None:
            # This could be implemented using the "hash_split" strategy
            # from dask.dataframe
            raise NotImplementedError
        chunk_tuples = tuple(
            itertools.product(*tuple(range(n) for n in applied.numblocks)))
        ngroups = len(expected_groups)
        group_chunks = normalize_chunks(np.ceil(ngroups / split_out),
                                        (ngroups, ))[0]
        idx = tuple(np.cumsum((0, ) + group_chunks))

        # split each block into `split_out` chunks
        dsk = {}
        split_name = f"{name}-split-{token}"
        for i in chunk_tuples:
            for j in range(split_out):
                dsk[(split_name, *i, j)] = (
                    _split_groups,
                    (applied.name, *i),
                    j,
                    slice(idx[j], idx[j + 1]),
                )

        # now construct an array that can be passed to _tree_reduce
        intergraph = HighLevelGraph.from_collections(split_name,
                                                     dsk,
                                                     dependencies=(applied, ))
        intermediate = dask.array.Array(
            intergraph,
            name=split_name,
            chunks=applied.chunks + ((1, ) * split_out, ),
            meta=array._meta,
        )
        expected_agg = None

    else:
        intermediate = applied
        group_chunks = (len(expected_groups),
                        ) if expected_groups is not None else (np.nan, )
        expected_agg = expected_groups

    # reduced is really a dict mapping reduction name to array
    # and "groups" to an array of group labels
    # Note: it does not make sense to interpret axis relative to
    # shape of intermediate results after the blockwise call
    reduced = dask.array.reductions._tree_reduce(
        intermediate,
        aggregate=partial(
            _npg_aggregate,
            agg=agg,
            expected_groups=expected_agg,
            group_ndim=by.ndim,
            fill_value=fill_value,
        ),
        combine=partial(_npg_combine, agg=agg, group_ndim=by.ndim),
        name=f"{name}-reduce",
        dtype=array.dtype,
        axis=axis,
        keepdims=True,
        concatenate=False,
    )

    output_chunks = reduced.chunks[:-(len(axis) + int(split_out > 1))] + (
        group_chunks, )

    def _getitem(d, key1, key2):
        return d[key1][key2]

    # extract results from the dict
    result: Dict = {}
    layer: Dict[Tuple, Tuple] = {}
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)
    if expected_groups is None:
        groups_name = f"groups-{name}-{token}"
        # we've used keepdims=True, so _tree_reduce preserves some dummy dimensions
        first_block = len(ochunks) * (0, )
        layer[(groups_name, *first_block)] = (
            operator.getitem,
            (reduced.name, *first_block),
            "groups",
        )
        groups = (dask.array.Array(
            HighLevelGraph.from_collections(groups_name,
                                            layer,
                                            dependencies=[reduced]),
            groups_name,
            chunks=(group_chunks, ),
            dtype=by.dtype,
        ), )
    else:
        groups = (expected_groups, )

    layer: Dict[Tuple, Tuple] = {}  # type: ignore
    agg_name = f"{name}-{token}"
    for ochunk in itertools.product(*ochunks):
        inchunk = ochunk[:-1] + (0, ) * (len(axis)) + (ochunk[-1], ) * int(
            split_out > 1)
        layer[(agg_name, *ochunk)] = (
            operator.getitem,
            (reduced.name, *inchunk),
            agg.name,
        )
    result = dask.array.Array(
        HighLevelGraph.from_collections(agg_name,
                                        layer,
                                        dependencies=[reduced]),
        agg_name,
        chunks=output_chunks,
        dtype=agg.dtype if agg.dtype else array.dtype,
    )

    return (result, *groups)
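A toy illustration (numbers invented) of how group_chunks is formed in the split_out > 1 branch above: 10 expected groups split three ways.

import math
from dask.array.core import normalize_chunks

ngroups, split_out = 10, 3
group_chunks = normalize_chunks(math.ceil(ngroups / split_out), (ngroups, ))[0]
assert group_chunks == (4, 4, 2)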
Example #28
    def _wrap(self, func, *args, **kwargs):
        """ Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        size = kwargs.pop('size', None)
        chunks = kwargs.pop('chunks')
        extra_chunks = kwargs.pop('extra_chunks', ())

        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        args_shapes = {
            ar.shape
            for ar in args if isinstance(ar, (Array, np.ndarray))
        }
        # set.union() returns a new set, so use |= to actually keep the
        # shapes of any array-valued keyword arguments
        args_shapes |= {
            ar.shape
            for ar in kwargs.values() if isinstance(ar, (Array, np.ndarray))
        }

        shapes = list(args_shapes)
        if size is not None:
            shapes += [size]
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(chunks, size)
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        dsks = []
        lookup = {}
        small_args = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        # Get dtype
        small_kwargs['size'] = (0, )
        dtype = func(xoroshiro128plus.RandomState(), *small_args,
                     **small_kwargs).dtype

        sizes = list(product(*chunks))
        state_data = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(state_data, size, chunks, args, kwargs)
        name = 'da.random.{0}-{1}'.format(func.__name__, token)

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])
        vals = []
        for state, size, slc, block in zip(state_data, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, func.__name__, state, size, arg, kwrg))
        dsk.update(dict(zip(keys, vals)))
        dsk = sharedict.merge((name, dsk), *dsks)
        return Array(dsk, name, chunks + extra_chunks, dtype=dtype)
Example #29
        def choice(self, a, size=None, replace=True, p=None, chunks=None):
            dsks = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a).rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dsks.append(a.dask)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dsks.append(p.dask)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = 'da.random.choice-%s' % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            return Array(sharedict.merge((name, dsk), *dsks),
                         name,
                         chunks,
                         dtype=dtype)
Example #30
    def _wrap(self,
              funcname,
              *args,
              size=None,
              chunks="auto",
              extra_chunks=(),
              **kwargs):
        """Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        shapes = list({
            ar.shape
            for ar in chain(args, kwargs.values())
            if isinstance(ar, (Array, np.ndarray))
        })
        if size is not None:
            shapes.append(size)
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(
            chunks,
            size,  # ideally would use dtype here
            dtype=kwargs.get("dtype", np.float64),
        )
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        lookup = {}
        small_args = []
        dependencies = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        sizes = list(product(*chunks))
        seeds = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(seeds, size, chunks, args, kwargs)
        name = f"{funcname}-{token}"

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])

        vals = []
        for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, self._RandomState, funcname, seed,
                         size, arg, kwrg))

        meta = _apply_random(
            self._RandomState,
            funcname,
            seed,
            (0, ) * len(size),
            small_args,
            small_kwargs,
        )

        dsk.update(dict(zip(keys, vals)))

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=dependencies)
        return Array(graph, name, chunks + extra_chunks, meta=meta)
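A small sketch of the chunks-to-slices step used above; slices_from_chunks lives alongside normalize_chunks in dask.array.core (toy chunks here):

from dask.array.core import normalize_chunks, slices_from_chunks

chunks = normalize_chunks((5, 3), shape=(10, 3))
assert chunks == ((5, 5), (3, ))
assert slices_from_chunks(chunks) == [
    (slice(0, 5), slice(0, 3)),
    (slice(5, 10), slice(0, 3)),
]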
Example #31
    def chunks(self):
        return normalize_chunks(CHUNKS, shape=self.shape)
Example #32
def _copula_impl(
    cov: Union[List[List[float]], np.ndarray],
    df: Union[None, int, List[int], np.ndarray],
    samples: int,
    seed: int,
    chunks: Chunks2D,
    rng: str,
) -> Union[np.ndarray, da.Array]:
    """Implementation of gaussian_copula and t_copula
    """
    samples = int(samples)
    if samples <= 0:
        raise ValueError("Number of samples must be positive")
    cov = np.asarray(cov)
    if cov.ndim != 2 or cov.shape[0] != cov.shape[1]:
        raise ValueError("cov must be a square matrix")
    dimensions = cov.shape[0]

    L = numpy.linalg.cholesky(cov)
    if chunks is not None:
        chunks = cast(
            NormalizedChunks2D, normalize_chunks(chunks, shape=(samples, dimensions))
        )
        L = da.from_array(L, chunks=(chunks[1], chunks[1]))

    rng = rng.lower()
    if rng == "mersenne twister":
        # When pulling samples from the Mersenne Twister generator, we have
        # the samples on the rows. This guarantees that if we draw more
        # samples, the original samples won't change.
        rnd_state_y = duck.RandomState(seed)
        y = rnd_state_y.standard_normal(size=(samples, dimensions), chunks=chunks)
    elif rng == "sobol":
        y = sobol(size=(samples, dimensions), d0=seed, chunks=chunks)
        y = duck.norm_ppf(y)
    else:
        raise ValueError(f"Unknown rng: {rng}")

    p = (L @ y.T).T  # Gaussian Copula
    if df is None:
        return p

    # Pre-process df into a 1D numpy/dask array
    df = np.asarray(df)
    if (df <= 0).any():
        raise ValueError("df must always be greater than zero")
    if df.shape not in ((), (dimensions,)):
        raise ValueError(
            "df must be either a scalar or a 1D vector with as "
            "many points as the width of the correlation matrix"
        )
    if df.ndim == 1 and chunks is not None:
        df = da.from_array(df, chunks=(chunks[1],))

    # Define chunks for the S chi-square matrix
    chunks_r = (chunks[0], (1,)) if chunks else None

    if rng == "mersenne twister":
        # Use two separate random states for the normal and the chi2
        # distributions. This is NOT the same as just extracting two series
        # from the same RandomState, as we must guarantee that, if you extract
        # a different number of samples from the generator, the initial
        # samples must remain the same.
        # Don't just do seed + 1 as that would have unwanted repercussions
        # when one tries to extract different series from different seeds.
        seed_r = (seed + 190823761298456) % 2 ** 32
        rnd_state_r = duck.RandomState(seed_r)
        r = rnd_state_r.uniform(size=(samples, 1), chunks=chunks_r)
    elif rng == "sobol":
        seed_r = seed + dimensions
        r = sobol(size=(samples, 1), d0=seed_r, chunks=chunks_r)
    else:
        assert False  # pragma: nocover

    s = duck.chi2_ppf(r, df)
    z = duck.sqrt(df / s) * p
    # Convert t distribution to normal (0, 1)
    u = duck.t_cdf(z, df)
    t = duck.norm_ppf(u)
    return t