Example #1
def diag(v, k=0):
    if not isinstance(v, np.ndarray) and not isinstance(v, Array):
        raise TypeError(
            f"v must be a dask array or numpy array, got {type(v)}")

    name = "diag-" + tokenize(v, k)

    meta = meta_from_array(v, 2 if v.ndim == 1 else 1)

    if isinstance(v, np.ndarray) or (hasattr(v, "__array_function__")
                                     and not isinstance(v, Array)):
        if v.ndim == 1:
            m = abs(k)
            chunks = ((v.shape[0] + m, ), (v.shape[0] + m, ))
            dsk = {(name, 0, 0): (np.diag, v, k)}
        elif v.ndim == 2:
            kdiag_row_start = max(0, -k)
            kdiag_row_stop = min(v.shape[0], v.shape[1] - k)
            len_kdiag = kdiag_row_stop - kdiag_row_start
            chunks = ((0, ), ) if len_kdiag <= 0 else ((len_kdiag, ), )
            dsk = {(name, 0): (np.diag, v, k)}
        else:
            raise ValueError("Array must be 1d or 2d only")
        return Array(dsk, name, chunks, meta=meta)

    if v.ndim != 1:
        if v.ndim != 2:
            raise ValueError("Array must be 1d or 2d only")
        if k == 0 and v.chunks[0] == v.chunks[1]:
            dsk = {(name, i): (np.diag, row[i])
                   for i, row in enumerate(v.__dask_keys__())}
            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=[v])
            return Array(graph, name, (v.chunks[0], ), meta=meta)
        else:
            return diagonal(v, k)

    if k == 0:
        chunks_1d = v.chunks[0]
        blocks = v.__dask_keys__()
        dsk = {}
        for i, m in enumerate(chunks_1d):
            for j, n in enumerate(chunks_1d):
                key = (name, i, j)
                if i == j:
                    dsk[key] = (np.diag, blocks[i])
                else:
                    dsk[key] = (partial(np.zeros_like, shape=(m, n)), meta)

        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[v])
        return Array(graph, name, (chunks_1d, chunks_1d), meta=meta)

    elif k > 0:
        return pad(diag(v), [[0, k], [k, 0]], mode="constant")
    elif k < 0:
        return pad(diag(v), [[-k, 0], [0, -k]], mode="constant")
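
A minimal usage sketch of the function above, assuming it is reachable as dask.array.diag (as in the public dask API):

import numpy as np
import dask.array as da

v = da.from_array(np.arange(4), chunks=2)
d = da.diag(v)                 # 4x4 dask array with `v` on the main diagonal
print(d.chunks)                # ((2, 2), (2, 2))
print(da.diag(d).compute())    # extracting the diagonal again -> [0 1 2 3]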
Example #2
def wrap_func_like(func, *args, **kwargs):
    """
    Transform np creation function into blocked version
    """
    x = args[0]
    meta = meta_from_array(x)
    shape = kwargs.get("shape", x.shape)

    parsed = _parse_wrap_args(func, args, kwargs, shape)
    shape = parsed["shape"]
    dtype = parsed["dtype"]
    chunks = parsed["chunks"]
    name = parsed["name"]
    kwargs = parsed["kwargs"]

    keys = product([name], *[range(len(bd)) for bd in chunks])
    shapes = product(*chunks)
    shapes = list(shapes)
    kw = [kwargs.copy() for _ in shapes]  # copy per block so each gets its own shape
    for i, s in enumerate(list(shapes)):
        kw[i]["shape"] = s
    vals = ((partial(func, dtype=dtype, **k), ) + args
            for (k, s) in zip(kw, shapes))

    dsk = dict(zip(keys, vals))

    return Array(dsk, name, chunks, meta=meta.astype(dtype))
Example #3
def _dask_unique(x, return_index=True):
    from dask.array.core import Array
    from dask import sharedict
    from numpy import cumsum, concatenate, unique
    from numpy.testing import assert_

    assert_(return_index)

    name = "unique-" + x.name

    def _unique(x):

        return unique(x, return_index=return_index)

    dsk = dict(((name, i), (_unique, key)) for i, key in enumerate(x._keys()))
    parts = Array._get(sharedict.merge((name, dsk), x.dask), list(dsk.keys()))

    arrs = [a[0] for a in parts]

    chunks = x.chunks[0]
    offset = cumsum((0, ) + chunks)[:-1]

    idxs = [parts[i][1] + offset[i] for i in range(len(parts))]

    arr = concatenate(arrs)
    idx = concatenate(idxs)

    u, i = unique(arr, return_index=True)
    return u, idx[i]
Example #4
def overlap_internal(x, axes):
    """Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input specifies how many cells to share between neighboring blocks:
    {0: 2, 2: 5} means share two cells along axis 0 and five cells along axis 2.
    """
    token = tokenize(x, axes)
    name = "overlap-" + token

    graph = ArrayOverlapLayer(
        name=x.name,
        axes=axes,
        chunks=x.chunks,
        numblocks=x.numblocks,
        token=token,
    )
    graph = HighLevelGraph.from_collections(name, graph, dependencies=[x])
    chunks = _overlap_internal_chunks(x.chunks, axes)

    return Array(graph, name, chunks, meta=x)
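
A minimal usage sketch, assuming the function is importable as dask.array.overlap.overlap_internal (as in current dask releases):

import dask.array as da
from dask.array.overlap import overlap_internal

x = da.arange(16, chunks=4)        # four blocks of length 4
y = overlap_internal(x, {0: 1})    # each block also carries one boundary cell
                                   # from each neighbour along axis 0
print(y.chunks)                    # edge blocks grow by 1, interior blocks by 2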
Example #5
def read(filename, shape, chunks):
    from dask.highlevelgraph import HighLevelGraph
    from dask.array.core import normalize_chunks, Array
    from itertools import product
    from ...tunable import delayed
    from numpy import prod, dtype
    import xmltodict

    records = scan_file(filename)
    records = {r["lime_type"]: r for r in records}

    data_record = records["ildg-binary-data"]
    data_offset = data_record["pos"]

    info = xmltodict.parse(records["ildg-format"]["data"])["ildgFormat"]
    dtype = dtype("complex%d" % (int(info["precision"]) * 2))

    assert data_record["data_length"] == prod(shape) * dtype.itemsize

    normal_chunks = normalize_chunks(chunks, shape=shape)
    chunks_id = list(product(*[range(len(bd)) for bd in normal_chunks]))

    reads = [
        delayed(read_chunk)(filename, shape, dtype, data_offset, chunks,
                            chunk_id) for chunk_id in chunks_id
    ]

    keys = [(filename, *chunk_id) for chunk_id in chunks_id]
    vals = [read.key for read in reads]
    dsk = dict(zip(keys, vals))

    graph = HighLevelGraph.from_collections(filename, dsk, dependencies=reads)

    return Array(graph, filename, normal_chunks, dtype=dtype)
Example #6
def _compute_rechunk(x, chunks):
    """Compute the rechunk of *x* to the given *chunks*."""
    if x.size == 0:
        # Special case for empty array, as the algorithm below does not behave correctly
        return empty(x.shape, chunks=chunks, dtype=x.dtype)

    ndim = x.ndim
    crossed = intersect_chunks(x.chunks, chunks)
    x2 = dict()
    intermediates = dict()
    token = tokenize(x, chunks)
    merge_name = "rechunk-merge-" + token
    split_name = "rechunk-split-" + token
    split_name_suffixes = count()

    # Pre-allocate old block references, to allow re-use and reduce the
    # graph's memory footprint a bit.
    old_blocks = np.empty([len(c) for c in x.chunks], dtype="O")
    for index in np.ndindex(old_blocks.shape):
        old_blocks[index] = (x.name, ) + index

    # Iterate over all new blocks
    new_index = product(*(range(len(c)) for c in chunks))

    for new_idx, cross1 in zip(new_index, crossed):
        key = (merge_name, ) + new_idx
        old_block_indices = [[cr[i][0] for cr in cross1] for i in range(ndim)]
        subdims1 = [len(set(old_block_indices[i])) for i in range(ndim)]

        rec_cat_arg = np.empty(subdims1, dtype="O")
        rec_cat_arg_flat = rec_cat_arg.flat

        # Iterate over the old blocks required to build the new block
        for rec_cat_index, ind_slices in enumerate(cross1):
            old_block_index, slices = zip(*ind_slices)
            name = (split_name, next(split_name_suffixes))
            old_index = old_blocks[old_block_index][1:]
            if all(slc.start == 0 and slc.stop == x.chunks[i][ind]
                   for i, (slc, ind) in enumerate(zip(slices, old_index))):
                rec_cat_arg_flat[rec_cat_index] = old_blocks[old_block_index]
            else:
                intermediates[name] = (getitem, old_blocks[old_block_index],
                                       slices)
                rec_cat_arg_flat[rec_cat_index] = name

        assert rec_cat_index == rec_cat_arg.size - 1

        # New block is formed by concatenation of sliced old blocks
        if all(d == 1 for d in rec_cat_arg.shape):
            x2[key] = rec_cat_arg.flat[0]
        else:
            x2[key] = (concatenate3, rec_cat_arg.tolist())

    del old_blocks, new_index

    layer = toolz.merge(x2, intermediates)
    graph = HighLevelGraph.from_collections(merge_name,
                                            layer,
                                            dependencies=[x])
    return Array(graph, merge_name, chunks, meta=x)
Example #7
def modf(x):
    # Not actually object dtype, just need to specify something
    tmp = elemwise(np.modf, x, dtype=object)
    left = "modf1-" + tmp.name
    right = "modf2-" + tmp.name
    ldsk = {(left, ) + key[1:]: (getitem, key, 0)
            for key in core.flatten(tmp.__dask_keys__())}
    rdsk = {(right, ) + key[1:]: (getitem, key, 1)
            for key in core.flatten(tmp.__dask_keys__())}

    a = np.empty_like(getattr(x, "_meta", x),
                      shape=(1, ) * x.ndim,
                      dtype=x.dtype)
    l, r = np.modf(a)

    graph = HighLevelGraph.from_collections(left, ldsk, dependencies=[tmp])
    L = Array(graph, left, chunks=tmp.chunks, meta=l)
    graph = HighLevelGraph.from_collections(right, rdsk, dependencies=[tmp])
    R = Array(graph, right, chunks=tmp.chunks, meta=r)
    return L, R
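
A minimal usage sketch of the two-output pattern above, via the public dask.array.modf:

import dask.array as da

x = da.from_array([1.5, -0.5, 2.25], chunks=2)
frac, whole = da.modf(x)          # fractional and integral parts, like numpy.modf
print(frac.compute())             # [ 0.5  -0.5   0.25]
print(whole.compute())            # [ 1.  -0.   2. ]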
Example #8
def imread(filename, imread=None, preprocess=None):
    """Read a stack of images into a dask array

    Parameters
    ----------

    filename: string
        A globstring like 'myfile.*.png'
    imread: function (optional)
        Optionally provide custom imread function.
        Function should expect a filename and produce a numpy array.
        Defaults to ``skimage.io.imread``.
    preprocess: function (optional)
        Optionally provide custom function to preprocess the image.
        Function should expect a numpy array for a single image.

    Examples
    --------

    >>> from dask.array.image import imread
    >>> im = imread('2015-*-*.png')  # doctest: +SKIP
    >>> im.shape  # doctest: +SKIP
    (365, 1000, 1000, 3)

    Returns
    -------

    Dask array of all images stacked along the first dimension.
    Each separate image file will be treated as an individual chunk.
    """
    imread = imread or sk_imread
    filenames = sorted(glob(filename))
    if not filenames:
        raise ValueError("No files found under name %s" % filename)

    name = "imread-%s" % tokenize(filenames, map(os.path.getmtime, filenames))

    sample = imread(filenames[0])
    if preprocess:
        sample = preprocess(sample)

    keys = [(name, i) + (0, ) * len(sample.shape)
            for i in range(len(filenames))]
    if preprocess:
        values = [(add_leading_dimension, (preprocess, (imread, fn)))
                  for fn in filenames]
    else:
        values = [(add_leading_dimension, (imread, fn)) for fn in filenames]
    dsk = dict(zip(keys, values))

    chunks = ((1, ) * len(filenames), ) + tuple((d, ) for d in sample.shape)

    return Array(dsk, name, chunks, sample.dtype)
Example #9
def linspace(start, stop, num=50, chunks=None, dtype=None, endpoint=True):
    """
    Return `num` evenly spaced values over the closed interval [`start`,
    `stop`].

    TODO: implement the `endpoint`, `retstep`, and `dtype` keyword args

    Parameters
    ----------
    start : scalar
        The starting value of the sequence.
    stop : scalar
        The last value of the sequence.
    num : int, optional
        Number of samples to include in the returned dask array, including the
        endpoints.
    chunks :  int
        The number of samples on each block. Note that the last block will have
        fewer samples if `num % blocksize != 0`

    Returns
    -------
    samples : dask array

    """
    num = int(num)
    if not endpoint:
        num = num + 1
    if chunks is None:
        raise ValueError("Must supply a chunks= keyword argument")

    chunks = normalize_chunks(chunks, (num, ))

    range_ = stop - start

    space = float(range_) / (num - 1)

    name = 'linspace-' + tokenize((start, stop, num, chunks, dtype, endpoint))

    dsk = {}
    blockstart = start

    for i, bs in enumerate(chunks[0]):
        blockstop = blockstart + ((bs - 1) * space)
        task = (partial(np.linspace, dtype=dtype), blockstart, blockstop, bs)
        blockstart = blockstart + (space * bs)
        dsk[(name, i)] = task

    return Array(dsk, name, chunks, dtype=dtype)
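
A minimal usage sketch, shown with the public dask.array.linspace (this older variant requires an explicit chunks= keyword):

import dask.array as da

x = da.linspace(0, 1, num=101, chunks=25)
print(x.chunks)           # ((25, 25, 25, 25, 1),)
print(x[-1].compute())    # 1.0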
Example #10
def wrap_func_shape_as_first_arg(func, *args, **kwargs):
    """
    Transform np creation function into blocked version
    """
    if "shape" not in kwargs:
        shape, args = args[0], args[1:]
    else:
        shape = kwargs.pop("shape")

    if isinstance(shape, Array):
        raise TypeError("Dask array input not supported. "
                        "Please use tuple, list, or a 1D numpy array instead.")

    parsed = _parse_wrap_args(func, args, kwargs, shape)
    shape = parsed["shape"]
    dtype = parsed["dtype"]
    chunks = parsed["chunks"]
    name = parsed["name"]
    kwargs = parsed["kwargs"]
    func = partial(func, dtype=dtype, **kwargs)

    out_ind = dep_ind = tuple(range(len(shape)))
    graph = core_blockwise(
        func,
        name,
        out_ind,
        ArrayChunkShapeDep(chunks),
        dep_ind,
        numblocks={},
    )

    return Array(graph,
                 name,
                 chunks,
                 dtype=dtype,
                 meta=kwargs.get("meta", None))
Example #11
def reblock(x, blockdims=None, blockshape=None):
    """
    Convert blocks in dask array x to new blockdims.

    reblock(x, blockdims=None, blockshape=None)

    >>> import dask.array as da
    >>> a = np.random.uniform(0, 1, 7**4).reshape((7,) * 4)
    >>> x = da.from_array(a, blockdims=((2, 3, 2),)*4)
    >>> x.blockdims
    ((2, 3, 2), (2, 3, 2), (2, 3, 2), (2, 3, 2))

    >>> y = reblock(x, blockdims=((2, 4, 1), (4, 2, 1), (4, 3), (7,)))
    >>> y.blockdims
    ((2, 4, 1), (4, 2, 1), (4, 3), (7,))

    blockdims/blockshape also accept dict arguments mapping axis to blockshape

    >>> y = reblock(x, blockshape={1: 2})  # reblock axis 1 with blockshape 2

    Parameters
    ----------

    x:   dask array
    blockdims:  the new block dimensions to create
    blockshape: the new blockshape to create

    Provide one of blockdims or blockshape.
    """
    if isinstance(blockdims, dict):
        blockdims = blockdims_dict_to_tuple(x.blockdims, blockdims)
    elif isinstance(blockshape, dict):
        blockdims = blockshape_dict_to_tuple(x.blockdims, blockshape)
    elif not blockdims:
        blockdims = blockdims_from_blockshape(x.shape, blockshape)

    crossed = intersect_blockdims(x.blockdims, blockdims)
    x2 = dict()
    temp_name = next(reblock_names)
    new_index = tuple(product(*(tuple(range(len(n))) for n in blockdims)))
    for flat_idx, cross1 in enumerate(crossed):
        new_idx = new_index[flat_idx]
        key = (temp_name, ) + new_idx
        cr2 = iter(cross1)
        old_blocks = tuple(tuple(ind for ind, _ in cr) for cr in cross1)
        subdims = tuple(
            len(set(ss[i] for ss in old_blocks)) for i in range(x.ndim))
        rec_cat_arg = np.empty(subdims).tolist()
        inds_in_block = product(*(range(s) for s in subdims))
        for old_block in old_blocks:
            ind_slics = next(cr2)
            old_inds = tuple(
                tuple(s[0] for s in ind_slics) for i in range(x.ndim))
            # list of nd slices
            slic = tuple(tuple(s[1] for s in ind_slics) for i in range(x.ndim))
            ind_in_blk = next(inds_in_block)
            temp = rec_cat_arg
            for i in range(x.ndim - 1):
                temp = getitem(temp, ind_in_blk[i])
            for ind, slc in zip(old_inds, slic):
                temp[ind_in_blk[-1]] = (getitem, (x.name, ) + ind, slc)
        x2[key] = (rec_concatenate, rec_cat_arg)
    x2 = merge(x.dask, x2)
    return Array(x2, temp_name, blockdims=blockdims, dtype=x.dtype)
Example #12
    def _wrap(self,
              funcname,
              *args,
              size=None,
              chunks="auto",
              extra_chunks=(),
              **kwargs):
        """Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        shapes = list({
            ar.shape
            for ar in chain(args, kwargs.values())
            if isinstance(ar, (Array, np.ndarray))
        })
        if size is not None:
            shapes.append(size)
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(
            chunks,
            size,  # ideally would use dtype here
            dtype=kwargs.get("dtype", np.float64),
        )
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        lookup = {}
        small_args = []
        dependencies = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        sizes = list(product(*chunks))
        seeds = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(seeds, size, chunks, args, kwargs)
        name = f"{funcname}-{token}"

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])

        vals = []
        for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, self._RandomState, funcname, seed,
                         size, arg, kwrg))

        meta = _apply_random(
            self._RandomState,
            funcname,
            seed,
            (0, ) * len(size),
            small_args,
            small_kwargs,
        )

        dsk.update(dict(zip(keys, vals)))

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=dependencies)
        return Array(graph, name, chunks + extra_chunks, meta=meta)
Example #13
        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
            dependencies = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a)
                a = a.rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dependencies.append(a)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dependencies.append(p)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size, dtype=np.float64)
            if not replace and len(chunks[0]) > 1:
                err_msg = ("replace=False is not currently supported for "
                           "dask.array.choice with multi-chunk output "
                           "arrays")
                raise NotImplementedError(err_msg)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = "da.random.choice-%s" % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=dependencies)
            return Array(graph, name, chunks, dtype=dtype)
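
A minimal usage sketch of this method through the public dask.array.random interface:

import dask.array as da

rs = da.random.RandomState(42)
sample = rs.choice(5, size=8, chunks=4, p=[0.1, 0.2, 0.3, 0.2, 0.2])
print(sample.chunks)      # ((4, 4),)
print(sample.compute())   # eight draws from {0, ..., 4}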
Example #14
def arange(*args, **kwargs):
    """
    Return evenly spaced values from `start` to `stop` with step size `step`.

    The values are half-open [start, stop), so including start and excluding
    stop. This is basically the same as python's range function but for dask
    arrays.

    Parameters
    ----------
    start : int, optional
        The starting value of the sequence. The default is 0.
    stop : int
        The end of the interval, this value is excluded from the interval.
    step : int, optional
        The spacing between the values. The default is 1 when not specified.
    chunks :  int
        The number of samples on each block. Note that the last block will have
        fewer samples if ``len(array) % chunks != 0``.

    Returns
    -------
    samples : dask array

    """
    args = tuple(common.to_nptype(x) for x in args)

    if len(args) == 1:
        start = 0
        stop = args[0]
        step = np.int_(1)
    elif len(args) == 2:
        start = args[0]
        stop = args[1]
        step = np.int_(1)
    elif len(args) == 3:
        start, stop, step = args
    else:
        raise TypeError('''
        arange takes 3 positional arguments: arange([start], stop, [step])
        ''')
    if step is None:
        step = np.int_(1)
    if start is None:
        start = np.int_(0)

    if 'chunks' not in kwargs:
        raise ValueError("Must supply a chunks= keyword argument")
    chunks = kwargs['chunks']

    dtype = kwargs.get('dtype', None)
    if dtype is None:
        dtype = np.arange(0, 1, step).dtype

    range_ = stop - start
    if hasattr(step, 'dtype') and step.dtype.kind == 'm':
        num = int(abs(range_ / step))
    else:
        num = int(np.round(abs(range_ / step)))

    chunks = normalize_chunks(chunks, (num, ))

    name = 'arange-' + tokenize((start, stop, step, chunks, num))
    dsk = {}
    elem_count = 0

    # Correct arange for non-integer steps.
    def fix_arange(start, stop, step, dtype):
        x0 = int(np.round(start / step))
        x1 = int(np.round(stop / step))
        return (np.arange(x0, x1) * step).astype(dtype)

    for i, bs in enumerate(chunks[0]):
        blockstart = start + (elem_count * step)
        blockstop = start + ((elem_count + bs) * step)
        task = (fix_arange, blockstart, blockstop, step, dtype)
        dsk[(name, i)] = task
        elem_count += bs

    return Array(dsk, name, chunks, dtype=dtype)
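
A minimal usage sketch, shown with the public dask.array.arange (this variant additionally requires an explicit chunks= keyword):

import dask.array as da

x = da.arange(0, 10, 2, chunks=3)
print(x.chunks)       # ((3, 2),)
print(x.compute())    # [0 2 4 6 8]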
Example #15
def reshape(x, shape, merge_chunks=True, limit=None):
    """Reshape array to new shape

    Parameters
    ----------
    shape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    merge_chunks : bool, default True
        Whether to merge chunks using the logic in :meth:`dask.array.rechunk`
        when communication is necessary given the input array chunking and
        the output shape. With ``merge_chunks==False``, the input array will
        be rechunked to a chunksize of 1, which can create very many tasks.
    limit: int (optional)
        The maximum block size to target in bytes. If no limit is provided,
        it defaults to using the ``array.chunk-size`` Dask config value.

    Notes
    -----
    This is a parallelized version of the ``np.reshape`` function with the
    following limitations:

    1.  It assumes that the array is stored in `row-major order`_
    2.  It only allows for reshapings that collapse or merge dimensions like
        ``(1, 2, 3, 4) -> (1, 6, 4)`` or ``(64,) -> (4, 4, 4)``

    .. _`row-major order`: https://en.wikipedia.org/wiki/Row-_and_column-major_order

    When communication is necessary this algorithm depends on the logic within
    rechunk.  It endeavors to keep chunk sizes roughly the same when possible.

    See :ref:`array-chunks.reshaping` for a discussion of the tradeoffs of
    ``merge_chunks``.

    See Also
    --------
    dask.array.rechunk
    numpy.reshape
    """
    # Sanitize inputs, look for -1 in shape
    from dask.array.core import PerformanceWarning
    from dask.array.slicing import sanitize_index

    shape = tuple(map(sanitize_index, shape))
    known_sizes = [s for s in shape if s != -1]
    if len(known_sizes) < len(shape):
        if len(shape) - len(known_sizes) > 1:
            raise ValueError("can only specify one unknown dimension")
        # Fastpath for x.reshape(-1) on 1D arrays, allows unknown shape in x
        # for this case only.
        if len(shape) == 1 and x.ndim == 1:
            return x
        missing_size = sanitize_index(x.size / reduce(mul, known_sizes, 1))
        shape = tuple(missing_size if s == -1 else s for s in shape)

    if np.isnan(sum(x.shape)):
        raise ValueError("Array chunk size or shape is unknown. shape: %s\n\n"
                         "Possible solution with x.compute_chunk_sizes()" %
                         str(x.shape))

    if reduce(mul, shape, 1) != x.size:
        raise ValueError("total size of new array must be unchanged")

    if x.shape == shape:
        return x

    meta = meta_from_array(x, len(shape))

    name = "reshape-" + tokenize(x, shape)

    if x.npartitions == 1:
        key = next(flatten(x.__dask_keys__()))
        dsk = {(name, ) + (0, ) * len(shape): (M.reshape, key, shape)}
        chunks = tuple((d, ) for d in shape)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
        return Array(graph, name, chunks, meta=meta)

    # Logic for how to rechunk
    din = len(x.shape)
    dout = len(shape)
    if not merge_chunks and din > dout:
        x = x.rechunk({i: 1 for i in range(din - dout)})

    inchunks, outchunks = reshape_rechunk(x.shape, shape, x.chunks)
    # Check output chunks are not too large
    max_chunksize_in_bytes = reduce(
        mul, [max(i) for i in outchunks]) * x.dtype.itemsize

    if limit is None:
        limit = parse_bytes(config.get("array.chunk-size"))
        split = config.get("array.slicing.split-large-chunks", None)
    else:
        limit = parse_bytes(limit)
        split = True

    if max_chunksize_in_bytes > limit:
        if split is None:
            msg = (
                "Reshaping is producing a large chunk. To accept the large\n"
                "chunk and silence this warning, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n"
                "    ...     array.reshape(shape)\n\n"
                "To avoid creating the large chunks, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n"
                "    ...     array.reshape(shape)\n\n"
                "Explicitly passing ``limit`` to ``reshape`` will also silence this warning\n"
                "    >>> array.reshape(shape, limit='128 MiB')")
            warnings.warn(msg, PerformanceWarning, stacklevel=6)
        elif split:
            # Leave chunk sizes unaltered where possible
            matching_chunks = Counter(inchunks) & Counter(outchunks)
            chunk_plan = []
            for out in outchunks:
                if matching_chunks[out] > 0:
                    chunk_plan.append(out)
                    matching_chunks[out] -= 1
                else:
                    chunk_plan.append("auto")
            outchunks = normalize_chunks(
                chunk_plan,
                shape=shape,
                limit=limit,
                dtype=x.dtype,
                previous_chunks=inchunks,
            )

    x2 = x.rechunk(inchunks)

    # Construct graph
    in_keys = list(product([x2.name], *[range(len(c)) for c in inchunks]))
    out_keys = list(product([name], *[range(len(c)) for c in outchunks]))
    shapes = list(product(*outchunks))
    dsk = {
        a: (M.reshape, b, shape)
        for a, b, shape in zip(out_keys, in_keys, shapes)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x2])
    return Array(graph, name, outchunks, meta=meta)
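
A minimal usage sketch of the collapse/split reshapes described in the docstring above:

import dask.array as da

x = da.ones((4, 4, 4), chunks=(2, 2, 4))
y = x.reshape((4, -1))        # collapse the last two axes; -1 is inferred as 16
print(y.shape)                # (4, 16)

z = da.ones(64, chunks=16).reshape((4, 4, 4))   # split a 1-d array into 3-d
print(z.shape)                # (4, 4, 4)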
Example #16
def percentile(a, q, method="linear", internal_method="default", **kwargs):
    """Approximate percentile of 1-D array

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points ``i < j``. Only valid for ``method='dask'``.

        - 'linear': ``i + (j - i) * fraction``, where ``fraction``
          is the fractional part of the index surrounded by ``i``
          and ``j``.
        - 'lower': ``i``.
        - 'higher': ``j``.
        - 'nearest': ``i`` or ``j``, whichever is nearest.
        - 'midpoint': ``(i + j) / 2``.

        .. versionchanged:: 2022.1.0
            This argument was previously called "interpolation"

    internal_method : {'default', 'dask', 'tdigest'}, optional
        What internal method to use. By default, dask's internal custom
        algorithm (``'dask'``) is used. If set to ``'tdigest'``, tdigest is used
        for floats and ints, falling back to ``'dask'`` otherwise.

        .. versionchanged:: 2022.1.0
            This argument was previously called “method”.

    interpolation : str, optional
        Deprecated name for the method keyword argument.

        .. deprecated:: 2022.1.0

    See Also
    --------
    numpy.percentile : Numpy's equivalent Percentile function
    """
    from dask.array.dispatch import percentile_lookup as _percentile
    from dask.array.utils import array_safe, meta_from_array

    allowed_internal_methods = ["default", "dask", "tdigest"]

    if method in allowed_internal_methods:
        warnings.warn(
            "In Dask 2022.1.0, the `method=` argument was renamed to `internal_method=`",
            FutureWarning,
        )
        internal_method = method

    if "interpolation" in kwargs:
        if _numpy_122:
            warnings.warn(
                "In Dask 2022.1.0, the `interpolation=` argument to percentile was renamed to "
                "`method= ` ",
                FutureWarning,
            )
        method = kwargs.pop("interpolation")

    if kwargs:
        raise TypeError(
            f"percentile() got an unexpected keyword argument {kwargs.keys()}")

    if not a.ndim == 1:
        raise NotImplementedError(
            "Percentiles only implemented for 1-d arrays")
    if isinstance(q, Number):
        q = [q]
    q = array_safe(q, like=meta_from_array(a))
    token = tokenize(a, q, method)

    dtype = a.dtype
    if np.issubdtype(dtype, np.integer):
        dtype = (array_safe([], dtype=dtype, like=meta_from_array(a)) /
                 0.5).dtype
    meta = meta_from_array(a, dtype=dtype)

    if internal_method not in allowed_internal_methods:
        raise ValueError(
            f"`internal_method=` must be one of {allowed_internal_methods}")

    # Allow using t-digest if method is allowed and dtype is of floating or integer type
    if (internal_method == "tdigest" and method == "linear"
            and (np.issubdtype(dtype, np.floating)
                 or np.issubdtype(dtype, np.integer))):

        from dask.utils import import_required

        import_required(
            "crick",
            "crick is a required dependency for using the t-digest method.")

        name = "percentile_tdigest_chunk-" + token
        dsk = {(name, i): (_tdigest_chunk, key)
               for i, key in enumerate(a.__dask_keys__())}

        name2 = "percentile_tdigest-" + token

        dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

    # Otherwise use the custom percentile algorithm
    else:
        # Add 0 and 100 during calculation for more robust behavior (hopefully)
        calc_q = np.pad(q, 1, mode="constant")
        calc_q[-1] = 100
        name = "percentile_chunk-" + token
        dsk = {(name, i): (_percentile, key, calc_q, method)
               for i, key in enumerate(a.__dask_keys__())}

        name2 = "percentile-" + token
        dsk2 = {
            (name2, 0): (
                merge_percentiles,
                q,
                [calc_q] * len(a.chunks[0]),
                sorted(dsk),
                method,
            )
        }

    dsk = merge(dsk, dsk2)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
    return Array(graph, name2, chunks=((len(q), ), ), meta=meta)
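
A minimal usage sketch via the public dask.array.percentile (the result is approximate when the input has more than one chunk, since per-chunk percentiles are merged):

import dask.array as da

x = da.random.uniform(size=10_000, chunks=2_500)
p = da.percentile(x, [25, 50, 75])
print(p.compute())    # roughly [0.25, 0.5, 0.75] for uniform data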
Example #17
def apply_gufunc(
    func,
    signature,
    *args,
    axes=None,
    axis=None,
    keepdims=False,
    output_dtypes=None,
    output_sizes=None,
    vectorize=None,
    allow_rechunk=False,
    meta=None,
    **kwargs,
):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking; otherwise chunk sizes need to match and core
        dimensions may only consist of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    meta: Optional, tuple, keyword only
        tuple of empty ndarrays describing the shape and dtype of the output of the gufunc.
        Defaults to ``None``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api/generalized-ufuncs.html
    """
    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError("`signature` has to be of type string")
    # NumPy versions before https://github.com/numpy/numpy/pull/19627
    # would not ignore whitespace characters in `signature` like they
    # are supposed to. We remove the whitespace here as a workaround.
    signature = re.sub(r"\s+", "", signature)
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss,
                                  list) else len(output_coredimss)

    ## Consolidate onto `meta`
    if meta is not None and output_dtypes is not None:
        raise ValueError(
            "Only one of `meta` and `output_dtypes` should be given (`meta` is preferred)."
        )
    if meta is None:
        if output_dtypes is None:
            ## Infer `output_dtypes`
            if vectorize:
                tempfunc = np.vectorize(func, signature=signature)
            else:
                tempfunc = func
            output_dtypes = apply_infer_dtype(tempfunc, args, kwargs,
                                              "apply_gufunc", "output_dtypes",
                                              nout)

        ## Turn `output_dtypes` into `meta`
        if (nout is None and isinstance(output_dtypes, (tuple, list))
                and len(output_dtypes) == 1):
            output_dtypes = output_dtypes[0]
        sample = args[0] if args else None
        if nout is None:
            meta = meta_from_array(sample, dtype=output_dtypes)
        else:
            meta = tuple(
                meta_from_array(sample, dtype=odt) for odt in output_dtypes)

    ## Normalize `meta` format
    meta = meta_from_array(meta)
    if isinstance(meta, list):
        meta = tuple(meta)

    ## Validate `meta`
    if nout is None:
        if isinstance(meta, tuple):
            if len(meta) == 1:
                meta = meta[0]
            else:
                raise ValueError(
                    "For a function with one output, must give a single item for `output_dtypes`/`meta`, "
                    "not a tuple or list.")
    else:
        if not isinstance(meta, tuple):
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list for `output_dtypes`/`meta`, "
                "not a single item.")
        if len(meta) != nout:
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list of {nout} items for "
                f"`output_dtypes`/`meta`, not {len(meta)}.")

    ## Vectorize function, if required
    if vectorize:
        otypes = [x.dtype
                  for x in meta] if isinstance(meta, tuple) else [meta.dtype]
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims,
                                                       input_coredimss,
                                                       output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i
                     for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d
              for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss,
                           key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes,
                                       input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes) | {1} != {1, max(sizes)}:
            raise ValueError(
                f"Dimension `'{dim}'` with different lengths in arrays")
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    f"Dimension `'{dim}'` with different chunksize present")

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(func,
                    loop_output_dims,
                    *arginds,
                    concatenate=True,
                    meta=meta,
                    **kwargs)

    # NOTE: we likely could just use `meta` instead of `tmp._meta`,
    # but we use it and validate it anyway just to be sure nothing odd has happened.
    metas = tmp._meta
    if nout is None:
        assert not isinstance(
            metas, (list, tuple)
        ), f"meta changed from single output to multiple output during blockwise: {meta} -> {metas}"
        metas = (metas, )
    else:
        assert isinstance(
            metas, (list, tuple)
        ), f"meta changed from multiple output to single output during blockwise: {meta} -> {metas}"
        assert (
            len(metas) == nout
        ), f"Number of outputs changed from {nout} to {len(metas)} during blockwise"

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split("-")

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]

    ## Split output
    leaf_arrs = []
    for i, (ocd, oax,
            meta) in enumerate(zip(output_coredimss, output_axes, metas)):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name,
                                                leaf_dsk,
                                                dependencies=[tmp])
        meta = meta_from_array(meta, len(output_shape))
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         meta=meta)

        ### Axes:
        if keepdims:
            slices = len(
                leaf_arr.shape) * (slice(None), ) + len(oax) * (np.newaxis, )
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        for ii, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = ii
        j = 0
        for ii in range(len(tidcs)):
            if tidcs[ii] is None:
                tidcs[ii] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return (*leaf_arrs, ) if nout else leaf_arrs[0]  # Undo *) from above
Example #18
def linspace(start,
             stop,
             num=50,
             endpoint=True,
             retstep=False,
             chunks="auto",
             dtype=None):
    """
    Return `num` evenly spaced values over the closed interval [`start`,
    `stop`].

    Parameters
    ----------
    start : scalar
        The starting value of the sequence.
    stop : scalar
        The last value of the sequence.
    num : int, optional
        Number of samples to include in the returned dask array, including the
        endpoints. Default is 50.
    endpoint : bool, optional
        If True, ``stop`` is the last sample. Otherwise, it is not included.
        Default is True.
    retstep : bool, optional
        If True, return (samples, step), where step is the spacing between
        samples. Default is False.
    chunks :  int
        The number of samples on each block. Note that the last block will have
        fewer samples if `num % blocksize != 0`
    dtype : dtype, optional
        The type of the output array.

    Returns
    -------
    samples : dask array
    step : float, optional
        Only returned if ``retstep`` is True. Size of spacing between samples.


    See Also
    --------
    dask.array.arange
    """
    num = int(num)

    if dtype is None:
        dtype = np.linspace(0, 1, 1).dtype

    chunks = normalize_chunks(chunks, (num, ), dtype=dtype)

    range_ = stop - start

    div = (num - 1) if endpoint else num
    if div == 0:
        div = 1

    step = float(range_) / div

    name = "linspace-" + tokenize((start, stop, num, endpoint, chunks, dtype))

    dsk = {}
    blockstart = start

    for i, bs in enumerate(chunks[0]):
        bs_space = bs - 1 if endpoint else bs
        blockstop = blockstart + (bs_space * step)
        task = (
            partial(chunk.linspace, endpoint=endpoint, dtype=dtype),
            blockstart,
            blockstop,
            bs,
        )
        blockstart = blockstart + (step * bs)
        dsk[(name, i)] = task

    if retstep:
        return Array(dsk, name, chunks, dtype=dtype), step
    else:
        return Array(dsk, name, chunks, dtype=dtype)
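
A minimal usage sketch showing the endpoint= and retstep= keywords:

import dask.array as da

x, step = da.linspace(0, 1, num=5, retstep=True, chunks=2)
print(step)           # 0.25
print(x.compute())    # [0.   0.25 0.5  0.75 1.  ]

y = da.linspace(0, 1, num=5, endpoint=False, chunks=2)
print(y.compute())    # [0.  0.2 0.4 0.6 0.8]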
Example #19
        def choice(self, a, size=None, replace=True, p=None, chunks=None):
            dsks = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a).rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dsks.append(a.dask)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dsks.append(p.dask)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = 'da.random.choice-%s' % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            return Array(sharedict.merge((name, dsk), *dsks),
                         name,
                         chunks,
                         dtype=dtype)
Example #20
def eye(N, chunks="auto", M=None, k=0, dtype=float):
    """
    Return a 2-D Array with ones on the diagonal and zeros elsewhere.

    Parameters
    ----------
    N : int
        Number of rows in the output.
    chunks : int, str
        How to chunk the array. Must be one of the following forms:

        -   A blocksize like 1000.
        -   A size in bytes, like "100 MiB", which will choose a uniform
            block-like shape.
        -   The word "auto", which acts like the above, but uses a
            configuration value ``array.chunk-size`` for the chunk size.
    M : int, optional
        Number of columns in the output. If None, defaults to `N`.
    k : int, optional
        Index of the diagonal: 0 (the default) refers to the main diagonal,
        a positive value refers to an upper diagonal, and a negative value
        to a lower diagonal.
    dtype : data-type, optional
        Data-type of the returned array.

    Returns
    -------
    I : Array of shape (N, M)
        An array where all elements are equal to zero, except for the `k`-th
        diagonal, whose values are equal to one.
    """
    eye = {}
    if M is None:
        M = N
    if dtype is None:
        dtype = float

    if not isinstance(chunks, (int, str)):
        raise ValueError("chunks must be an int or string")

    vchunks, hchunks = normalize_chunks(chunks, shape=(N, M), dtype=dtype)
    chunks = vchunks[0]

    token = tokenize(N, chunks, M, k, dtype)
    name_eye = "eye-" + token

    for i, vchunk in enumerate(vchunks):
        for j, hchunk in enumerate(hchunks):
            if (j - i - 1) * chunks <= k <= (j - i + 1) * chunks:
                eye[name_eye, i, j] = (
                    np.eye,
                    vchunk,
                    hchunk,
                    k - (j - i) * chunks,
                    dtype,
                )
            else:
                eye[name_eye, i, j] = (np.zeros, (vchunk, hchunk), dtype)
    return Array(eye,
                 name_eye,
                 shape=(N, M),
                 chunks=(chunks, chunks),
                 dtype=dtype)
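
Only blocks crossed by the requested diagonal call np.eye; every other block is filled with zeros. A minimal check, assuming the function above is dask.array.eye:

import numpy as np
import dask.array as da

# 5x5 array with ones on the first upper diagonal, built from roughly 2x2
# blocks; the result matches NumPy exactly.
x = da.eye(5, chunks=2, k=1)
np.testing.assert_array_equal(x.compute(), np.eye(5, k=1))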
Exemple #21
def arange(*args, chunks="auto", like=None, dtype=None, **kwargs):
    """
    Return evenly spaced values from `start` to `stop` with step size `step`.

    The interval is half-open, ``[start, stop)``: start is included and stop
    is excluded. This is essentially the same as Python's built-in ``range``,
    but for dask arrays.

    When using a non-integer step, such as 0.1, the results are often not
    consistent because of floating-point rounding. It is better to use
    ``linspace`` for these cases.

    Parameters
    ----------
    start : int, optional
        The starting value of the sequence. The default is 0.
    stop : int
        The end of the interval; this value is excluded from the interval.
    step : int, optional
        The spacing between the values. The default is 1 when not specified.
    chunks : int or str, optional
        The number of samples in each block. Note that the last block will
        have fewer samples if ``len(array) % chunks != 0``.
        Defaults to "auto", which determines the chunk sizes automatically.
    dtype : numpy.dtype, optional
        Output dtype. Omit to infer it from start, stop and step.
        Defaults to ``None``.
    like : array type or ``None``
        Array to extract meta from. Defaults to ``None``.

    Returns
    -------
    samples : dask array

    See Also
    --------
    dask.array.linspace
    """
    if len(args) == 1:
        start = 0
        stop = args[0]
        step = 1
    elif len(args) == 2:
        start = args[0]
        stop = args[1]
        step = 1
    elif len(args) == 3:
        start, stop, step = args
    else:
        raise TypeError(
            "arange takes at most 3 positional arguments: "
            "arange([start], stop, [step])")

    num = int(max(np.ceil((stop - start) / step), 0))

    meta = meta_from_array(like) if like is not None else None

    if dtype is None:
        dtype = np.arange(start, stop, step * num if num else step).dtype

    chunks = normalize_chunks(chunks, (num, ), dtype=dtype)

    if kwargs:
        raise TypeError("Unexpected keyword argument(s): %s" %
                        ",".join(kwargs.keys()))

    name = "arange-" + tokenize((start, stop, step, chunks, dtype))
    dsk = {}
    elem_count = 0

    for i, bs in enumerate(chunks[0]):
        blockstart = start + (elem_count * step)
        blockstop = start + ((elem_count + bs) * step)
        task = (
            partial(chunk.arange, like=like),
            blockstart,
            blockstop,
            step,
            bs,
            dtype,
        )
        dsk[(name, i)] = task
        elem_count += bs

    return Array(dsk, name, chunks, dtype=dtype, meta=meta)
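
Each block covers a contiguous slice of the range, with blockstart and blockstop advanced by the chunk size on every iteration. A quick sketch, assuming the function above is dask.array.arange:

import numpy as np
import dask.array as da

# Ten values split into blocks of four; the last block holds the remainder.
x = da.arange(10, chunks=4)
assert x.chunks == ((4, 4, 2),)
np.testing.assert_array_equal(x.compute(), np.arange(10))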
Exemple #22
    def _wrap(self, func, *args, **kwargs):
        """ Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        size = kwargs.pop('size', None)
        chunks = kwargs.pop('chunks')
        extra_chunks = kwargs.pop('extra_chunks', ())

        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        args_shapes = {
            ar.shape
            for ar in args if isinstance(ar, (Array, np.ndarray))
        }
        # set.union returns a new set, so update args_shapes in place instead
        # of discarding the result.
        args_shapes |= {
            ar.shape
            for ar in kwargs.values() if isinstance(ar, (Array, np.ndarray))
        }

        shapes = list(args_shapes)
        if size is not None:
            shapes += [size]
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(chunks, size)
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        dsks = []
        lookup = {}
        small_args = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dsks.append(res.dask)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = 'array-{}'.format(tokenize(res))
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        # Get dtype
        small_kwargs['size'] = (0, )
        dtype = func(xoroshiro128plus.RandomState(), *small_args,
                     **small_kwargs).dtype

        sizes = list(product(*chunks))
        state_data = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(state_data, size, chunks, args, kwargs)
        name = 'da.random.{0}-{1}'.format(func.__name__, token)

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])
        vals = []
        for state, size, slc, block in zip(state_data, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, func.__name__, state, size, arg, kwrg))
        dsk.update(dict(zip(keys, vals)))
        dsk = sharedict.merge((name, dsk), *dsks)
        return Array(dsk, name, chunks + extra_chunks, dtype=dtype)
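
From the caller's perspective, _wrap means array-valued parameters are broadcast against `size`, rechunked to match the output blocks, and handed block-wise to the underlying numpy function. A sketch of the resulting behaviour using the standard dask.array.random interface, which handles array arguments in a similar way:

import dask.array as da

# Per-element means, broadcast to the output shape and rechunked so each
# output block lines up with the corresponding slice of `mu`.
mu = da.arange(8, chunks=4)
x = da.random.normal(mu, 1.0, size=(3, 8), chunks=(3, 4))
assert x.chunks == ((3,), (4, 4))
print(x.compute().shape)  # (3, 8)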
Exemple #23
def diagonal(a, offset=0, axis1=0, axis2=1):
    name = "diagonal-" + tokenize(a, offset, axis1, axis2)

    if a.ndim < 2:
        # NumPy uses `diag` as we do here.
        raise ValueError("diag requires an array of at least two dimensions")

    def _axis_fmt(axis, name, ndim):
        if axis < 0:
            t = ndim + axis
            if t < 0:
                msg = "{}: axis {} is out of bounds for array of dimension {}"
                raise np.AxisError(msg.format(name, axis, ndim))
            axis = t
        return axis

    def pop_axes(chunks, axis1, axis2):
        chunks = list(chunks)
        chunks.pop(axis2)
        chunks.pop(axis1)
        return tuple(chunks)

    axis1 = _axis_fmt(axis1, "axis1", a.ndim)
    axis2 = _axis_fmt(axis2, "axis2", a.ndim)

    if axis1 == axis2:
        raise ValueError("axis1 and axis2 cannot be the same")

    a = asarray(a)
    k = offset
    if axis1 > axis2:
        axis1, axis2 = axis2, axis1
        k = -offset

    free_axes = set(range(a.ndim)) - {axis1, axis2}
    free_indices = list(product(*(range(a.numblocks[i]) for i in free_axes)))
    ndims_free = len(free_axes)

    # equation of diagonal: i = j - k
    kdiag_row_start = max(0, -k)
    kdiag_col_start = max(0, k)
    kdiag_row_stop = min(a.shape[axis1], a.shape[axis2] - k)
    len_kdiag = kdiag_row_stop - kdiag_row_start

    if len_kdiag <= 0:
        xp = np

        if is_cupy_type(a._meta):
            import cupy

            xp = cupy

        out_chunks = pop_axes(a.chunks, axis1, axis2) + ((0, ), )
        dsk = dict()
        for free_idx in free_indices:
            shape = tuple(out_chunks[axis][free_idx[axis]]
                          for axis in range(ndims_free))
            dsk[(name, ) + free_idx + (0, )] = (
                partial(xp.empty, dtype=a.dtype),
                shape + (0, ),
            )

        meta = meta_from_array(a, ndims_free + 1)
        return Array(dsk, name, out_chunks, meta=meta)

    # compute row index ranges for chunks along axis1:
    row_stops_ = np.cumsum(a.chunks[axis1])
    row_starts = np.roll(row_stops_, 1)
    row_starts[0] = 0

    # compute column index ranges for chunks along axis2:
    col_stops_ = np.cumsum(a.chunks[axis2])
    col_starts = np.roll(col_stops_, 1)
    col_starts[0] = 0

    # locate first chunk containing diagonal:
    row_blockid = np.arange(a.numblocks[axis1])
    col_blockid = np.arange(a.numblocks[axis2])

    row_filter = (row_starts <= kdiag_row_start) & (kdiag_row_start <
                                                    row_stops_)
    col_filter = (col_starts <= kdiag_col_start) & (kdiag_col_start <
                                                    col_stops_)
    (I, ) = row_blockid[row_filter]
    (J, ) = col_blockid[col_filter]

    # follow k-diagonal through chunks while constructing dask graph:
    dsk = dict()
    i = 0
    kdiag_chunks = ()
    while kdiag_row_start < a.shape[axis1] and kdiag_col_start < a.shape[axis2]:
        # localize block info:
        nrows, ncols = a.chunks[axis1][I], a.chunks[axis2][J]
        kdiag_row_start -= row_starts[I]
        kdiag_col_start -= col_starts[J]
        k = -kdiag_row_start if kdiag_row_start > 0 else kdiag_col_start
        kdiag_row_end = min(nrows, ncols - k)
        kdiag_len = kdiag_row_end - kdiag_row_start

        # increment dask graph:
        for free_idx in free_indices:
            input_idx = (free_idx[:axis1] + (I, ) + free_idx[axis1:axis2 - 1] +
                         (J, ) + free_idx[axis2 - 1:])
            output_idx = free_idx + (i, )
            dsk[(name, ) + output_idx] = (
                np.diagonal,
                (a.name, ) + input_idx,
                k,
                axis1,
                axis2,
            )

        kdiag_chunks += (kdiag_len, )
        # prepare for next iteration:
        i += 1
        kdiag_row_start = kdiag_row_end + row_starts[I]
        kdiag_col_start = min(ncols, nrows + k) + col_starts[J]
        I = I + 1 if kdiag_row_start == row_stops_[I] else I
        J = J + 1 if kdiag_col_start == col_stops_[J] else J

    out_chunks = pop_axes(a.chunks, axis1, axis2) + (kdiag_chunks, )
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[a])
    meta = meta_from_array(a, ndims_free + 1)
    return Array(graph, name, out_chunks, meta=meta)
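
The loop above follows the offset diagonal from block to block, so the output's last-axis chunking reflects how the diagonal crosses the input blocks. A small check, assuming the function above is dask.array.diagonal:

import numpy as np
import dask.array as da

# 4x6 array in 2x3 blocks; the offset-1 diagonal crosses several blocks,
# and the dask result matches NumPy's diagonal exactly.
arr = np.arange(24).reshape(4, 6)
x = da.from_array(arr, chunks=(2, 3))
d = da.diagonal(x, offset=1)
np.testing.assert_array_equal(d.compute(), np.diagonal(arr, offset=1))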