Beispiel #1
def assert_dask_dtypes(ddf, res, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If `numeric_equal`, integer and floating dtypes compare equal. This is
    useful due to the implicit conversion of integer to floating upon
    encountering missingness, which is hard to infer statically.

    Parameters
    ----------
    ddf : object with a ``_meta`` attribute
        Only ``ddf._meta`` is inspected.
    res : object
        The result whose dtype(s) are compared against ``ddf._meta``.
    numeric_equal : bool, optional
        Treat the dtype kinds ``i``, ``f`` and ``u`` as interchangeable.

    Raises
    ------
    AssertionError
        If a pair of dtypes does not match, or (when ``ddf._meta`` has no
        ``dtype``) if ``ddf._meta`` and ``res`` are of different types.
    """
    eq_type_sets = [{"O", "S", "U", "a"}]  # treat object and strings alike
    if numeric_equal:
        eq_type_sets.append({"i", "f", "u"})

    def eq_dtypes(a, b):
        # Equal when both kinds fall into the same equivalence set, or when
        # the dtypes compare equal outright.
        return any(a.kind in eq_types and b.kind in eq_types
                   for eq_types in eq_type_sets) or (a == b)

    if not is_dask_collection(res) and is_dataframe_like(res):
        # One row per column label: (label, meta dtype, result dtype).
        paired = pd.concat([ddf._meta.dtypes, res.dtypes], axis=1)
        for _col, a, b in paired.itertuples():
            assert eq_dtypes(a, b)
    elif not is_dask_collection(res) and (is_index_like(res)
                                          or is_series_like(res)):
        a = ddf._meta.dtype
        b = res.dtype
        assert eq_dtypes(a, b)
    else:
        if hasattr(ddf._meta, "dtype"):
            a = ddf._meta.dtype
            if not hasattr(res, "dtype"):
                # A dtype-less result must be a scalar; derive its dtype
                # from the Python type.
                assert np.isscalar(res)
                b = np.dtype(type(res))
            else:
                b = res.dtype
            assert eq_dtypes(a, b)
        else:
            # No dtype on the metadata: fall back to exact type equality.
            assert type(ddf._meta) == type(res)
Beispiel #2
def test_is_dask_collection():
    """Exercise ``is_dask_collection`` on a delayed value, an int and a stub."""

    class GraphHolder:
        """Stub whose ``__dask_graph__`` returns whatever was passed in."""

        def __init__(self, dsk=None):
            self.dask = dsk

        def __dask_graph__(self):
            return self.dask

    lazy_sum = delayed(1) + 2
    with_graph = GraphHolder({})
    without_graph = GraphHolder()

    assert is_dask_collection(lazy_sum)
    assert not is_dask_collection(2)
    # An empty graph still counts; a ``None`` graph does not.
    assert is_dask_collection(with_graph)
    assert not is_dask_collection(without_graph)
Beispiel #3
def test_is_dask_collection():
    """Exercise ``is_dask_collection`` on a delayed value, an int and a stub."""

    class GraphHolder:
        """Stub whose ``__dask_graph__`` returns whatever was passed in."""

        def __init__(self, dsk=None):
            self.dask = dsk

        def __dask_graph__(self):
            return self.dask

    lazy_sum = delayed(1) + 2
    with_graph = GraphHolder({})
    without_graph = GraphHolder()

    assert is_dask_collection(lazy_sum)
    assert not is_dask_collection(2)
    # An empty graph still counts; a ``None`` graph does not.
    assert is_dask_collection(with_graph)
    assert not is_dask_collection(without_graph)
    # The class itself (as opposed to an instance) is not a collection.
    assert not is_dask_collection(GraphHolder)
Beispiel #4
    def __check_dly_processing_prereq(self, inputs: dict):
        '''Decide whether delayed processing applies to ``inputs``.

        At least one input must be a dask DataFrame type. Output types must
        be specified as cudf.DataFrame or dask_cudf.DataFrame. (Functionality
        could also be extended to support dask dataframe of pandas, but
        currently only cudf/dask_cudf dataframes are supported.)

        :param inputs: mapping whose values are the inputs to inspect.
        :return: True when at least one input value is a ``DaskDataFrame``,
            False otherwise. A warning is emitted when dask collections or
            futures are present but none of them is a ``DaskDataFrame``.
        '''
        import warnings

        # check if dask future or delayed
        ivals = inputs.values()
        if not (any(is_dask_collection(iv) for iv in ivals)
                or any(isinstance(iv, Future) for iv in ivals)):
            # None of the inputs are Delayed or Futures so no intention of
            # using delayed processing. Return False and avoid printing
            # non-applicable warning.
            return False

        use_delayed = any(isinstance(ival, DaskDataFrame) for ival in ivals)

        # NOTE: Currently only support delayed processing when one of the
        #     inputs is a dask_cudf.DataFrame. In the future might generalize
        #     to support dask processing of other delayed/future type inputs.
        if not use_delayed:
            warn_msg = \
                'None of the Node "{}" inputs '\
                'is a dask_cudf.DataFrame. Ignoring '\
                '"delayed_process" setting.'.format(self.uid)
            # The message was previously built and then discarded.
            warnings.warn(warn_msg)

        return use_delayed
Beispiel #5
def is_dask_collection(x):
    """Return whether ``x`` is a dask collection.

    Delegates to ``dask.base.is_dask_collection`` when ``dsk.available`` is
    truthy; otherwise returns ``False`` without importing dask.
    """
    if not dsk.available:
        return False

    # Imported lazily so this module can be loaded without dask.
    from dask.base import is_dask_collection as _dask_is_collection

    return _dask_is_collection(x)
Beispiel #6
    def outer(self, A, B, **kwargs):
        """Apply the wrapped ufunc's ``outer`` to ``A`` and ``B``.

        When neither operand is a dask collection the call is forwarded
        directly to ``self._ufunc.outer``. Otherwise both operands are
        converted with ``asarray`` and the operation is built with
        ``blockwise``, the output indices being those of ``A`` followed by
        those of ``B``.

        Raises
        ------
        ValueError
            If the ufunc is not binary, or if ``out`` is passed.
        NotImplementedError
            If an operand is a dask collection other than ``Array``.
        """
        if self.nin != 2:
            raise ValueError(
                "outer product only supported for binary functions")
        if "out" in kwargs:
            raise ValueError("`out` kwarg not supported")

        A_is_dask = is_dask_collection(A)
        B_is_dask = is_dask_collection(B)
        if not A_is_dask and not B_is_dask:
            return self._ufunc.outer(A, B, **kwargs)
        elif (A_is_dask and not isinstance(A, Array)
              or B_is_dask and not isinstance(B, Array)):
            raise NotImplementedError(
                "Dask objects besides `dask.array.Array` "
                "are not supported at this time.")

        A = asarray(A)
        B = asarray(B)
        ndim = A.ndim + B.ndim
        out_inds = tuple(range(ndim))
        A_inds = out_inds[:A.ndim]
        B_inds = out_inds[A.ndim:]

        # NOTE(review): argument list restored from the upstream dask
        # implementation; confirm against the project's apply_infer_dtype.
        dtype = apply_infer_dtype(self._ufunc.outer, [A, B], kwargs,
                                  "ufunc.outer", suggest_dtype=False)

        # Bind an explicit dtype into the function so it is not also
        # forwarded to blockwise through **kwargs.
        if "dtype" in kwargs:
            func = partial(self._ufunc.outer, dtype=kwargs.pop("dtype"))
        else:
            func = self._ufunc.outer

        return blockwise(
            func,
            out_inds,
            A,
            A_inds,
            B,
            B_inds,
            dtype=dtype,
            token=self.__name__ + ".outer",
            **kwargs,
        )
Beispiel #7
 def __subclasshook__(cls, C):
     """Treat dask collection classes that define ``__daskmeta__`` as subclasses.

     Returns ``True`` when ``cls`` is ``DaskImage``, ``is_dask_collection(C)``
     is truthy and some class in ``C.__mro__`` has ``__daskmeta__`` in its
     ``__dict__``. In every other case ``NotImplemented`` is returned so the
     normal subclass check proceeds.
     """
     if cls is DaskImage:
         try:
             if (is_dask_collection(C) and any("__daskmeta__" in B.__dict__
                                               for B in C.__mro__)):
                 return True
         except AttributeError:
             # C lacks __mro__/__dict__; fall through to the default check.
             pass
     return NotImplemented
Beispiel #8
def finalize(collection):
    """Wrap the post-compute step of ``collection`` in a single ``Delayed``.

    A one-task layer named ``"finalize-<token>"`` applies the function and
    extra arguments from ``collection.__dask_postcompute__()`` to the
    collection's keys. ``collection`` is registered as the only dependency
    of that layer.
    """
    assert is_dask_collection(collection)

    name = "finalize-" + tokenize(collection)
    keys = collection.__dask_keys__()
    # Named differently from this function to avoid shadowing it.
    post_func, post_args = collection.__dask_postcompute__()
    layer = {name: (post_func, keys) + post_args}
    graph = HighLevelGraph.from_collections(name, layer,
                                            dependencies=[collection])
    return Delayed(name, graph)
Beispiel #9
    @classmethod
    def from_collections(cls, name, layer, dependencies=()):
        """Construct a HighLevelGraph from a new layer and a set of collections

        This constructs a HighLevelGraph in the common case where we have a single
        new layer and a set of old collections on which we want to depend.

        This pulls out the ``__dask_layers__()`` method of the collections if
        they exist, and adds them to the dependencies for this new layer.  It
        also merges all of the layers from all of the dependent collections
        together into the new layers for this graph.

        Parameters
        ----------
        name : str
            The name of the new layer
        layer : Mapping
            The graph layer itself
        dependencies : List of Dask collections
            A list of other dask collections (like arrays or dataframes) that
            have graphs themselves

        Raises
        ------
        TypeError
            If an entry of ``dependencies`` is not a dask collection.

        Examples
        --------
        In typical usage we make a new task layer, and then pass that layer
        along with all dependent collections to this method.

        >>> def add(self, other):
        ...     name = 'add-' + tokenize(self, other)
        ...     layer = {(name, i): (add, input_key, other)
        ...              for i, input_key in enumerate(self.__dask_keys__())}
        ...     graph = HighLevelGraph.from_collections(name, layer, dependencies=[self])
        ...     return new_collection(name, graph)
        """
        if len(dependencies) == 1:
            return cls._from_collection(name, layer, dependencies[0])
        layers = {name: layer}
        deps = {name: set()}
        for collection in toolz.unique(dependencies, key=id):
            if is_dask_collection(collection):
                graph = collection.__dask_graph__()
                if isinstance(graph, HighLevelGraph):
                    # Merge the dependency's layers and their dependency
                    # edges, mirroring what _from_collection does.
                    layers.update(graph.layers)
                    deps.update(graph.dependencies)
                    deps[name] |= set(collection.__dask_layers__())
                else:
                    # A plain graph becomes a single layer of its own.
                    key = _get_some_layer_name(collection)
                    layers[key] = graph
                    deps[name].add(key)
                    deps[key] = set()
            else:
                raise TypeError(type(collection))

        return cls(layers, deps)
Beispiel #10
    def concrete_fill(self,
                      *args: Any,
                      weight: Any | None = None,
                      sample: Any | None = None,
                      threads=None) -> Histogram:
        """Fill the histogram with concrete data (not a Dask collection).

        Calls the super class fill function.

        Parameters
        ----------
        *args : array_like
            Provide one value or array per dimension
        weight : array_like, optional
            Provide weights (only if the storage supports them)
        sample : array_like
            Provide samples (only if the storage supports them)
        threads : int, optional
            Fill with threads. Defaults to None, which does not
            activate threaded filling. Using 0 will automatically pick
            the number of available threads (usually two per core).

        Returns
        -------
        Histogram
            Class instance now filled with concrete data.

        Raises
        ------
        TypeError
            If any positional argument or ``weight`` is a Dask collection.

        """
        if any(is_dask_collection(a)
               for a in args) or is_dask_collection(weight):
            raise TypeError(
                "concrete_fill does not support Dask collections, only materialized "
                "data; use the Histogram.fill method.")
        return super().fill(*args,
                            weight=weight,
                            sample=sample,
                            threads=threads)
Beispiel #11
 def __call__(self, a: AggHistogram, b: AggHistogram) -> AggHistogram:
     """Build a lazy ``AggHistogram`` that applies ``self.func`` to ``a`` and ``b``.

     Operands that are dask collections become dependencies of the new
     graph and are referenced in the task through ``__dask_tokenize__()``;
     other operands are embedded in the task as-is. The reference histogram
     is taken from ``a.histref``, falling back to ``b.histref`` when ``a``
     has no such attribute.
     """
     name = f"{self.__name__}-hist-{tokenize(a, b)}"
     # Only dask collections contribute graph dependencies.
     deps = [operand for operand in (a, b) if is_dask_collection(operand)]
     k1 = a.__dask_tokenize__() if is_dask_collection(
         a) else a  # type: ignore
     k2 = b.__dask_tokenize__() if is_dask_collection(
         b) else b  # type: ignore
     llg = {name: (self.func, k1, k2)}
     g = HighLevelGraph.from_collections(name, llg, dependencies=deps)
     try:
         ref = a.histref
     except AttributeError:
         ref = b.histref
     return AggHistogram(g, name, histref=ref)
Beispiel #12
def to_keys(dsk, *args):
    """Yield one graph key per argument, registering the arguments in ``dsk``.

    ``None`` is passed through. A ``da.Array`` is wrapped with ``delayed``
    first. For delayed values the graph is merged into ``dsk`` and the key
    is yielded. Anything else must not be a dask collection; it is stored
    in ``dsk`` under a new ``"array-<token>"`` key.
    """
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, da.Array):
            x = delayed(x)
            # Merge the graph so the yielded key can be resolved from dsk.
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = "array-" + tokenize(x)
            dsk[key] = x
            yield key
Beispiel #13
def to_keys(dsk, *args):
    """Yield one graph key per argument, registering the arguments in ``dsk``.

    ``None`` is passed through. A ``da.Array`` or ``dd.DataFrame`` is
    wrapped with ``delayed`` first. For delayed values the graph is merged
    into ``dsk`` and the key is yielded. Anything else must not be a dask
    collection; it is stored in ``dsk`` under a new
    ``"<type name>-<token>"`` key.
    """
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, (da.Array, dd.DataFrame)):
            x = delayed(x)
            # Merge the graph so the yielded key can be resolved from dsk.
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = type(x).__name__ + "-" + tokenize(x)
            dsk[key] = x
            yield key
Beispiel #14
def to_keys(dsk, *args):
    """Yield one graph key per argument, registering the arguments in ``dsk``.

    ``None`` is passed through. A ``da.Array`` is wrapped with ``delayed``
    first. For delayed values the graph is merged into ``dsk`` and the key
    is yielded. Anything else must not be a dask collection; it is stored
    in ``dsk`` under a new ``'array-<token>'`` key.
    """
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, da.Array):
            x = delayed(x)
            # Merge the graph so the yielded key can be resolved from dsk.
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = 'array-' + tokenize(x)
            dsk[key] = x
            yield key
Beispiel #15
    @classmethod
    def _from_collection(cls, name, layer, collection):
        """`from_collections` optimized for a single collection

        Raises ``TypeError`` when ``collection`` is not a dask collection.
        """
        if not is_dask_collection(collection):
            raise TypeError(type(collection))

        graph = collection.__dask_graph__()
        if isinstance(graph, HighLevelGraph):
            # Copy so the dependency's own graph is left untouched.
            layers = ensure_dict(graph.layers, copy=True)
            layers[name] = layer
            deps = ensure_dict(graph.dependencies, copy=True)
            deps[name] = set(collection.__dask_layers__())
        else:
            # A plain graph becomes a single layer the new layer depends on.
            key = _get_some_layer_name(collection)
            layers = {name: layer, key: graph}
            deps = {name: {key}, key: set()}

        return cls(layers, deps)
Beispiel #16
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.

    Yields
    ------
    One item per entry of ``args``: ``None`` and ``da.Array`` inputs are
    yielded unchanged, other dask collections as a delayed conversion, and
    everything else as the converted value.
    """
    if kwargs.get("allow_scalars", False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, da.Array):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
Beispiel #17
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.

    Yields
    ------
    One item per entry of ``args``: ``None`` and ``da.Array`` inputs are
    yielded unchanged, other dask collections as a delayed conversion, and
    everything else as the converted value.
    """
    if kwargs.get('allow_scalars', False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, da.Array):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
Beispiel #18
def test_custom_collection():
    """Check the ``Tuple`` collection: slots, tokenize, compute and persist."""
    dsk = {'a': 1, 'b': 2}
    dsk2 = {'c': (add, 'a', 'b'),
            'd': (add, 'c', 1)}
    dsk3 = {'e': (add, 'a', 4),
            'f': (inc, 'e')}

    x = Tuple(dsk, ['a', 'b'])
    y = Tuple(dsk2, ['c', 'd'])
    z = Tuple(dsk3, ['e', 'f'])

    # __slots__ defined on base mixin class propagates, so assigning an
    # undeclared attribute must fail
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip('abcdef', range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)
    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask
Beispiel #19
def test_custom_collection():
    """Check the ``Tuple`` collection: slots, tokenize, compute and persist."""
    dsk = {'a': 1, 'b': 2}
    dsk2 = {'c': (add, 'a', 'b'),
            'd': (add, 'c', 1)}
    dsk3 = {'e': (add, 'a', 4),
            'f': (inc, 'e')}

    x = Tuple(dsk, ['a', 'b'])
    y = Tuple(dsk2, ['c', 'd'])
    z = Tuple(dsk3, ['e', 'f'])

    # __slots__ defined on base mixin class propagates, so assigning an
    # undeclared attribute must fail
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip('abcdef', range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)
    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask
Beispiel #20
def test_custom_collection():
    """Check the ``Tuple`` collection: slots, tokenize, compute and persist."""
    dsk = {"a": 1, "b": 2}
    dsk2 = {"c": (add, "a", "b"), "d": (add, "c", 1)}
    dsk3 = {"e": (add, "a", 4), "f": (inc, "e")}

    x = Tuple(dsk, ["a", "b"])
    y = Tuple(dsk2, ["c", "d"])
    z = Tuple(dsk3, ["e", "f"])

    # __slots__ defined on base mixin class propagates, so assigning an
    # undeclared attribute must fail
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip("abcdef", range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)
    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask
Beispiel #21
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task
    ['a', 'b', 3]
    >>> collections
    (Delayed('a'), Delayed('b'))

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task
    (<class 'dict'>, [['a', 1], ['b', 2]])
    >>> collections
    (Delayed('a'), Delayed('b'))
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr, )

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized, )

    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v]
                                                for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections(
            [expr.start, expr.stop, expr.step])
        return (slice, ) + tuple(args), collections

    # is_dataclass() is also true for dataclass *types*; only instances can
    # be rebuilt from their field values, so classes fall through unchanged.
    if is_dataclass(expr) and not isinstance(expr, type):
        args, collections = unpack_collections([
            [f.name, getattr(expr, f.name)] for f in fields(expr)
            if hasattr(expr, f.name)  # if init=False, field might not exist
        ])

        return (apply, typ, (), (dict, args)), collections

    return expr, ()
Beispiel #22
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a
          sample and each column is a specific coordinate for the
          sample.
        * A sequence of dask collections where each collection (e.g.
          array or series) contains all values for one coordinate of
          all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification.

        The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing
          bin edges along each dimension.
        * A single int describing the total number of bins that will
          be used in each dimension (this requires the `range`
          argument to be defined).
        * A sequence of ints describing the total number of bins to be
          used in each dimension (this requires the `range` argument
          to be defined).

        When bins are described by arrays, the rightmost edge is
        included. Bins described by arrays also allows for non-uniform
        bin widths.
    range : tuple(tuple(float, float), ...) optional
        A sequence of length D, each a (min, max) tuple giving the
        outer bin edges to be used if the edges are not given
        explicitly in `bins`. If defined, this argument is required to
        have an entry for each dimension. Unlike
        :func:`numpy.histogramdd`, if `bins` does not define bin
        edges, this argument is required (this function will not
        automatically use the min and max of the value in a given
        dimension because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy
        API (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighing each sample in the input data. The
        chunks of the weights must be identical to the chunking along
        the 0th (row) axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the
        number of samples in each bin. If ``True``, the returned array
        represents the probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If `dh.Histogram`, object based output is enabled.
    storage :
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram.
        We let Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of
        :func:`dask.array.histogramdd`: An array of bin contents and a
        tuple of edges arrays (one for each dimension). If the
        `histogram` argument is used then the return is a
        :obj:`dask_histogram.Histogram` object.

    Raises
    ------
    KeyError
        If `normed` is not None, or if `density` is combined with
        `histogram`.
    ValueError
        If `a` is a sequence containing an entry that is not a dask
        collection.

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in
    each dimension. First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the
    return style of :py:func:`numpy.histogramdd`), we use the
    `histogram` argument to trigger the return of a
    :obj:`dask_histogram.Histogram` object:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h  # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example but with an alternative dataset form
    (a single array with three columns), fixed bin widths, sample
    weights, and usage of the boost-histogram ``Weight()`` storage:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """

    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when returning a "
            "dask-histogram object.")

    # If input is a multidimensional array or dataframe, we wrap it in
    # a tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a)
            and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a, )  # type: ignore
    else:
        # Otherwise `a` is a sequence with one collection per dimension.
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only supports dask "
                    "collections as input")

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values: explicit edges
    # when no range is given for a dimension, a regular axis otherwise.
    axes = []
    for b, r in zip(bins, range):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0],
                                     stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)

    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)
    return hist
Beispiel #23
def is_awkward_like(x: Any) -> bool:
    """Tell whether ``x`` is a dask collection that has a ``_typetracer`` attribute."""
    collection_check = is_dask_collection(x)
    return collection_check and hasattr(x, "_typetracer")
Beispiel #24
def check_meta(x, meta, funcname=None, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to be
        more helpful to users.
    numeric_equal : bool, optional
        If True, integer and floating dtypes compare equal. This is useful due
        to pandas' implicit conversion of integer to floating upon encountering
        missingness, which is hard to infer statically.

    Returns
    -------
    ``x`` itself when it matches ``meta``.

    Raises
    ------
    TypeError
        If ``meta`` is not DataFrame-, Series- or Index-like, or is a dask
        collection.
    ValueError
        If the class or dtype(s) of ``x`` do not match ``meta``.
    """
    eq_types = {"i", "f", "u"} if numeric_equal else set()

    def equal_dtypes(a, b):
        if is_categorical_dtype(a) != is_categorical_dtype(b):
            return False
        # "-" is the fill value for a column missing on one side.
        if isinstance(a, str) and a == "-" or isinstance(b, str) and b == "-":
            return False
        if is_categorical_dtype(a) and is_categorical_dtype(b):
            if UNKNOWN_CATEGORIES in a.categories or UNKNOWN_CATEGORIES in b.categories:
                return True
            return a == b
        return (a.kind in eq_types and b.kind in eq_types) or is_dtype_equal(
            a, b)

    if not (is_dataframe_like(meta) or is_series_like(meta)
            or is_index_like(meta)) or is_dask_collection(meta):
        raise TypeError("Expected partition to be DataFrame, Series, or "
                        "Index, got `%s`" % typename(type(meta)))

    # Notice, we use .__class__ as opposed to type() in order to support
    # object proxies
    if x.__class__ != meta.__class__:
        errmsg = "Expected partition of type `{}` but got `{}`".format(
            typename(type(meta)),
            typename(type(x)),
        )
    elif is_dataframe_like(meta):
        dtypes = pd.concat([x.dtypes, meta.dtypes], axis=1, sort=True)
        bad_dtypes = [(repr(col), a, b)
                      for col, a, b in dtypes.fillna("-").itertuples()
                      if not equal_dtypes(a, b)]
        if bad_dtypes:
            errmsg = "Partition type: `{}`\n{}".format(
                typename(type(meta)),
                asciitable(["Column", "Found", "Expected"], bad_dtypes),
            )
        else:
            check_matching_columns(meta, x)
            return x
    else:
        if equal_dtypes(x.dtype, meta.dtype):
            return x
        errmsg = "Partition type: `{}`\n{}".format(
            typename(type(meta)),
            asciitable(["", "dtype"], [("Found", x.dtype),
                                       ("Expected", meta.dtype)]),
        )

    raise ValueError("Metadata mismatch found%s.\n\n"
                     "%s" %
                     ((" in `%s`" % funcname if funcname else ""), errmsg))
Beispiel #25
    def fill(  # type: ignore
        self,
        *args: DaskCollection,
        weight: DaskCollection | None = None,
        sample: Any | None = None,
        threads: Any | None = None,
    ) -> Histogram:
        """Stage a fill call using a Dask collection as input.

        If materialized NumPy arrays are passed to this function, all
        arguments are forwarded to :func:`concrete_fill`.

        Parameters
        ----------
        *args : one or more Dask collections
            Provide one dask collection per dimension, or a single
            columnar Dask collection (DataFrame or 2D Array) where the
            total number of columns equals to the total number of
            histogram dimensions.

            * A single one dimensional collection
              (:obj:`dask.array.Array` or
              :obj:`dask.dataframe.Series`)
            * Multiple one dimensional collections, each representing
              an array of one coordinate of the dataset to be
              histogrammed.
            * A single two dimensional collection
              (:obj:`dask.array.Array` or
              :obj:`dask.dataframe.DataFrame`), each column
              representing one coordinate of the dataset to be
              histogrammed.

            If multiple one dimensional arguments are passed (i.e. an
            `x` array and a `y` array for a two dimensional
            histogram), the collections must have equal
            chunking/partitioning.

            If a single two dimensional array is passed (i.e. an array
            of shape ``(2000, 3)`` for a three dimensional histogram),
            chunking can only exist along the 0th (row) axis.
            (coordinates cannot be separated by a chunk boundary, only
            whole individual samples can be separated).

        weight : dask.array.Array or dask.dataframe.Series, optional
            Weights associated with each sample. The weights must be
            chunked/partitioned in a way compatible with the dataset.
        sample : dask.array.Array or dask.dataframe.Series, optional
            Provide samples if the histogram storage allows it. The
            partitioning/chunking of the samples must be compatible
            with the input data.
        threads : int, optional
            Ignored argument kept for compatibility with boost-histogram.
            We let Dask have complete control over threads.

        Returns
        -------
        Histogram
            Class instance with a staged (delayed) fill added.

        Raises
        ------
        ValueError
            If a single collection is passed whose ``ndim`` is neither 1
            nor 2.

        """
        # Pass to concrete fill if non-dask-collection
        if all(not is_dask_collection(a) for a in args):
            concrete_kwargs = {"weight": weight, "threads": threads}
            # Forward `sample` only when given, so this call does not
            # depend on concrete_fill accepting that keyword.
            if sample is not None:
                concrete_kwargs["sample"] = sample
            return self.concrete_fill(*args, **concrete_kwargs)

        # At least one argument is a dask collection here, so args is
        # non-empty. One 1D collection, one 2D collection, or several
        # collections are all accepted as-is.
        if len(args) == 1 and args[0].ndim not in (1, 2):
            raise ValueError(f"Cannot interpret input data: {args}")

        new_fill = factory(*args, histref=self, weights=weight, sample=sample)
        if self._staged is not None:
            self._staged += new_fill
        else:
            self._staged = new_fill  # type: ignore

        return self
Beispiel #26
 def is_duck_dask_array(x):
     """Tell whether ``x`` passes both ``is_duck_array`` and ``is_dask_collection``."""
     duck_check = is_duck_array(x)
     return duck_check and is_dask_collection(x)
Beispiel #27
def delayed(obj, name=None, pure=None, nout=None, traverse=True):
    """Wraps a function or object to produce a ``Delayed``.

    ``Delayed`` objects act as proxies for the object they wrap, but all
    operations on them are done lazily by building up a dask graph internally.

    Parameters
    ----------
    obj : object
        The function or object to wrap
    name : string or hashable, optional
        The key to use in the underlying graph for the wrapped object. Defaults
        to hashing content. Note that this only affects the name of the object
        wrapped by this call to delayed, and *not* the output of delayed
        function calls - for that use ``dask_key_name=`` as described below.

        .. note::

           Because this ``name`` is used as the key in task graphs, you should
           ensure that it uniquely identifies ``obj``. If you'd like to provide
           a descriptive name that is still unique, combine the descriptive name
           with :func:`dask.base.tokenize` of the ``array_like``. See
           :ref:`graphs` for more.

    pure : bool, optional
        Indicates whether calling the resulting ``Delayed`` object is a pure
        operation. If True, arguments to the call are hashed to produce
        deterministic keys. If not provided, the default is to check the global
        ``delayed_pure`` setting, and fallback to ``False`` if unset.
    nout : int, optional
        The number of outputs returned from calling the resulting ``Delayed``
        object. If provided, the ``Delayed`` output of the call can be iterated
        into ``nout`` objects, allowing for unpacking of results. By default
        iteration over ``Delayed`` objects will error. Note, that ``nout=1``
        expects ``obj`` to return a tuple of length 1, and consequently for
        ``nout=0``, ``obj`` should return an empty tuple.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``delayed``. For large collections this can be
        expensive. If ``obj`` doesn't contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.

    Returns
    -------
    Delayed
        ``obj`` itself if it already is a ``Delayed``; a ``DelayedLeaf`` when
        unpacking leaves ``obj`` unchanged; otherwise a ``Delayed`` backed by a
        ``HighLevelGraph`` that depends on the collections found in ``obj``.

    Raises
    ------
    ValueError
        If ``nout`` is neither ``None`` nor a non-negative ``int``.

    Examples
    --------
    Apply to functions to delay execution:

    >>> from dask import delayed
    >>> def inc(x):
    ...     return x + 1

    >>> inc(10)
    11

    >>> x = delayed(inc, pure=True)(10)
    >>> type(x) == Delayed
    True
    >>> x.compute()
    11

    Can be used as a decorator:

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> add(1, 2).compute()
    3

    ``delayed`` also accepts an optional keyword ``pure``. If False, then
    subsequent calls will always produce a different ``Delayed``. This is
    useful for non-pure functions (such as ``time`` or ``random``).

    >>> from random import random
    >>> out1 = delayed(random, pure=False)()
    >>> out2 = delayed(random, pure=False)()
    >>> out1.key == out2.key
    False

    If you know a function is pure (output only depends on the input, with no
    global state), then you can set ``pure=True``. This will attempt to apply a
    consistent name to the output, but will fallback on the same behavior of
    ``pure=False`` if this fails.

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> out1 = add(1, 2)
    >>> out2 = add(1, 2)
    >>> out1.key == out2.key
    True

    Instead of setting ``pure`` as a property of the callable, you can also set
    it contextually using the ``delayed_pure`` setting. Note that this
    influences the *call* and not the *creation* of the callable:

    >>> @delayed
    ... def mul(a, b):
    ...     return a * b
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    True
    >>> with dask.config.set(delayed_pure=False):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    False

    The key name of the result of calling a delayed object is determined by
    hashing the arguments by default. To explicitly set the name, you can use
    the ``dask_key_name`` keyword when calling the function:

    >>> add(1, 2)   # doctest: +SKIP
    Delayed('add-...')
    >>> add(1, 2, dask_key_name='three')
    Delayed('three')

    Note that objects with the same key name are assumed to have the same
    result. If you set the names explicitly you should make sure your key names
    are different for different results.

    >>> add(1, 2, dask_key_name='three')
    Delayed('three')
    >>> add(2, 1, dask_key_name='three')
    Delayed('three')
    >>> add(2, 2, dask_key_name='four')
    Delayed('four')

    ``delayed`` can also be applied to objects to make operations on them lazy:

    >>> a = delayed([1, 2, 3])
    >>> isinstance(a, Delayed)
    True
    >>> a.compute()
    [1, 2, 3]

    The key name of a delayed object is hashed by default if ``pure=True`` or
    is generated randomly if ``pure=False`` (default).  To explicitly set the
    name, you can use the ``name`` keyword. To ensure that the key is unique
    you should include the tokenized value as well, or otherwise ensure that
    it's unique:

    >>> from dask.base import tokenize
    >>> data = [1, 2, 3]
    >>> a = delayed(data, name='mylist-' + tokenize(data))
    >>> a  # doctest: +SKIP
    Delayed('mylist-...')

    Delayed results act as a proxy to the underlying object. Many operators
    are supported:

    >>> (a + [1, 2]).compute()
    [1, 2, 3, 1, 2]
    >>> a[1].compute()
    2

    Method and attribute access also works:

    >>> a.count(2).compute()
    1

    Note that if a method doesn't exist, no error will be thrown until runtime:

    >>> res = a.not_a_real_method() # doctest: +SKIP
    >>> res.compute()  # doctest: +SKIP
    AttributeError("'list' object has no attribute 'not_a_real_method'")

    "Magic" methods (e.g. operators and attribute access) are assumed to be
    pure, meaning that subsequent calls must return the same results. This
    behavior is not overrideable through the ``delayed`` call, but can be
    modified using other ways as described below.

    To invoke an impure attribute or operator, you'd need to use it in a
    delayed function with ``pure=False``:

    >>> class Incrementer:
    ...     def __init__(self):
    ...         self._n = 0
    ...     @property
    ...     def n(self):
    ...         self._n += 1
    ...         return self._n
    >>> x = delayed(Incrementer())
    >>> x.n.key == x.n.key
    True
    >>> get_n = delayed(lambda x: x.n, pure=False)
    >>> get_n(x).key == get_n(x).key
    False

    In contrast, methods are assumed to be impure by default, meaning that
    subsequent calls may return different results. To assume purity, set
    ``pure=True``. This allows sharing of any intermediate values.

    >>> a.count(2, pure=True).key == a.count(2, pure=True).key
    True

    As with function calls, method calls also respect the global
    ``delayed_pure`` setting and support the ``dask_key_name`` keyword:

    >>> a.count(2, dask_key_name="count_2")
    Delayed('count_2')
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(a.count(2).key == a.count(2).key)
    True
    """
    # Wrapping is idempotent: an existing Delayed is handed back untouched.
    if isinstance(obj, Delayed):
        return obj

    if is_dask_collection(obj) or traverse:
        task, collections = unpack_collections(obj)
    else:
        # No traversal requested: treat obj as an opaque literal.
        task = quote(obj)
        collections = set()

    # ``type(nout) is int`` (rather than isinstance) deliberately rejects bool.
    if not (nout is None or (type(nout) is int and nout >= 0)):
        # An f-string is used because ``"%s" % nout`` raises TypeError when
        # nout is a tuple, which would mask the intended ValueError.
        raise ValueError(
            f"nout must be None or a non-negative integer, got {nout}")

    if task is obj:
        # Unpacking left obj untouched, so it can be stored as a leaf.
        if not name:
            try:
                prefix = obj.__name__
            except AttributeError:
                prefix = type(obj).__name__
            token = tokenize(obj, nout, pure=pure)
            name = f"{prefix}-{token}"
        return DelayedLeaf(obj, name, pure=pure, nout=nout)

    # obj was rewritten into a task; register it as a one-key layer that
    # depends on every collection discovered while unpacking.
    if not name:
        name = f"{type(obj).__name__}-{tokenize(task, pure=pure)}"
    layer = {name: task}
    graph = HighLevelGraph.from_collections(name,
                                            layer,
                                            dependencies=collections)
    return Delayed(name, graph, nout)
Beispiel #28
def test_custom_collection():
    """Check that ``Tuple`` collections work with dask's collection helpers.

    Covers attribute assignment being rejected, ``is_dask_collection``,
    ``tokenize``, ``get_collection_name``, ``compute``, ``persist`` and
    rebuilding under a new name via ``__dask_postpersist__``.
    """
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    # Collections with 2+ keys must have all keys in the format of tuples where the
    # first element is the same string, referred to as collection name, and the rest are
    # arbitrary hashables
    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {("y", h1): (add, ("x", h1), ("x", h2)), ("y", h2): (add, ("y", h1), 1)}

    # If and only if there is only one top-level key, it can be just a string
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert len({tokenize(coll) for coll in (w, x, y, z)}) == 4

    # get_collection_name
    # A collection without keys has no name to report.
    with pytest.raises(KeyError):
        get_collection_name(w)
    assert get_collection_name(x) == "x"
    assert get_collection_name(y) == "y"
    assert get_collection_name(z) == "z"

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7,)
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    t = w + x + y + z
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)

    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, name="w3")
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, name="x3")
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, name="z3")
    assert z3.compute() == (70,)
Beispiel #29
# NOTE(review): this listing of ``shuffle`` is incomplete and does not parse
# as written. Missing from what is shown here: the parameter list between
# ``def shuffle(`` and the docstring, the end of the docstring (the "See Also"
# entries and the closing quotes), the argument lists of both
# ``rearrange_by_column(`` calls and of ``index.map_partitions(``, and
# presumably an ``else:`` before the first ``index = list(index)``.
# Restore these from the upstream source before relying on this block.
def shuffle(
    """Group DataFrame by index

    Hash grouping of elements. After this operation all elements that have
    the same index will be in the same partition. Note that this requires
    full dataset read, serialization and shuffle. This is expensive. If
    possible you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is not
    deterministic if done in parallel.

    See Also
    # list_like: pandas considers ``index`` list-like and it is not a dask collection.
    list_like = pd.api.types.is_list_like(
        index) and not is_dask_collection(index)
    if shuffle == "tasks" and (isinstance(index, str) or list_like):
        # Avoid creating the "_partitions" column if possible.
        # We currently do this if the user is passing in
        # specific column names (and shuffle == "tasks").
        if isinstance(index, str):
            index = [index]
            index = list(index)
        nset = set(index)
        # Take the shortcut only when every requested label is a column of df.
        if nset & set(df.columns) == nset:
            return rearrange_by_column(

    if not isinstance(index, _Frame):
        if list_like:
            # Make sure we don't try to select with pd.Series/pd.Index
            index = list(index)
        index = df._select_columns_or_index(index)
    elif hasattr(index, "to_frame"):
        # If this is an index, we should still convert to a
        # DataFrame. Otherwise, the hashed values of a column
        # selection will not match (important when merging).
        index = index.to_frame()

    # NOTE(review): the function mapped over the partitions and the remaining
    # keyword arguments of this call are not shown in this listing.
    partitions = index.map_partitions(
        npartitions=npartitions or df.npartitions,
    # NOTE(review): the trailing "=" below looks like the remnant of a
    # following assignment statement that was lost; confirm against upstream.
    df2 = df.assign(_partitions=partitions) =
    df3 = rearrange_by_column(
    # Drop the helper column that was added through assign() above.
    del df3["_partitions"]
    return df3
Beispiel #30
def test_custom_collection():
    """Check that ``Tuple`` collections work with dask's collection helpers.

    Covers attribute assignment being rejected, ``is_dask_collection``,
    ``tokenize``, ``get_collection_names`` (including a collection that
    combines several names), ``compute``, ``persist`` and rebuilding with a
    ``rename`` mapping via ``__dask_postpersist__``.
    """
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1),
    }
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7, )
    assert dask.compute(w, [{"x": x}, y, z]) == (
        (),
        [{"x": (1, 2)}, (3, 4), (7, )],
    )
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)

    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70, )
Beispiel #31
def to_task_dask(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    This function is deprecated in favor of ``unpack_collections`` and emits
    a warning each time it is called (including its own recursive calls).

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        ``Delayed``s, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    dask : a merged dask graph that forms the dag for this task

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, dask = to_task_dask([a, b, 3])  # doctest: +SKIP
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> dict(dask)  # doctest: +SKIP
    {'a': 1, 'b': 2}

    >>> task, dask = to_task_dask({a: 1, b: 2})  # doctest: +SKIP
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> dict(dask)  # doctest: +SKIP
    {'a': 1, 'b': 2}
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    warnings.warn(
        "The dask.delayed.to_task_dask function has been "
        "Deprecated in favor of unpack_collections",
        stacklevel=2,
    )

    if isinstance(expr, Delayed):
        return expr.key, expr.dask

    if is_dask_collection(expr):
        # Add a single "finalize" task that turns the collection's keys into
        # its computed result, on top of the (optionally optimized) graph.
        name = "finalize-" + tokenize(expr, pure=True)
        keys = expr.__dask_keys__()
        opt = getattr(expr, "__dask_optimize__", dont_optimize)
        finalize, args = expr.__dask_postcompute__()
        dsk = {name: (finalize, keys) + args}
        dsk.update(opt(expr.__dask_graph__(), keys))
        return name, dsk

    if isinstance(expr, Iterator):
        # Materialize so the elements can be inspected one by one below.
        expr = list(expr)
    typ = type(expr)

    if typ in (list, tuple, set):
        args, dasks = unzip((to_task_dask(e) for e in expr), 2)
        args = list(args)
        dsk = merge(dasks)
        # Ensure output type matches input type
        return (args, dsk) if typ is list else ((typ, args), dsk)

    if typ is dict:
        args, dsk = to_task_dask([[k, v] for k, v in expr.items()])
        return (dict, args), dsk

    # ``is_dataclass`` is also true for dataclass *classes*; only instances
    # carry field values that can be unpacked and rebuilt.
    if is_dataclass(expr) and not isinstance(expr, type):
        args, dsk = to_task_dask([
            [f.name, getattr(expr, f.name)]
            for f in fields(expr)
            if hasattr(expr, f.name)  # if init=False, field might not exist
        ])

        return (apply, typ, (), (dict, args)), dsk

    if typ is slice:
        args, dsk = to_task_dask([expr.start, expr.stop, expr.step])
        return (slice, ) + tuple(args), dsk

    # Anything else is passed through as a literal with no graph attached.
    return expr, {}