Example #1
0
class HLGCollection(DaskMethodsMixin):
    """A dask collection implementing the high-level-graph protocol by
    delegating every protocol hook to a wrapped HLG-capable collection.
    """

    # ``compute()`` dispatches to ``get1`` as the scheduler.
    __dask_scheduler__ = staticmethod(get1)

    # Both the configured default and the falsey fallback are
    # ``dont_optimize``, so graph optimization is a no-op either way.
    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="hlgcollection_optim",
        falsey=dont_optimize,
    )

    def __init__(self, based_on: HLGDaskCollection) -> None:
        # The collection whose graph/layers/keys are exposed as our own.
        self.based_on = based_on

    def __dask_graph__(self) -> Mapping:
        """Return the task graph of the underlying collection."""
        return self.based_on.__dask_graph__()

    def __dask_layers__(self) -> Sequence[str]:
        """Return the HLG layer names of the underlying collection."""
        return self.based_on.__dask_layers__()

    def __dask_keys__(self) -> list[Hashable]:
        """Return the output keys of the underlying collection."""
        return self.based_on.__dask_keys__()

    def __dask_postcompute__(self) -> tuple[PostComputeCallable, tuple]:
        """Computed results are finalized by ``finalize`` with no extra args."""
        return finalize, ()

    def __dask_postpersist__(self) -> tuple[PostPersistCallable, tuple]:
        """Rebuilding after persist is delegated to the wrapped collection."""
        return self.based_on.__dask_postpersist__()

    def __dask_tokenize__(self) -> Hashable:
        """Deterministic token derived from the wrapped collection."""
        return tokenize(self.based_on)
Example #2
0
class Inheriting(DaskCollection):
    """Dask collection satisfying the ``DaskCollection`` interface by
    forwarding the protocol hooks to an underlying collection.
    """

    # ``compute()`` uses the threaded scheduler by default.
    __dask_scheduler__ = staticmethod(dask.threaded.get)

    # Default and falsey optimizers are both ``dont_optimize``; no
    # graph optimization takes place.
    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="hlgcollection_optim",
        falsey=dont_optimize,
    )

    def __init__(self, based_on: DaskCollection) -> None:
        # Underlying collection supplying graph, keys, and persist logic.
        self.based_on = based_on

    def __dask_graph__(self) -> Mapping:
        """Return the wrapped collection's task graph."""
        return self.based_on.__dask_graph__()

    def __dask_keys__(self) -> list[Hashable]:
        """Return the wrapped collection's output keys."""
        return self.based_on.__dask_keys__()

    def __dask_postcompute__(self) -> tuple[PostComputeCallable, tuple]:
        """Computed results are finalized by ``finalize`` with no extra args."""
        return finalize, ()

    def __dask_postpersist__(self) -> tuple[PostPersistCallable, tuple]:
        """Rebuilding after persist is delegated to the wrapped collection."""
        return self.based_on.__dask_postpersist__()

    def __dask_tokenize__(self) -> Hashable:
        """Deterministic token derived from the wrapped collection."""
        return tokenize(self.based_on)

    def compute(self, **kwargs) -> Any:
        """Compute this collection via :func:`dask.compute`."""
        return dask.compute(self, **kwargs)

    def persist(self, **kwargs) -> Inheriting:
        """Persist the wrapped collection and rewrap the result."""
        persisted = self.based_on.persist(**kwargs)
        return Inheriting(persisted)

    def visualize(
        self,
        filename: str = "mydask",
        format: str | None = None,
        optimize_graph: bool = False,
        **kwargs: Any,
    ) -> DisplayObject | None:
        """Render the task graph via :func:`dask.visualize`."""
        return dask.visualize(
            self,
            filename=filename,
            format=format,
            optimize_graph=optimize_graph,
            **kwargs,
        )
class DatasetCollection(DaskMethodsMixin):
    """Dask collection representing a dataset split into ``n_chunk``
    chunks along ``slice_dim``.
    """

    def __init__(self, dask, name, n_chunk, slice_dim):
        # Normalize the incoming graph to a ShareDict, registering it
        # under ``name`` when it is a plain mapping.
        if isinstance(dask, ShareDict):
            graph = dask
        else:
            graph = ShareDict()
            graph.update_with_key(dask, key=name)
        self.dask = graph
        self.name = name
        self.n_chunk = n_chunk
        self.slice_dim = slice_dim

    def __dask_graph__(self):
        """Return the task graph."""
        return self.dask

    def __dask_keys__(self):
        """One output key per chunk: ``(name, i)``."""
        return [(self.name, i) for i in range(self.n_chunk)]

    __dask_optimize__ = globalmethod(optimize,
                                     key='array_optimize',
                                     falsey=dont_optimize)
    __dask_scheduler__ = staticmethod(dask.threaded.get)

    def __dask_postcompute__(self):
        """Finalize results; ``finalize`` receives ``slice_dim`` as its
        extra argument."""
        return finalize, (self.slice_dim,)

    def __dask_postpersist__(self):
        """Rebuild via the constructor with the same metadata."""
        return DatasetCollection, (self.name, self.n_chunk, self.slice_dim)

    def __dask_tokenize__(self):
        """The collection name doubles as the token."""
        return self.name

    @property
    def numblocks(self):
        """Single-dimension block count, ``(n_chunk,)``."""
        return (self.n_chunk,)

    def __add__(self, other):
        """Element-wise addition via ``elemwise``."""
        return elemwise(operator.add, self, other)

    def slice(self, slice_dim, index):
        """Slice along a dimension other than ``slice_dim``."""
        assert slice_dim != self.slice_dim  # TODO implement
        return elemwise(do_slice, self, slice_dim, index)

    def concatenate(self, dim, other):
        """Concatenate with ``other`` along a non-chunked dimension."""
        assert dim != self.slice_dim  # TODO implement
        return elemwise(concatenate, dim, self, other)
Example #4
0
class Foo(object):
    # NOTE(review): `globalmethod` appears to dispatch through a global
    # config key ('f'), falling back to this body — confirm against the
    # `globalmethod` implementation.
    @globalmethod(key='f')
    def f():
        return 1

    # Same dispatch under key 'g'; `bar` is used when the key is falsey.
    g = globalmethod(foo, key='g', falsey=bar)
Example #5
0
class Foo:
    # NOTE(review): `globalmethod` presumably resolves the callable via
    # the config key "f", defaulting to this body — verify against the
    # `globalmethod` definition.
    @globalmethod(key="f")
    def f():
        return 1

    # Dispatched under key "g"; `bar` serves as the falsey fallback.
    g = globalmethod(foo, key="g", falsey=bar)
Example #6
0
class PartitionedHistogram(DaskMethodsMixin):
    """Partitioned Histogram collection.

    The class constructor is typically used internally;
    :py:func:`dask_histogram.factory` is recommended for users (along
    with the `dask_histogram.routines` module).

    Parameters
    ----------
    dsk : HighLevelGraph
        Task graph producing one partial histogram per partition.
    name : str
        Collection name; also the prefix of every output key.
    npartitions : int
        Number of partitions.
    histref : bh.Histogram
        Reference boost-histogram object.

    See Also
    --------
    dask_histogram.factory
    dask_histogram.AggHistogram

    """
    def __init__(self, dsk: HighLevelGraph, name: str, npartitions: int,
                 histref: bh.Histogram) -> None:
        self._dask: HighLevelGraph = dsk
        self._name: str = name
        self._npartitions: int = npartitions
        self._histref: bh.Histogram = histref

    @property
    def name(self) -> str:
        """str: Name of the collection."""
        return self._name

    @property
    def dask(self) -> HighLevelGraph:
        """HighLevelGraph: Task graph of the collection."""
        return self._dask

    @property
    def npartitions(self) -> int:
        """int: Number of partitions."""
        return self._npartitions

    def __dask_graph__(self) -> HighLevelGraph:
        return self.dask

    def __dask_keys__(self) -> list[tuple[str, int]]:
        # One output key per partition: (name, 0) ... (name, n - 1).
        return [(self.name, i) for i in range(self.npartitions)]

    def __dask_layers__(self) -> tuple[str]:
        return (self.name, )

    def __dask_tokenize__(self) -> str:
        # The name is unique per graph and serves as the token.
        return self.name

    def __dask_postcompute__(self) -> Any:
        return _finalize_partitioned_histogram, ()

    def __dask_postpersist__(self) -> Any:
        # Fix: this hook was missing, so DaskMethodsMixin.persist() had
        # no way to rebuild the collection even though ``_rebuild`` is
        # defined below (mirrors AggHistogram.__dask_postpersist__).
        return self._rebuild, ()

    def _rebuild(self, dsk: Any, *, rename: Any = None) -> Any:
        # Reconstruct around a new (persisted/optimized) graph, honoring
        # any key renaming requested by the scheduler.
        name = self.name
        if rename:
            name = rename.get(name, name)
        return type(self)(dsk, name, self.npartitions, self.histref)

    __dask_optimize__ = globalmethod(optimize,
                                     key="histogram_optimize",
                                     falsey=dont_optimize)

    __dask_scheduler__ = staticmethod(tget)

    def __str__(self) -> str:
        return "dask_histogram.PartitionedHistogram,<%s, npartitions=%d>" % (
            key_split(self.name),
            self.npartitions,
        )

    __repr__ = __str__

    def __reduce__(self):
        # Pickle support: rebuild from the constructor arguments.
        return (
            PartitionedHistogram,
            (
                self._dask,
                self._name,
                self._npartitions,
                self._histref,
            ),
        )

    @property
    def histref(self) -> bh.Histogram:
        """boost_histogram.Histogram: reference histogram."""
        return self._histref

    def collapse(self, split_every: int | None = None) -> AggHistogram:
        """Translate into a reduced aggregated histogram."""
        return _reduction(self, split_every=split_every)

    def to_delayed(self, optimize_graph: bool = True) -> list[Delayed]:
        """Convert to a list of :class:`Delayed` objects, one per partition."""
        keys = self.__dask_keys__()
        graph = self.__dask_graph__()
        layer = self.__dask_layers__()[0]
        if optimize_graph:
            # Optimization may rewrite the graph, so wrap the result in
            # a fresh single-layer HighLevelGraph under a new layer name.
            graph = self.__dask_optimize__(graph, keys)
            layer = f"delayed-{self.name}"
            graph = HighLevelGraph.from_collections(layer,
                                                    graph,
                                                    dependencies=())
        return [Delayed(k, graph, layer=layer) for k in keys]
Example #7
0
class AggHistogram(DaskMethodsMixin):
    """Aggregated Histogram collection.

    The class constructor is typically used internally;
    :py:func:`dask_histogram.factory` is recommended for users (along
    with the `dask_histogram.routines` module).

    See Also
    --------
    dask_histogram.factory

    """
    def __init__(
        self,
        dsk: HighLevelGraph,
        name: str,
        histref: bh.Histogram,
        layer: Any | None = None,
    ) -> None:
        self._dask: HighLevelGraph = dsk
        self._name: str = name
        self._histref: bh.Histogram = histref

        # NOTE: Layer only used by `Item.from_delayed`, to handle
        # Delayed objects created by other collections. e.g.:
        # Item.from_delayed(da.ones(1).to_delayed()[0]) See
        # Delayed.__init__
        self._layer = layer or name
        # Validate eagerly: a layer name that is absent from the HLG
        # would only fail later, at compute time.
        if isinstance(dsk, HighLevelGraph) and self._layer not in dsk.layers:
            raise ValueError(
                f"Layer {self._layer} not in the HighLevelGraph's layers: {list(dsk.layers)}"
            )

    def __dask_graph__(self) -> HighLevelGraph:
        """Return the task graph."""
        return self._dask

    def __dask_keys__(self) -> list[str]:
        """Single output key: the collection name."""
        return [self.name]

    def __dask_layers__(self) -> tuple[str, ...]:
        # A single-layer HLG may carry a layer name that differs from
        # ``self.name`` (e.g. after optimization); prefer the actual
        # layer name in that case.
        if isinstance(self._dask, HighLevelGraph) and len(
                self._dask.layers) == 1:
            return tuple(self._dask.layers)
        return (self.name, )

    def __dask_tokenize__(self) -> str:
        """The collection name serves as the token."""
        return self.name

    def __dask_postcompute__(self) -> Any:
        return _finalize_agg_histogram, ()

    def __dask_postpersist__(self) -> Any:
        return self._rebuild, ()

    __dask_optimize__ = globalmethod(optimize,
                                     key="histogram_optimize",
                                     falsey=dont_optimize)

    __dask_scheduler__ = staticmethod(tget)

    def _rebuild(
        self,
        dsk: HighLevelGraph,
        *,
        rename: Mapping[str, str] | None = None,
    ) -> Any:
        """Rebuild the collection around a new graph, applying ``rename``.

        NOTE(review): ``self._layer`` is not propagated here, so the
        rebuilt object falls back to ``layer=name`` — confirm this is
        intentional for persisted graphs.
        """
        name = self._name
        if rename:
            name = rename.get(name, name)
        return type(self)(dsk, name, self.histref)

    @property
    def name(self) -> str:
        """str: Name of the collection."""
        return self._name

    @property
    def dask(self) -> HighLevelGraph:
        """HighLevelGraph: Task graph of the collection."""
        return self._dask

    @property
    def histref(self) -> bh.Histogram:
        """Empty reference boost-histogram object."""
        return self._histref

    @property
    def _storage_type(self) -> type[bh.storage.Storage]:
        """Storage type of the histogram."""
        return self.histref._storage_type

    @property
    def ndim(self) -> int:
        """Total number of dimensions."""
        return self.histref.ndim

    @property
    def shape(self) -> tuple[int, ...]:
        """Shape of the histogram as an array."""
        return self.histref.shape

    @property
    def size(self) -> int:
        """Size of the histogram."""
        return self.histref.size

    def __str__(self) -> str:
        return ("dask_histogram.AggHistogram<"
                f"{key_split(self.name)}, "
                f"ndim={self.ndim}, "
                f"storage={self._storage_type()}"
                ">")

    __repr__ = __str__

    def __reduce__(self):
        # Pickle support; the optional ``layer`` argument is dropped,
        # matching ``_rebuild``'s behavior.
        return (AggHistogram, (self._dask, self._name, self._histref))

    def to_dask_array(
        self,
        flow: bool = False,
        dd: bool = False
    ) -> tuple[DaskArray, ...] | tuple[DaskArray, list[DaskArray]]:
        """Convert histogram object to dask.array form.

        Parameters
        ----------
        flow : bool
            Include the flow bins.
        dd : bool
            Use the histogramdd return syntax, where the edges are in a tuple.
            Otherwise, this is the histogram/histogram2d return style.

        Returns
        -------
        contents : dask.array.Array
            The bin contents
        *edges : dask.array.Array
            The edges for each dimension

        """
        return to_dask_array(self, flow=flow, dd=dd)

    def to_boost(self) -> bh.Histogram:
        """Convert to a boost_histogram.Histogram via computation.

        This is an alias of `.compute()`.

        """
        return self.compute()

    def to_delayed(self, optimize_graph: bool = True) -> Delayed:
        """Convert to a single :class:`Delayed` object."""
        keys = self.__dask_keys__()
        graph = self.__dask_graph__()
        layer = self.__dask_layers__()[0]
        if optimize_graph:
            # Optimization may rewrite the graph; wrap the result in a
            # fresh single-layer HighLevelGraph under a new layer name.
            graph = self.__dask_optimize__(graph, keys)
            layer = f"delayed-{self.name}"
            graph = HighLevelGraph.from_collections(layer,
                                                    graph,
                                                    dependencies=())
        return Delayed(keys[0], graph, layer=layer)

    def values(self, flow: bool = False) -> NDArray[Any]:
        """Bin contents of the computed histogram (triggers compute)."""
        return self.to_boost().values(flow=flow)

    def variances(self, flow: bool = False) -> NDArray[Any] | None:
        """Bin variances of the computed histogram (triggers compute)."""
        return self.to_boost().variances(flow=flow)

    def counts(self, flow: bool = False) -> NDArray[Any]:
        """Bin counts of the computed histogram (triggers compute)."""
        return self.to_boost().counts(flow=flow)

    def __array__(self) -> NDArray[Any]:
        # numpy conversion triggers a full compute.
        return self.compute().__array__()

    # Arithmetic: the binary operators all delegate to the in-place
    # variants, which build new lazy AggHistograms via the module-level
    # _iadd/_isub/_itruediv/_imul helpers.
    def __iadd__(self, other) -> AggHistogram:
        return _iadd(self, other)

    def __add__(self, other: Any) -> AggHistogram:
        return self.__iadd__(other)

    def __radd__(self, other: Any) -> AggHistogram:
        return self.__iadd__(other)

    def __isub__(self, other: Any) -> AggHistogram:
        return _isub(self, other)

    def __sub__(self, other: Any) -> AggHistogram:
        return self.__isub__(other)

    def __itruediv__(self, other: Any) -> AggHistogram:
        return _itruediv(self, other)

    def __truediv__(self, other: Any) -> AggHistogram:
        return self.__itruediv__(other)

    def __idiv__(self, other: Any) -> AggHistogram:
        return self.__itruediv__(other)

    def __div__(self, other: Any) -> AggHistogram:
        return self.__idiv__(other)

    def __imul__(self, other: Any) -> AggHistogram:
        return _imul(self, other)

    def __mul__(self, other: Any) -> AggHistogram:
        return self.__imul__(other)

    def __rmul__(self, other: Any) -> AggHistogram:
        return self.__mul__(other)
Example #8
0
class Delayed(DaskMethodsMixin, OperatorMethodMixin):
    """Represents a value to be computed by dask.

    Equivalent to the output from a single key in a dask graph.
    """

    # __slots__ keeps instances dict-free; __setattr__ below relies on
    # this to enforce immutability.
    __slots__ = ("_key", "_dask", "_length", "_layer")

    def __init__(self, key, dsk, length=None, layer=None):
        self._key = key
        self._dask = dsk
        self._length = length

        # NOTE: Layer is used by `to_delayed` in other collections, but not in normal Delayed use
        self._layer = layer or key
        # Validate eagerly: a layer absent from the HLG would only fail
        # later, at compute time.
        if isinstance(dsk, HighLevelGraph) and self._layer not in dsk.layers:
            raise ValueError(
                f"Layer {self._layer} not in the HighLevelGraph's layers: {list(dsk.layers)}"
            )

    @property
    def key(self):
        """The single graph key this object represents."""
        return self._key

    @property
    def dask(self):
        """The task graph."""
        return self._dask

    def __dask_graph__(self):
        return self.dask

    def __dask_keys__(self):
        return [self.key]

    def __dask_layers__(self):
        return (self._layer, )

    def __dask_tokenize__(self):
        # The key is already unique per graph, so it serves as the token.
        return self.key

    __dask_scheduler__ = staticmethod(threaded.get)
    __dask_optimize__ = globalmethod(optimize, key="delayed_optimize")

    def __dask_postcompute__(self):
        return single_key, ()

    def __dask_postpersist__(self):
        return self._rebuild, ()

    def _rebuild(self, dsk, *, rename=None):
        # Rebuild around a new graph, applying any key renaming.
        key = replace_name_in_key(self.key, rename) if rename else self.key
        if isinstance(dsk, HighLevelGraph) and len(dsk.layers) == 1:
            # FIXME Delayed is currently the only collection type that supports both high- and low-level graphs.
            # The HLG output of `optimize` will have a layer name that doesn't match `key`.
            # Remove this when Delayed is HLG-only (because `optimize` will only be passed HLGs, so it won't have
            # to generate random layer names).
            layer = next(iter(dsk.layers))
        else:
            layer = None
        return Delayed(key, dsk, self._length, layer=layer)

    def __repr__(self):
        return f"Delayed({repr(self.key)})"

    def __hash__(self):
        return hash(self.key)

    def __dir__(self):
        # Only class-level attributes: __getattr__ fabricates everything
        # else lazily, so instance attribute listing is not meaningful.
        return dir(type(self))

    def __getattr__(self, attr):
        """Turn attribute access into a lazy ``DelayedAttr``."""
        if attr.startswith("_"):
            raise AttributeError(f"Attribute {attr} not found")

        if attr == "visualise":
            # added to warn users in case of spelling error
            # for more details: https://github.com/dask/dask/issues/5721
            warnings.warn("dask.delayed objects have no `visualise` method. "
                          "Perhaps you meant `visualize`?")

        return DelayedAttr(self, attr)

    def __setattr__(self, attr, val):
        """Reject attribute assignment: Delayed objects are immutable."""
        try:
            object.__setattr__(self, attr, val)
        except AttributeError:
            # attr is neither in type(self).__slots__ nor in the __slots__ of any of its
            # parent classes, and all the parent classes define __slots__ too.
            # This last bit needs to be unit tested: if any of the parent classes omit
            # the __slots__ declaration, self will gain a __dict__ and this branch will
            # become unreachable.
            raise TypeError("Delayed objects are immutable")

    def __setitem__(self, index, val):
        raise TypeError("Delayed objects are immutable")

    def __iter__(self):
        # Iteration requires an explicit length (items are produced by
        # lazy __getitem__ calls, so the count cannot be inferred).
        if self._length is None:
            raise TypeError(
                "Delayed objects of unspecified length are not iterable")
        for i in range(self._length):
            yield self[i]

    def __len__(self):
        if self._length is None:
            raise TypeError(
                "Delayed objects of unspecified length have no len()")
        return self._length

    def __call__(self, *args, pure=None, dask_key_name=None, **kwargs):
        """Lazily apply this delayed callable to ``args``/``kwargs``."""
        func = delayed(apply, pure=pure)
        if dask_key_name is not None:
            return func(self, args, kwargs, dask_key_name=dask_key_name)
        return func(self, args, kwargs)

    def __bool__(self):
        # Truthiness would require computing the value; forbid it.
        raise TypeError("Truth of Delayed objects is not supported")

    __nonzero__ = __bool__

    def __get__(self, instance, cls):
        # Descriptor protocol: lets a Delayed stored on a class behave
        # like a bound method when accessed through an instance.
        if instance is None:
            return self
        return types.MethodType(self, instance)

    @classmethod
    def _get_binary_operator(cls, op, inv=False):
        # Operators become lazy calls; `right` swaps operands for the
        # reflected (r-) variants.
        method = delayed(right(op) if inv else op, pure=True)
        return lambda *args, **kwargs: method(*args, **kwargs)

    _get_unary_operator = _get_binary_operator