class HLGCollection(DaskMethodsMixin):
    """Dask collection backed by a high-level-graph collection.

    Every piece of the dask collection protocol is forwarded to the
    wrapped ``based_on`` collection; only post-compute and the
    scheduler/optimizer hooks are supplied locally.
    """

    def __init__(self, based_on: HLGDaskCollection) -> None:
        self.based_on = based_on

    def __dask_graph__(self) -> Mapping:
        # Forward the task graph of the wrapped collection.
        delegate = self.based_on
        return delegate.__dask_graph__()

    def __dask_layers__(self) -> Sequence[str]:
        # Forward the high-level-graph layer names.
        delegate = self.based_on
        return delegate.__dask_layers__()

    def __dask_keys__(self) -> list[Hashable]:
        delegate = self.based_on
        return delegate.__dask_keys__()

    def __dask_postcompute__(self) -> tuple[PostComputeCallable, tuple]:
        # Finalize with no extra positional arguments.
        return (finalize, ())

    def __dask_postpersist__(self) -> tuple[PostPersistCallable, tuple]:
        return self.based_on.__dask_postpersist__()

    def __dask_tokenize__(self) -> Hashable:
        return tokenize(self.based_on)

    # Scheduler and (deliberately no-op) optimizer hooks.
    __dask_scheduler__ = staticmethod(get1)
    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="hlgcollection_optim",
        falsey=dont_optimize,
    )
class Inheriting(DaskCollection):
    """Dask collection implemented by subclassing ``DaskCollection``.

    The protocol methods delegate to the wrapped ``based_on``
    collection; ``compute``/``persist``/``visualize`` are implemented
    explicitly instead of being inherited from a mixin.
    """

    def __init__(self, based_on: DaskCollection) -> None:
        self.based_on = based_on

    def __dask_graph__(self) -> Mapping:
        delegate = self.based_on
        return delegate.__dask_graph__()

    def __dask_keys__(self) -> list[Hashable]:
        delegate = self.based_on
        return delegate.__dask_keys__()

    def __dask_postcompute__(self) -> tuple[PostComputeCallable, tuple]:
        # Finalize with no extra positional arguments.
        return (finalize, ())

    def __dask_postpersist__(self) -> tuple[PostPersistCallable, tuple]:
        return self.based_on.__dask_postpersist__()

    def __dask_tokenize__(self) -> Hashable:
        return tokenize(self.based_on)

    __dask_scheduler__ = staticmethod(dask.threaded.get)
    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="hlgcollection_optim",
        falsey=dont_optimize,
    )

    def compute(self, **kwargs) -> Any:
        return dask.compute(self, **kwargs)

    def persist(self, **kwargs) -> Inheriting:
        # Re-wrap the persisted delegate so the result is still Inheriting.
        return Inheriting(self.based_on.persist(**kwargs))

    def visualize(
        self,
        filename: str = "mydask",
        format: str | None = None,
        optimize_graph: bool = False,
        **kwargs: Any,
    ) -> DisplayObject | None:
        return dask.visualize(
            self,
            filename=filename,
            format=format,
            optimize_graph=optimize_graph,
            **kwargs,
        )
class DatasetCollection(DaskMethodsMixin):
    """Toy chunked-dataset collection, chunked along a single dimension.

    One task key per chunk; arithmetic and slicing are expressed via
    ``elemwise``.
    """

    def __init__(self, dask, name, n_chunk, slice_dim):
        # Normalize a plain mapping into a ShareDict keyed by this
        # collection's name.
        if not isinstance(dask, ShareDict):
            merged = ShareDict()
            merged.update_with_key(dask, key=name)
            dask = merged
        self.dask = dask
        self.name = name
        self.n_chunk = n_chunk
        self.slice_dim = slice_dim

    def __dask_graph__(self):
        return self.dask

    def __dask_keys__(self):
        # One key per chunk: (name, 0), (name, 1), ...
        return [(self.name, i) for i in range(self.n_chunk)]

    __dask_optimize__ = globalmethod(optimize, key='array_optimize',
                                     falsey=dont_optimize)
    __dask_scheduler__ = staticmethod(dask.threaded.get)

    def __dask_postcompute__(self):
        return finalize, (self.slice_dim,)

    def __dask_postpersist__(self):
        # Rebuild by calling the constructor with the persisted graph.
        return DatasetCollection, (self.name, self.n_chunk, self.slice_dim)

    def __dask_tokenize__(self):
        return self.name

    @property
    def numblocks(self):
        # Single chunked dimension.
        return (self.n_chunk,)

    def __add__(self, other):
        return elemwise(operator.add, self, other)

    def slice(self, slice_dim, index):
        assert slice_dim != self.slice_dim  # TODO implement
        return elemwise(do_slice, self, slice_dim, index)

    def concatenate(self, dim, other):
        assert dim != self.slice_dim  # TODO implement
        return elemwise(concatenate, dim, self, other)
class Foo:
    """Fixture exercising ``globalmethod`` both as a decorator and a
    plain class attribute.

    Idiom fix: dropped the redundant ``object`` base — all classes are
    new-style in Python 3, and the sibling definition of this fixture
    already uses the bare ``class Foo:`` form.
    """

    @globalmethod(key='f')
    def f():
        # Default implementation, swappable via the config key 'f'.
        return 1

    # Swappable via config key 'g'; falls back to ``bar`` when the
    # configured value is falsey.
    g = globalmethod(foo, key='g', falsey=bar)
class Foo:
    """Fixture exercising ``globalmethod`` both as a decorator and as a
    plain class attribute.
    """

    @globalmethod(key="f")
    def f():
        # Default implementation, swappable via the config key "f".
        return 1

    # Swappable via config key "g"; falls back to ``bar`` when the
    # configured value is falsey.
    g = globalmethod(foo, key="g", falsey=bar)
class PartitionedHistogram(DaskMethodsMixin):
    """Partitioned Histogram collection.

    The class constructor is typically used internally;
    :py:func:`dask_histogram.factory` is recommended for users (along
    with the `dask_histogram.routines` module).

    See Also
    --------
    dask_histogram.factory
    dask_histogram.AggHistogram

    """

    def __init__(self, dsk: HighLevelGraph, name: str, npartitions: int, histref: bh.Histogram) -> None:
        self._dask: HighLevelGraph = dsk
        self._name: str = name
        self._npartitions: int = npartitions
        self._histref: bh.Histogram = histref

    @property
    def name(self) -> str:
        return self._name

    @property
    def dask(self) -> HighLevelGraph:
        return self._dask

    @property
    def npartitions(self) -> int:
        return self._npartitions

    def __dask_graph__(self) -> HighLevelGraph:
        return self.dask

    def __dask_keys__(self) -> list[tuple[str, int]]:
        # One key per partition: (name, 0), (name, 1), ...
        return [(self.name, i) for i in range(self.npartitions)]

    def __dask_layers__(self) -> tuple[str]:
        return (self.name,)

    def __dask_tokenize__(self) -> str:
        return self.name

    def __dask_postcompute__(self) -> Any:
        return _finalize_partitioned_histogram, ()

    def __dask_postpersist__(self) -> Any:
        # BUGFIX: this hook was missing, so DaskMethodsMixin.persist()
        # could not rebuild the collection even though `_rebuild` was
        # already defined (and unused). Mirrors AggHistogram.
        return self._rebuild, ()

    def _rebuild(self, dsk: Any, *, rename: Any = None) -> Any:
        # Rebuild after persist/optimize; `rename` maps old->new names.
        name = self.name
        if rename:
            name = rename.get(name, name)
        return type(self)(dsk, name, self.npartitions, self.histref)

    __dask_optimize__ = globalmethod(optimize, key="histogram_optimize", falsey=dont_optimize)
    __dask_scheduler__ = staticmethod(tget)

    def __str__(self) -> str:
        return "dask_histogram.PartitionedHistogram,<%s, npartitions=%d>" % (
            key_split(self.name),
            self.npartitions,
        )

    __repr__ = __str__

    def __reduce__(self):
        # Pickle support: re-invoke the constructor with stored state.
        return (
            PartitionedHistogram,
            (
                self._dask,
                self._name,
                self._npartitions,
                self._histref,
            ),
        )

    @property
    def histref(self) -> bh.Histogram:
        """boost_histogram.Histogram: reference histogram."""
        return self._histref

    def collapse(self, split_every: int | None = None) -> AggHistogram:
        """Translate into a reduced aggregated histogram."""
        return _reduction(self, split_every=split_every)

    def to_delayed(self, optimize_graph: bool = True) -> list[Delayed]:
        """Convert to a list of one ``Delayed`` object per partition.

        Parameters
        ----------
        optimize_graph : bool
            If True, optimize the graph before wrapping it in a single
            new HLG layer shared by all returned Delayed objects.
        """
        keys = self.__dask_keys__()
        graph = self.__dask_graph__()
        layer = self.__dask_layers__()[0]
        if optimize_graph:
            graph = self.__dask_optimize__(graph, keys)
            layer = f"delayed-{self.name}"
            graph = HighLevelGraph.from_collections(layer, graph, dependencies=())
        return [Delayed(k, graph, layer=layer) for k in keys]
class AggHistogram(DaskMethodsMixin):
    """Aggregated Histogram collection.

    The class constructor is typically used internally;
    :py:func:`dask_histogram.factory` is recommended for users (along
    with the `dask_histogram.routines` module).

    See Also
    --------
    dask_histogram.factory

    """

    def __init__(
        self,
        dsk: HighLevelGraph,
        name: str,
        histref: bh.Histogram,
        layer: Any | None = None,
    ) -> None:
        self._dask: HighLevelGraph = dsk
        self._name: str = name
        self._histref: bh.Histogram = histref

        # NOTE: Layer only used by `Item.from_delayed`, to handle
        # Delayed objects created by other collections. e.g.:
        # Item.from_delayed(da.ones(1).to_delayed()[0]) See
        # Delayed.__init__
        self._layer = layer or name
        if isinstance(dsk, HighLevelGraph) and self._layer not in dsk.layers:
            raise ValueError(
                f"Layer {self._layer} not in the HighLevelGraph's layers: {list(dsk.layers)}"
            )

    # -- dask collection protocol -------------------------------------

    def __dask_graph__(self) -> HighLevelGraph:
        return self._dask

    def __dask_keys__(self) -> list[str]:
        # Aggregated result: a single output key.
        return [self.name]

    def __dask_layers__(self) -> tuple[str, ...]:
        graph = self._dask
        if isinstance(graph, HighLevelGraph) and len(graph.layers) == 1:
            return tuple(graph.layers)
        return (self.name,)

    def __dask_tokenize__(self) -> str:
        return self.name

    def __dask_postcompute__(self) -> Any:
        return _finalize_agg_histogram, ()

    def __dask_postpersist__(self) -> Any:
        return self._rebuild, ()

    __dask_optimize__ = globalmethod(
        optimize, key="histogram_optimize", falsey=dont_optimize
    )
    __dask_scheduler__ = staticmethod(tget)

    def _rebuild(
        self,
        dsk: HighLevelGraph,
        *,
        rename: Mapping[str, str] | None = None,
    ) -> Any:
        # Rebuild after persist/optimize; `rename` maps old->new names.
        new_name = self._name
        if rename:
            new_name = rename.get(new_name, new_name)
        return type(self)(dsk, new_name, self.histref)

    # -- basic accessors ----------------------------------------------

    @property
    def name(self) -> str:
        return self._name

    @property
    def dask(self) -> HighLevelGraph:
        return self._dask

    @property
    def histref(self) -> bh.Histogram:
        """Empty reference boost-histogram object."""
        return self._histref

    @property
    def _storage_type(self) -> type[bh.storage.Storage]:
        """Storage type of the histogram."""
        return self.histref._storage_type

    @property
    def ndim(self) -> int:
        """Total number of dimensions."""
        return self.histref.ndim

    @property
    def shape(self) -> tuple[int, ...]:
        """Shape of the histogram as an array."""
        return self.histref.shape

    @property
    def size(self) -> int:
        """Size of the histogram."""
        return self.histref.size

    def __str__(self) -> str:
        return (
            f"dask_histogram.AggHistogram<{key_split(self.name)}, "
            f"ndim={self.ndim}, storage={self._storage_type()}>"
        )

    __repr__ = __str__

    def __reduce__(self):
        # Pickle support: re-invoke the constructor with stored state.
        return (AggHistogram, (self._dask, self._name, self._histref))

    # -- conversions ---------------------------------------------------

    def to_dask_array(
        self, flow: bool = False, dd: bool = False
    ) -> tuple[DaskArray, ...] | tuple[DaskArray, list[DaskArray]]:
        """Convert histogram object to dask.array form.

        Parameters
        ----------
        flow : bool
            Include the flow bins.
        dd : bool
            Use the histogramdd return syntax, where the edges are in a
            tuple. Otherwise, this is the histogram/histogram2d return
            style.

        Returns
        -------
        contents : dask.array.Array
            The bin contents
        *edges : dask.array.Array
            The edges for each dimension

        """
        return to_dask_array(self, flow=flow, dd=dd)

    def to_boost(self) -> bh.Histogram:
        """Convert to a boost_histogram.Histogram via computation.

        This is an alias of `.compute()`.

        """
        return self.compute()

    def to_delayed(self, optimize_graph: bool = True) -> Delayed:
        """Wrap this collection's single output key in a Delayed."""
        out_keys = self.__dask_keys__()
        hlg = self.__dask_graph__()
        out_layer = self.__dask_layers__()[0]
        if optimize_graph:
            hlg = self.__dask_optimize__(hlg, out_keys)
            out_layer = f"delayed-{self.name}"
            hlg = HighLevelGraph.from_collections(out_layer, hlg, dependencies=())
        return Delayed(out_keys[0], hlg, layer=out_layer)

    # -- computed views (all trigger computation) ---------------------

    def values(self, flow: bool = False) -> NDArray[Any]:
        return self.to_boost().values(flow=flow)

    def variances(self, flow: bool = False) -> NDArray[Any] | None:
        return self.to_boost().variances(flow=flow)

    def counts(self, flow: bool = False) -> NDArray[Any]:
        return self.to_boost().counts(flow=flow)

    def __array__(self) -> NDArray[Any]:
        return self.compute().__array__()

    # -- lazy arithmetic ----------------------------------------------
    # The binary operators all funnel through the in-place variants,
    # which build new task-graph nodes (no eager computation).

    def __iadd__(self, other) -> AggHistogram:
        return _iadd(self, other)

    def __add__(self, other: Any) -> AggHistogram:
        return self.__iadd__(other)

    def __radd__(self, other: Any) -> AggHistogram:
        return self.__iadd__(other)

    def __isub__(self, other: Any) -> AggHistogram:
        return _isub(self, other)

    def __sub__(self, other: Any) -> AggHistogram:
        return self.__isub__(other)

    def __itruediv__(self, other: Any) -> AggHistogram:
        return _itruediv(self, other)

    def __truediv__(self, other: Any) -> AggHistogram:
        return self.__itruediv__(other)

    def __idiv__(self, other: Any) -> AggHistogram:
        return self.__itruediv__(other)

    def __div__(self, other: Any) -> AggHistogram:
        return self.__idiv__(other)

    def __imul__(self, other: Any) -> AggHistogram:
        return _imul(self, other)

    def __mul__(self, other: Any) -> AggHistogram:
        return self.__imul__(other)

    def __rmul__(self, other: Any) -> AggHistogram:
        return self.__mul__(other)
class Delayed(DaskMethodsMixin, OperatorMethodMixin):
    """Represents a value to be computed by dask.

    Equivalent to the output from a single key in a dask graph.
    """

    # Slots-only: no per-instance __dict__, which makes the
    # __setattr__ immutability guard below reachable.
    __slots__ = ("_key", "_dask", "_length", "_layer")

    def __init__(self, key, dsk, length=None, layer=None):
        self._key = key        # output key in the task graph
        self._dask = dsk       # task graph (dict or HighLevelGraph)
        self._length = length  # known length for __iter__/__len__, or None

        # NOTE: Layer is used by `to_delayed` in other collections, but not in normal Delayed use
        self._layer = layer or key
        if isinstance(dsk, HighLevelGraph) and self._layer not in dsk.layers:
            raise ValueError(
                f"Layer {self._layer} not in the HighLevelGraph's layers: {list(dsk.layers)}"
            )

    @property
    def key(self):
        return self._key

    @property
    def dask(self):
        return self._dask

    def __dask_graph__(self):
        return self.dask

    def __dask_keys__(self):
        # Single output key.
        return [self.key]

    def __dask_layers__(self):
        return (self._layer,)

    def __dask_tokenize__(self):
        return self.key

    __dask_scheduler__ = staticmethod(threaded.get)
    __dask_optimize__ = globalmethod(optimize, key="delayed_optimize")

    def __dask_postcompute__(self):
        # Collapse the one-result list produced by the scheduler.
        return single_key, ()

    def __dask_postpersist__(self):
        return self._rebuild, ()

    def _rebuild(self, dsk, *, rename=None):
        # Rebuild after persist/optimize; `rename` maps old->new key names.
        key = replace_name_in_key(self.key, rename) if rename else self.key
        if isinstance(dsk, HighLevelGraph) and len(dsk.layers) == 1:
            # FIXME Delayed is currently the only collection type that supports both high- and low-level graphs.
            # The HLG output of `optimize` will have a layer name that doesn't match `key`.
            # Remove this when Delayed is HLG-only (because `optimize` will only be passed HLGs, so it won't have
            # to generate random layer names).
            layer = next(iter(dsk.layers))
        else:
            layer = None
        return Delayed(key, dsk, self._length, layer=layer)

    def __repr__(self):
        return f"Delayed({repr(self.key)})"

    def __hash__(self):
        return hash(self.key)

    def __dir__(self):
        return dir(type(self))

    def __getattr__(self, attr):
        # Unknown non-underscore attributes become lazy attribute
        # accesses (DelayedAttr) rather than errors.
        if attr.startswith("_"):
            raise AttributeError(f"Attribute {attr} not found")

        if attr == "visualise":
            # added to warn users in case of spelling error
            # for more details: https://github.com/dask/dask/issues/5721
            warnings.warn("dask.delayed objects have no `visualise` method. "
                          "Perhaps you meant `visualize`?")

        return DelayedAttr(self, attr)

    def __setattr__(self, attr, val):
        try:
            object.__setattr__(self, attr, val)
        except AttributeError:
            # attr is neither in type(self).__slots__ nor in the __slots__ of any of its
            # parent classes, and all the parent classes define __slots__ too.
            # This last bit needs to be unit tested: if any of the parent classes omit
            # the __slots__ declaration, self will gain a __dict__ and this branch will
            # become unreachable.
            raise TypeError("Delayed objects are immutable")

    def __setitem__(self, index, val):
        raise TypeError("Delayed objects are immutable")

    def __iter__(self):
        # Iteration only works when the length was declared up front;
        # each element is itself a lazy Delayed (via __getitem__).
        if self._length is None:
            raise TypeError(
                "Delayed objects of unspecified length are not iterable")
        for i in range(self._length):
            yield self[i]

    def __len__(self):
        if self._length is None:
            raise TypeError(
                "Delayed objects of unspecified length have no len()")
        return self._length

    def __call__(self, *args, pure=None, dask_key_name=None, **kwargs):
        # Calling a Delayed builds a lazy `apply` node over it.
        func = delayed(apply, pure=pure)
        if dask_key_name is not None:
            return func(self, args, kwargs, dask_key_name=dask_key_name)
        return func(self, args, kwargs)

    def __bool__(self):
        # Truthiness would force computation; refuse instead.
        raise TypeError("Truth of Delayed objects is not supported")

    __nonzero__ = __bool__

    def __get__(self, instance, cls):
        # Descriptor protocol: allow a Delayed to act as a method when
        # stored on a class.
        if instance is None:
            return self
        return types.MethodType(self, instance)

    @classmethod
    def _get_binary_operator(cls, op, inv=False):
        # Operators become lazy delayed calls; `inv` flips operand order
        # for the reflected (__r*__) variants.
        method = delayed(right(op) if inv else op, pure=True)
        return lambda *args, **kwargs: method(*args, **kwargs)

    _get_unary_operator = _get_binary_operator