Example #1
def concat(objs: List):
    """
    Concatenate the results of partitioned dask task executions. This function
    inspects the type of the list elements, then calls the corresponding native
    dask concat function.

    Parameters
    ----------
    objs: List
        List of the partitioned dask task execution results, which will be concatenated.

    Returns
    -------
    obj:
        The concatenated result

    """
    if is_arraylike(objs[0]):
        res = array_concat(objs,
                           axes=[0])  # TODO: Add concat with args support
    elif any((is_dataframe_like(objs[0]), is_series_like(objs[0]),
              is_index_like(objs[0]))):
        res = df_concat(objs)
    else:
        res = objs
    return res.compute() if is_dask_collection(res) else res
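
For reference, a minimal sketch of the dispatch predicates this helper relies on; the import paths below are an assumption matching recent dask releases.

import numpy as np
import pandas as pd
from dask.base import is_dask_collection
from dask.utils import is_arraylike, is_dataframe_like

# Predicates used by concat() to pick a branch (assumed import paths).
print(is_arraylike(np.ones(3)))                     # True  -> array branch
print(is_dataframe_like(pd.DataFrame({"a": [1]})))  # True  -> dataframe branch
print(is_dask_collection(np.ones(3)))               # False -> no .compute() needed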
Example #2
def assert_dask_dtypes(ddf, res, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If `numeric_equal`, integer and floating dtypes compare equal. This is
    useful due to the implicit conversion of integer to floating upon
    encountering missingness, which is hard to infer statically."""

    eq_type_sets = [{"O", "S", "U", "a"}]  # treat object and strings alike
    if numeric_equal:
        eq_type_sets.append({"i", "f", "u"})

    def eq_dtypes(a, b):
        return any(a.kind in eq_types and b.kind in eq_types
                   for eq_types in eq_type_sets) or (a == b)

    if not is_dask_collection(res) and is_dataframe_like(res):
        for col, a, b in pd.concat([ddf._meta.dtypes, res.dtypes],
                                   axis=1).itertuples():
            assert eq_dtypes(a, b)
    elif not is_dask_collection(res) and (is_index_like(res)
                                          or is_series_like(res)):
        a = ddf._meta.dtype
        b = res.dtype
        assert eq_dtypes(a, b)
    else:
        if hasattr(ddf._meta, "dtype"):
            a = ddf._meta.dtype
            if not hasattr(res, "dtype"):
                assert np.isscalar(res)
                b = np.dtype(type(res))
            else:
                b = res.dtype
            assert eq_dtypes(a, b)
        else:
            assert type(ddf._meta) == type(res)
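
A hedged usage sketch: compare a dask DataFrame's metadata against its computed result (standard dask.dataframe calls; the helper above is assumed to be importable).

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": [1, 2, 3], "y": [0.1, 0.2, 0.3]})
ddf = dd.from_pandas(pdf, npartitions=2)
# _meta dtypes (int64, float64) match the concrete result, so no assertion fires.
assert_dask_dtypes(ddf, ddf.compute())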
Example #3
def _maybe_sort(a, check_index: bool):
    # sort by value, then index
    try:
        if is_dataframe_like(a):
            if set(a.index.names) & set(a.columns):
                a.index.names = [
                    "-overlapped-index-name-%d" % i
                    for i in range(len(a.index.names))
                ]
            a = a.sort_values(by=methods.tolist(a.columns))
        else:
            a = a.sort_values()
    except (TypeError, IndexError, ValueError):
        pass
    return a.sort_index() if check_index else a
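
The same idea with plain pandas, for illustration; this assumes `methods.tolist(a.columns)` above behaves like `list(a.columns)`.

import pandas as pd

a = pd.DataFrame({"x": [2, 1, 2], "y": [0, 1, 2]}, index=[3, 1, 2])
a = a.sort_values(by=list(a.columns))  # sort by value first
a = a.sort_index()                     # then by index, as when check_index=True
print(a)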
Example #4
    def _input_to_dask_cupy_array(self, X):
        if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):

            if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
                raise TypeError("Please convert your Dask DataFrame"
                                " to a Dask-cuDF DataFrame using dask_cudf.")
            X = X.values
            X._meta = cp.asarray(X._meta)

        elif is_arraylike(X) and hasattr(X, "dask"):
            if not isinstance(X._meta, cp.ndarray):
                raise TypeError("Please convert your CPU Dask Array"
                                " to a GPU Dask Array using"
                                " arr.map_blocks(cp.asarray).")
        else:
            raise TypeError("Please pass a GPU backed Dask DataFrame"
                            " or Dask Array.")

        X.compute_chunk_sizes()
        return X
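
A hedged sketch of the conversion the error messages point at (requires a CUDA environment; cupy is assumed to be installed):

import cupy as cp
import dask.array as da

cpu_arr = da.ones((1000, 3), chunks=(250, 3))
gpu_arr = cpu_arr.map_blocks(cp.asarray)  # the conversion suggested in the TypeError
# gpu_arr._meta is now a cupy.ndarray, so the check above accepts it.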
Example #5
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    *,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a
          sample and each column is a specific coordinate for the
          data.
        * A sequence of dask collections where each collection (e.g.
          array or series) contains all values for one coordinate of
          all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification.

        The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing
          bin edges along each dimension.
        * A single int describing the total number of bins that will
          be used in each dimension (this requires the `range`
          argument to be defined).
        * A sequence of ints describing the total number of bins to be
          used in each dimension (this requires the `range` argument
          to be defined).

        When bins are described by arrays, the rightmost edge is
        included. Bins described by arrays also allows for non-uniform
        bin widths.
    range : tuple(tuple(float, float), ...), optional
        A sequence of length D, each a (min, max) tuple giving the
        outer bin edges to be used if the edges are not given
        explicitly in `bins`. If defined, this argument is required to
        have an entry for each dimension. Unlike
        :func:`numpy.histogramdd`, if `bins` does not define bin
        edges, this argument is required (this function will not
        automatically use the min and max of the values in a given
        dimension because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy
        API (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighing each sample in the input data. The
        chunks of the weights must be identical to the chunking along
        the 0th (row) axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the
        number of samples in each bin. If ``True``, the returned array
        represents the probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If set to :obj:`dask_histogram.Histogram`, object-based output is enabled.
    storage : boost_histogram.storage.Storage
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram.
        We let Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of
        :func:`dask.array.histogramdd`: An array of bin contents and a
        tuple of edges arrays (one for each dimension). If the
        `histogram` argument is used then the return is a
        :obj:`dask_histogram.Histogram` object.

    See Also
    --------
    histogram
    histogram2d

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in
    each dimension. First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the
    return style of :py:func:`numpy.histogramdd`), we use the
    `histogram` argument to trigger the return of a
    :obj:`dask_histogram.Histogram` object:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h  # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example but with an alternative dataset form
    (a single array with three columns), fixed bin widths, sample
    weights, and usage of the boost-histogram ``Weight()`` storage:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """

    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when returning a "
            "dask-histogram object.")

    # If input is a multidimensional array or dataframe, we wrap it in
    # a tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a)
            and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a, )  # type: ignore
    else:
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only supports dask "
                    "collections as input")

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values.
    axes = []
    for b, r in zip(bins, range):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0],
                                     stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)

    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)
    return hist
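
The docstring examples cover the array input forms; for the dataframe-like branch handled in the wrapping logic above, a hedged, untested sketch using the same `dhb` alias:

import dask.array as da
import dask.dataframe as dd
import dask_histogram.boost as dhb

arr = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
ddf = dd.from_dask_array(arr, columns=["x", "y", "z"])
# The dataframe is wrapped into a 1-tuple internally; one axis per column.
h, edges = dhb.histogramdd(ddf, bins=(5, 5, 5), range=((-3, 3),) * 3)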
Example #6
def _partitioned_histogram(
    *data: DaskCollection,
    histref: bh.Histogram,
    weights: DaskCollection | None = None,
    sample: DaskCollection | None = None,
    split_every: int | None = None,
) -> PartitionedHistogram:
    name = f"hist-on-block-{tokenize(data, histref, weights, sample, split_every)}"
    data_is_df = is_dataframe_like(data[0])
    data_is_dak = is_awkward_like(data[0])
    _weight_sample_check(*data, weights=weights)

    # Single awkward array object.
    if len(data) == 1 and data_is_dak:
        from dask_awkward.core import partitionwise_layer as dak_pwl

        x = data[0]
        if weights is not None and sample is not None:
            raise NotImplementedError()
        elif weights is not None and sample is None:
            raise NotImplementedError()
        elif weights is None and sample is not None:
            raise NotImplementedError()
        else:
            g = dak_pwl(_blocked_dak, name, x, histref=histref)

    # Single object, not a dataframe
    elif len(data) == 1 and not data_is_df:
        x = data[0]
        if weights is not None and sample is not None:
            g = partitionwise(_blocked_sa_w_s,
                              name,
                              x,
                              weights,
                              sample,
                              histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_sa_w, name, x, weights, histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_sa_s, name, x, sample, histref=histref)
        else:
            g = partitionwise(_blocked_sa, name, x, histref=histref)

    # Single object, is a dataframe
    elif len(data) == 1 and data_is_df:
        x = data[0]
        if weights is not None and sample is not None:
            g = partitionwise(_blocked_df_w_s,
                              name,
                              x,
                              weights,
                              sample,
                              histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_df_w, name, x, weights, histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_df_s, name, x, sample, histref=histref)
        else:
            g = partitionwise(_blocked_df, name, x, histref=histref)

    # Multiple objects
    else:

        # Awkward array collection detected as first argument
        if data_is_dak:
            from dask_awkward.core import partitionwise_layer as dak_pwl

            if weights is None and sample is None:
                g = dak_pwl(_blocked_dak_ma, name, *data, histref=histref)
            else:
                raise NotImplementedError()
        # Not an awkward array collection
        elif weights is not None and sample is not None:
            g = partitionwise(_blocked_ma_w_s,
                              name,
                              *data,
                              weights,
                              sample,
                              histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_ma_w,
                              name,
                              *data,
                              weights,
                              histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_ma_s,
                              name,
                              *data,
                              sample,
                              histref=histref)
        else:
            g = partitionwise(_blocked_ma, name, *data, histref=histref)

    dependencies = _dependencies(*data, weights=weights, sample=sample)
    hlg = HighLevelGraph.from_collections(name, g, dependencies=dependencies)
    return PartitionedHistogram(hlg,
                                name,
                                data[0].npartitions,
                                histref=histref)
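
A hedged sketch of driving this internal helper directly (illustrative only; `bh` is boost_histogram and `histref` supplies the axes):

import boost_histogram as bh
import dask.array as da

x = da.random.standard_normal(size=(10000,), chunks=(2500,))
ref = bh.Histogram(bh.axis.Regular(10, -3, 3))
parts = _partitioned_histogram(x, histref=ref)  # one staged fill per partition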
Example #7
def check_meta(x, meta, funcname=None, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to be
        more helpful to users.
    numeric_equal : bool, optional
        If True, integer and floating dtypes compare equal. This is useful due
        to pandas' implicit conversion of integer to floating upon encountering
        missingness, which is hard to infer statically.
    """
    eq_types = {"i", "f", "u"} if numeric_equal else set()

    def equal_dtypes(a, b):
        if is_categorical_dtype(a) != is_categorical_dtype(b):
            return False
        if isinstance(a, str) and a == "-" or isinstance(b, str) and b == "-":
            return False
        if is_categorical_dtype(a) and is_categorical_dtype(b):
            if UNKNOWN_CATEGORIES in a.categories or UNKNOWN_CATEGORIES in b.categories:
                return True
            return a == b
        return (a.kind in eq_types and b.kind in eq_types) or is_dtype_equal(
            a, b)

    if not (is_dataframe_like(meta) or is_series_like(meta)
            or is_index_like(meta)) or is_dask_collection(meta):
        raise TypeError("Expected partition to be DataFrame, Series, or "
                        "Index, got `%s`" % typename(type(meta)))

    # Notice, we use .__class__ as opposed to type() in order to support
    # object proxies see <https://github.com/dask/dask/pull/6981>
    if x.__class__ != meta.__class__:
        errmsg = "Expected partition of type `{}` but got `{}`".format(
            typename(type(meta)),
            typename(type(x)),
        )
    elif is_dataframe_like(meta):
        dtypes = pd.concat([x.dtypes, meta.dtypes], axis=1, sort=True)
        bad_dtypes = [(repr(col), a, b)
                      for col, a, b in dtypes.fillna("-").itertuples()
                      if not equal_dtypes(a, b)]
        if bad_dtypes:
            errmsg = "Partition type: `{}`\n{}".format(
                typename(type(meta)),
                asciitable(["Column", "Found", "Expected"], bad_dtypes),
            )
        else:
            check_matching_columns(meta, x)
            return x
    else:
        if equal_dtypes(x.dtype, meta.dtype):
            return x
        errmsg = "Partition type: `{}`\n{}".format(
            typename(type(meta)),
            asciitable(["", "dtype"], [("Found", x.dtype),
                                       ("Expected", meta.dtype)]),
        )

    raise ValueError("Metadata mismatch found%s.\n\n"
                     "%s" %
                     ((" in `%s`" % funcname if funcname else ""), errmsg))