Code Example #1
import pytest

from dask.utils import is_arraylike


def test_is_arraylike():
    np = pytest.importorskip("numpy")

    assert is_arraylike(0) is False
    assert is_arraylike(()) is False
    assert is_arraylike(0) is False
    assert is_arraylike([]) is False
    assert is_arraylike([0]) is False

    assert is_arraylike(np.empty(())) is True
    assert is_arraylike(np.empty((0, ))) is True
    assert is_arraylike(np.empty((0, 0))) is True
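As the assertions above suggest, is_arraylike is a duck-typing check: roughly, the object has to expose a tuple-valued shape and a dtype, so Dask arrays pass just like NumPy arrays while plain Python scalars and containers do not. A minimal sketch of that behaviour, using only the public dask.utils.is_arraylike:

import numpy as np
import dask.array as da
from dask.utils import is_arraylike

# NumPy and Dask arrays both look "array-like" to the check.
print(is_arraylike(np.arange(4)))             # True
print(is_arraylike(da.ones((4,), chunks=2)))  # True

# Plain Python scalars and containers do not.
print(is_arraylike(0))                        # False
print(is_arraylike([0, 1, 2]))                # False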
Code Example #2
def _blockwise_comparison_dnf(op, indices: list, dsk: RegenerableGraph):
    # Return DNF expression pattern for a simple comparison
    left = _get_blockwise_input(0, indices, dsk)
    right = _get_blockwise_input(1, indices, dsk)

    def _inv(symbol: str):
        return {
            ">": "<",
            "<": ">",
            ">=": "<=",
            "<=": ">=",
        }.get(symbol, symbol)

    if is_arraylike(left) and hasattr(left, "item") and left.size == 1:
        left = left.item()
        # Need inverse comparison in read_parquet
        return (right, _inv(_comparison_symbols[op]), left)
    if is_arraylike(right) and hasattr(right, "item") and right.size == 1:
        right = right.item()
    return to_dnf((left, _comparison_symbols[op], right))
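The inversion table above exists because Parquet predicate pushdown expects filters in (column, op, value) order: when the size-1 array scalar ends up on the left-hand side of the comparison, the operands are swapped and the operator flipped. A standalone sketch of that rewrite (normalize_filter and _INV are hypothetical stand-ins, not the private dask helpers used above):

# Operator flip table, analogous to _inv above.
_INV = {">": "<", "<": ">", ">=": "<=", "<=": ">="}

def normalize_filter(lhs, op, rhs):
    """Return a (column, op, value) filter, flipping when the value is on the left."""
    if not isinstance(lhs, str):  # value op column  ->  column flipped-op value
        return (rhs, _INV.get(op, op), lhs)
    return (lhs, op, rhs)

print(normalize_filter(5, "<", "x"))   # ('x', '>', 5), i.e. 5 < x is rewritten as x > 5
print(normalize_filter("x", ">=", 3))  # ('x', '>=', 3) is already in the right order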
Code Example #3
File: ndarray.py  Project: zillow/intake
def _data_to_source(arr,
                    path,
                    component=None,
                    storage_options=None,
                    **kwargs):
    from dask.utils import is_arraylike
    from dask.array import to_zarr, from_array
    from ..source.zarr import ZarrArraySource
    if not is_arraylike(arr):
        raise NotImplementedError
    if not hasattr(arr, 'npartitions'):
        arr = from_array(arr, chunks='auto')
    to_zarr(arr,
            path,
            component=component,  # write to the same component the source reads
            storage_options=storage_options,
            **kwargs)
    source = ZarrArraySource(path, storage_options, component)
    return source
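The same write-then-wrap round trip can be sketched with public Dask APIs alone; the path below is hypothetical and the intake-specific ZarrArraySource step is left out:

import numpy as np
import dask.array as da

arr = np.random.random((1000, 10))
darr = da.from_array(arr, chunks="auto")      # wrap a plain array in a dask array
da.to_zarr(darr, "/tmp/example.zarr")         # persist to zarr storage
restored = da.from_zarr("/tmp/example.zarr")  # lazily read it back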
Code Example #4
File: logistic_regression.py  Project: rapidsai/cuml
    def _input_to_dask_cupy_array(self, X):
        if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):

            if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
                raise TypeError("Please convert your Dask DataFrame"
                                " to a Dask-cuDF DataFrame using dask_cudf.")
            X = X.values
            X._meta = cp.asarray(X._meta)

        elif is_arraylike(X) and hasattr(X, "dask"):
            if not isinstance(X._meta, cp.ndarray):
                raise TypeError("Please convert your CPU Dask Array"
                                " to a GPU Dask Array using"
                                " arr.map_blocks(cp.asarray).")
        else:
            raise TypeError("Please pass a GPU backed Dask DataFrame"
                            " or Dask Array.")

        X.compute_chunk_sizes()
        return X
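The error messages above already name the expected conversion for arrays; a minimal sketch, assuming a CUDA-capable environment with cupy installed:

import cupy as cp
import dask.array as da

# CPU-backed Dask Array -> GPU-backed Dask Array, as the error text suggests.
cpu_arr = da.random.random((1000, 4), chunks=(250, 4))
gpu_arr = cpu_arr.map_blocks(cp.asarray)
print(type(gpu_arr._meta))  # <class 'cupy.ndarray'>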
Code Example #5
File: backends.py  Project: jakirkham/dask
def make_meta_object(x, index=None):
    """Create an empty pandas object containing the desired metadata.

    Parameters
    ----------
    x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar
        To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or
        an iterable of `(name, dtype)` tuples. To create a `Series`, provide a
        tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index
        should match the desired output. If a dtype or scalar, a scalar of the
        same dtype is returned.
    index :  pd.Index, optional
        Any pandas index to use in the metadata. If none provided, a
        `RangeIndex` will be used.

    Examples
    --------

    >>> make_meta_object([('a', 'i8'), ('b', 'O')])
    Empty DataFrame
    Columns: [a, b]
    Index: []
    >>> make_meta_object(('a', 'f8'))
    Series([], Name: a, dtype: float64)
    >>> make_meta_object('i8')
    1
    """

    if is_arraylike(x) and x.shape:
        return x[:0]

    if index is not None:
        index = make_meta_dispatch(index)

    if isinstance(x, dict):
        return pd.DataFrame(
            {c: _empty_series(c, d, index=index)
             for (c, d) in x.items()},
            index=index)
    if isinstance(x, tuple) and len(x) == 2:
        return _empty_series(x[0], x[1], index=index)
    elif isinstance(x, Iterable) and not isinstance(x, str):
        if not all(isinstance(i, tuple) and len(i) == 2 for i in x):
            raise ValueError(
                f"Expected iterable of tuples of (name, dtype), got {x}")
        return pd.DataFrame(
            {c: _empty_series(c, d, index=index)
             for (c, d) in x},
            columns=[c for c, d in x],
            index=index,
        )
    elif not hasattr(x, "dtype") and x is not None:
        # could be a string, a dtype object, or a python type. Skip `None`,
        # because it is implicitly converted to `dtype('f8')`, which we don't
        # want here.
        try:
            dtype = np.dtype(x)
            return _scalar_from_dtype(dtype)
        except Exception:
            # Continue on to next check
            pass

    if is_scalar(x):
        return _nonempty_scalar(x)

    raise TypeError(f"Don't know how to create metadata from {x}")
Code Example #6
File: boost.py  Project: dask-contrib/dask-histogram
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    *,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a
          sample and each column is a specific coordinate for the
          data.
        * A sequence of dask collections where each collection (e.g.
          array or series) contains all values for one coordinate of
          all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification.

        The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing
          bin edges along each dimension.
        * A single int describing the total number of bins that will
          be used in each dimension (this requires the `range`
          argument to be defined).
        * A sequence of ints describing the total number of bins to be
          used in each dimension (this requires the `range` argument
          to be defined).

        When bins are described by arrays, the rightmost edge is
        included. Bins described by arrays also allows for non-uniform
        bin widths.
    range : tuple(tuple(float, float), ...), optional
        A sequence of length D, each a (min, max) tuple giving the
        outer bin edges to be used if the edges are not given
        explicitly in `bins`. If defined, this argument is required to
        have an entry for each dimension. Unlike
        :func:`numpy.histogramdd`, if `bins` does not define bin
        edges, this argument is required (this function will not
        automatically use the min and max of the values in a given
        dimension because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy
        API (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighing each sample in the input data. The
        chunks of the weights must be identical to the chunking along
        the 0th (row) axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the
        number of samples in each bin. If ``True``, the returned array
        represents the probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If `dh.Histogram`, object based output is enabled.
    storage : boost_histogram.storage.Storage
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram.
        We let Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of
        :func:`dask.array.histogramdd`: An array of bin contents and a
        tuple of edges arrays (one for each dimension). If the
        `histogram` argument is used then the return is a
        :obj:`dask_histogram.Histogram` object.

    See Also
    --------
    histogram
    histogram2d

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in
    each dimension. First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the
    return style of :py:func:`numpy.histogramdd`), we use the
    `histogram` argument to trigger the return of a
    :obj:`dask_histogram.Histogram` object:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...    [-3, -2, 0, 1, 3],
    ...    [-3, -1, 1, 2, 3],
    ...    [-3, -2, 0, 2, 3],
    ... ]
    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h  # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example but with an alternative dataset form
    (a single array with three columns), fixed bin widths, sample
    weights, and usage of the boost-histogram ``Weight()`` storage:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """

    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when returning a "
            "dask-histogram object.")

    # If input is a multidimensional array or dataframe, we wrap it in
    # a tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a)
            and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a, )  # type: ignore
    else:
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only supports dask "
                    "collections as input")

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values.
    axes = []
    for _, (b, r) in enumerate(zip(bins, range)):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0],
                                     stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)

    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)
    return hist
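The input-handling branch near the top determines the dimensionality: a single (N, D) array-like (or dataframe) is treated as D coordinates and wrapped in a one-element tuple, while a tuple of one-dimensional collections contributes one coordinate per element. A small sketch of the two equivalent call shapes, reusing the dask_histogram.boost import from the docstring examples:

import dask.array as da
import dask_histogram.boost as dhb

x = da.random.standard_normal(size=(10_000,), chunks=(2_000,))
y = da.random.standard_normal(size=(10_000,), chunks=(2_000,))
a = da.stack([x, y], axis=1)          # single array of shape (10_000, 2)

bins, ranges = (5, 5), ((-3, 3), (-3, 3))

# One (N, D) array: ndim is taken from a.shape[1].
h1, edges1 = dhb.histogramdd(a, bins=bins, range=ranges)

# Equivalent: a tuple of 1D collections, one per coordinate.
h2, edges2 = dhb.histogramdd((x, y), bins=bins, range=ranges)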
Code Example #7
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will
        be passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    meta : pd.DataFrame, pd.Series, dict, iterable, tuple, optional
        An empty pandas object with dtypes and column names matching the
        expected output, used to describe the collection's metadata. If not
        provided, Dask will try to infer it by evaluating ``func`` on the
        first input, which can be expensive or inaccurate, so passing
        ``meta`` explicitly is recommended.
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values.  Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    **kwargs:
        Key-word arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                        name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function
    to any number of iterable objects, it can be a very convenient
    means of implementing functionality that may be missing
    from other DataFrame-creation methods. For example, if you
    happen to have a priori knowledge about the number of rows
    in each of the files in a dataset, you can generate a
    DataFrame collection with a global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset+len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True")
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(func, meta, inputs, args, divisions,
                              enforce_metadata, **kwargs)
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(
        func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns.")
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
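One detail in the metadata handling above: an explicit meta is accepted when it is either a pandas-like object or a non-scalar array-like, and the meta.shape test is what rejects 0-d arrays; anything else either raises (when meta was passed in) or is rebuilt via concatenation (when meta was emulated). A small illustration of that shape test, using only NumPy:

import numpy as np

# Non-scalar array-like: non-empty shape tuple, so the check above passes.
print(bool(np.empty((0, 3)).shape))  # True
# 0-d array: shape is (), which is falsy, so it fails the array-like branch.
print(bool(np.empty(()).shape))      # False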