Example #1
# Imports assumed for this test excerpt; the exact module paths for
# make_meta/meta_nonempty have moved between dask versions.
import numpy as np

import cudf
import dask.dataframe as dd
import dask_cudf as dgd
from dask.dataframe.utils import make_meta as dask_make_meta, meta_nonempty


def test_make_meta_backends(index):
    # `index` is supplied by pytest parametrization in the original test
    # module; the decorator and its values are not shown in this excerpt.
    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt)
         for dt in dtypes})
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(["2018-10-07", "2018-10-08", "2018-10-09"],
                            dtype="datetime64[s]")
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)

    # Check "empty" metadata types
    chk_meta = dask_make_meta(df)
    dd.assert_eq(chk_meta.dtypes, df.dtypes)

    # Check "non-empty" metadata types
    chk_meta_nonempty = meta_nonempty(df)
    dd.assert_eq(chk_meta.dtypes, chk_meta_nonempty.dtypes)

    # Check dask code path if not MultiIndex
    if not isinstance(df.index, cudf.MultiIndex):

        ddf = dgd.from_cudf(df, npartitions=1)

        # Check "empty" metadata types
        dd.assert_eq(ddf._meta.dtypes, df.dtypes)

        # Check "non-empty" metadata types
        dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
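
For orientation, here is a minimal sketch (assuming cudf and dask_cudf are installed; importing dask_cudf registers the cudf backends with dask's meta dispatch) of what the two meta utilities return for a small cudf DataFrame:

# Hedged sketch: inspect the "empty" and "non-empty" meta objects directly.
import cudf
import dask_cudf  # noqa: F401  (registers cudf with dask's meta dispatch)
from dask.dataframe.utils import make_meta, meta_nonempty

df = cudf.DataFrame({"x": [1, 2, 3], "s": ["a", "b", "c"]})
empty = make_meta(df)       # zero-row frame with the same dtypes
sample = meta_nonempty(df)  # small dummy frame with the same dtypes
print(len(empty), len(sample))            # 0 and a small nonzero length
print((empty.dtypes == sample.dtypes).all())
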
Example #2
File: core.py Project: rongou/cudf
    def _assign_column(self, k, v):
        def assigner(df, k, v):
            # Copy the partition and assign column `k` the value `v`.
            out = df.copy()
            out[k] = v
            return out

        # Apply the same assignment to the empty meta object so the
        # result's column set and dtypes are known without computing.
        meta = assigner(self._meta, k, dask_make_meta(v))
        return self.map_partitions(assigner, k, v, meta=meta)
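
A hypothetical usage sketch, assuming column assignment on a dask_cudf DataFrame routes through _assign_column (the public entry point is not shown in this excerpt):

# Hedged sketch: lazy column assignment on a dask_cudf DataFrame.
import cudf
import dask_cudf

ddf = dask_cudf.from_cudf(cudf.DataFrame({"a": [1, 2, 3]}), npartitions=2)
ddf["b"] = ddf["a"] * 2  # assigned partition-by-partition, meta inferred
print(ddf.compute())
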
Example #3
File: core.py Project: rongou/cudf
    def __init__(self, dsk, name, meta, divisions):
        if not isinstance(dsk, HighLevelGraph):
            dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
        self.dask = dsk
        self._name = name
        # Normalize `meta` to an empty object and validate it against the
        # expected partition type (e.g. a cudf DataFrame/Series/Index).
        meta = dask_make_meta(meta)
        if not isinstance(meta, self._partition_type):
            raise TypeError(f"Expected meta to specify type "
                            f"{self._partition_type.__name__}, got type "
                            f"{type(meta).__name__}")
        self._meta = meta
        self.divisions = tuple(divisions)
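
The constructor accepts any meta-like value and relies on dask_make_meta to normalize it; the isinstance check then rejects metadata whose backend does not match the collection. A hedged sketch of that failure mode, assuming a cudf-backed collection:

# Hypothetical sketch: a pandas meta object fails the partition-type check.
import cudf
import pandas as pd
import dask_cudf

ddf = dask_cudf.from_cudf(cudf.DataFrame({"a": [1]}), npartitions=1)
try:
    type(ddf)(ddf.dask, ddf._name, pd.DataFrame(), ddf.divisions)
except TypeError as err:
    print(err)  # both classes are named "DataFrame", but the backends differ
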
Example #4
File: core.py Project: rongou/cudf
# Imports assumed for this excerpt (module paths vary across dask
# versions); `_Frame`, `_emulate`, and `dask_make_meta` come from the
# surrounding dask_cudf module.
from tlz import partition_all

import dask.dataframe as dd
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.utils import apply, funcname


def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data.
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of ``chunk``.
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of ``chunk``
        in a tree-reduction. If not provided, defaults to ``aggregate``.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)}
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif not isinstance(split_every, int) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = f"{token or funcname(chunk)}-chunk-{token_key}"
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {(a, 0, i): (chunk, key)
               for i, key in enumerate(args[0].__dask_keys__())}
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = f"{token or funcname(combine)}-combine-{token_key}"
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = f"{token or funcname(aggregate)}-agg-{token_key}"
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dask_make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
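
A hedged usage sketch of this helper: a tree-reduced sum over a dask_cudf Series. It assumes a cudf-capable environment and that `reduction` is importable from the surrounding module; every stage keeps its partial result as a one-element Series so that chunk, combine, and aggregate compose cleanly.

# Hedged sketch: a tree-reduced sum with split_every=4.
import cudf
import dask_cudf
from dask_cudf.core import reduction  # assumed import path


def chunk_sum(part):
    # Per-partition partial sum, kept as a one-element Series so that
    # chunk, combine, and aggregate all map list-of-Series -> Series.
    return cudf.Series([part.sum()])


def agg_sum(parts):
    # Concatenate partial sums and reduce them again; also used as
    # `combine`, since combine defaults to aggregate when not given.
    return cudf.Series([cudf.concat(parts).sum()])


s = dask_cudf.from_cudf(cudf.Series(range(100)), npartitions=10)
total = reduction(s, chunk=chunk_sum, aggregate=agg_sum, split_every=4)
print(total.compute())  # one-element Series containing 4950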