def test_make_meta_backends(index):
    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}
    )
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(
        ["2018-10-07", "2018-10-08", "2018-10-09"], dtype="datetime64[s]"
    )
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)

    # Check "empty" metadata types
    chk_meta = dask_make_meta(df)
    dd.assert_eq(chk_meta.dtypes, df.dtypes)

    # Check "non-empty" metadata types
    chk_meta_nonempty = meta_nonempty(df)
    dd.assert_eq(chk_meta.dtypes, chk_meta_nonempty.dtypes)

    # Check dask code path if not MultiIndex
    if not isinstance(df.index, cudf.MultiIndex):
        ddf = dgd.from_cudf(df, npartitions=1)

        # Check "empty" metadata types
        dd.assert_eq(ddf._meta.dtypes, df.dtypes)

        # Check "non-empty" metadata types
        dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
def _assign_column(self, k, v):
    def assigner(df, k, v):
        out = df.copy()
        out[k] = v
        return out

    meta = assigner(self._meta, k, dask_make_meta(v))
    return self.map_partitions(assigner, k, v, meta=meta)
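# A minimal, hypothetical sketch of the pattern used by `_assign_column` above:
# compute the output meta eagerly on `_meta`, then apply the same function to
# every partition via `map_partitions`. The `dask_cudf.from_cudf` entry point,
# the data, and the column names here are illustrative assumptions, not taken
# from the source.
import cudf
import dask_cudf


def add_constant_column(df, name, value):
    # Copy the partition and attach the new column.
    out = df.copy()
    out[name] = value
    return out


gdf = cudf.DataFrame({"a": [1, 2, 3, 4]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# Passing an explicit `meta` avoids dask having to guess the output schema.
meta = add_constant_column(ddf._meta, "b", 0)
ddf2 = ddf.map_partitions(add_constant_column, "b", 0, meta=meta)
print(ddf2.compute())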
def __init__(self, dsk, name, meta, divisions):
    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
    self.dask = dsk
    self._name = name
    meta = dask_make_meta(meta)
    if not isinstance(meta, self._partition_type):
        raise TypeError(
            f"Expected meta to specify type "
            f"{self._partition_type.__name__}, got type "
            f"{type(meta).__name__}"
        )
    self._meta = meta
    self.divisions = tuple(divisions)
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to
        ``aggregate``. Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``,
        and ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)}
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = f"{token or funcname(chunk)}-chunk-{token_key}"
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = f"{token or funcname(combine)}-combine-{token_key}"
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = f"{token or funcname(aggregate)}-agg-{token_key}"
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dask_make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
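# A hedged usage sketch of the `reduction` helper above: each partition is
# reduced to a partial result by `chunk`, the partials are grouped in a tree
# of width `split_every`, and `aggregate` collapses the final list into one
# value. The `dask_cudf.from_cudf` entry point and the column/values used are
# assumptions for illustration only, not taken from the source.
import cudf
import dask_cudf


def chunk_sum(s):
    # Partial sum of a single partition.
    return s.sum()


def agg_sum(parts):
    # `parts` arrives as a plain Python list of partial sums.
    return sum(parts)


gdf = cudf.DataFrame({"x": range(100)})
ddf = dask_cudf.from_cudf(gdf, npartitions=10)

total = reduction(
    ddf["x"],
    chunk=chunk_sum,
    aggregate=agg_sum,
    split_every=4,
)
print(total.compute())  # expected: 4950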