def _assign_column(self, k, v):
    def assigner(df, k, v):
        out = df.copy()
        out[k] = v
        return out

    meta = assigner(self._meta, k, make_meta(v))
    return self.map_partitions(assigner, k, v, meta=meta)
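
# Usage sketch (hedged): _assign_column is a private helper, presumably
# reached through public column assignment such as ``ddf['y'] = ...`` (an
# assumption; the dispatching code is not shown here). ``cudf``,
# ``dask_cudf``, and ``from_cudf`` below are illustrative imports, not part
# of this module:
#
#   import cudf
#   import dask_cudf
#
#   ddf = dask_cudf.from_cudf(cudf.DataFrame({'x': [1, 2, 3, 4]}),
#                             npartitions=2)
#   ddf2 = ddf._assign_column('y', 10)  # copies each partition, sets out['y'] = 10
#   print(ddf2.compute())
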
def map_partitions(func, *args, **kwargs):
    """ Apply Python function on each DataFrame partition.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    args, kwargs :
        Arguments and keywords to pass to the function. At least one of the
        args should be a dask_cudf object.
    """
    meta = kwargs.pop('meta', None)
    if meta is not None:
        meta = make_meta(meta)

    if 'token' in kwargs:
        name = kwargs.pop('token')
        token = tokenize(meta, *args, **kwargs)
    else:
        name = funcname(func)
        token = tokenize(func, meta, *args, **kwargs)
    name = '{0}-{1}'.format(name, token)

    args = align_partitions(args)

    if meta is None:
        # Infer output metadata by running func on the empty meta objects.
        meta = _emulate(func, *args, **kwargs)
    meta = make_meta(meta)

    if all(isinstance(arg, Scalar) for arg in args):
        # No frames involved: the result is a single-task Scalar.
        dask = {(name, 0):
                (apply, func, (tuple, [(x._name, 0) for x in args]), kwargs)}
        return Scalar(merge(dask, *[x.dask for x in args]), name, meta)

    dfs = [df for df in args if isinstance(df, _Frame)]
    dsk = {}
    for i in range(dfs[0].npartitions):
        # Frames contribute their i-th partition, Scalars their single task;
        # any other argument is passed through unchanged.
        values = [(x._name, i if isinstance(x, _Frame) else 0)
                  if isinstance(x, (_Frame, Scalar))
                  else x
                  for x in args]
        dsk[(name, i)] = (apply, func, values, kwargs)

    dasks = [arg.dask for arg in args if isinstance(arg, (_Frame, Scalar))]
    return new_dd_object(merge(dsk, *dasks), name, meta, args[0].divisions)
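
# Example (minimal sketch; ``cudf`` and ``dask_cudf.from_cudf`` are assumed
# to be available, they are not defined in this module). Extra positional
# arguments are forwarded to the function alongside each partition:
#
#   import cudf
#   import dask_cudf
#
#   ddf = dask_cudf.from_cudf(cudf.DataFrame({'x': [1, 2, 3, 4]}),
#                             npartitions=2)
#   shifted = map_partitions(lambda df, n: df + n, ddf, 1)
#   print(shifted.compute())  # each partition computed as partition + 1
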
def __init__(self, dsk, name, meta, divisions):
    self.dask = dsk
    self._name = name
    meta = make_meta(meta)
    if not isinstance(meta, self._partition_type):
        raise TypeError("Expected meta to specify type {0}, got type "
                        "{1}".format(self._partition_type.__name__,
                                     type(meta).__name__))
    self._meta = meta
    self.divisions = tuple(divisions)
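
# Note (sketch, assuming this __init__ belongs to the _Frame base class and
# that a DataFrame subclass sets ``_partition_type = cudf.DataFrame``): the
# type check rejects mismatched metadata at construction time. With ``dsk``
# and ``name`` standing in for a real graph and key prefix:
#
#   DataFrame(dsk, name, meta=cudf.Series([1]), divisions=(None, None))
#   # TypeError: Expected meta to specify type DataFrame, got type Series
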
def from_delayed(dfs, meta=None, prefix='from_delayed'):
    """ Create dask_cudf DataFrame from many Dask Delayed objects

    Parameters
    ----------
    dfs : list of Delayed
        An iterable of ``dask.delayed.Delayed`` objects, such as come from
        ``dask.delayed``. These comprise the individual partitions of the
        resulting dataframe.
    meta : cudf.DataFrame, cudf.Series, or cudf.Index
        An empty cudf object with names, dtypes, and indices matching the
        expected output.
    prefix : str, optional
        Prefix to prepend to the keys.
    """
    from dask.delayed import Delayed, delayed

    if isinstance(dfs, Delayed):
        dfs = [dfs]
    dfs = [
        delayed(df)
        if not isinstance(df, Delayed) and hasattr(df, 'key')
        else df
        for df in dfs
    ]
    for df in dfs:
        if not isinstance(df, Delayed):
            raise TypeError("Expected Delayed object, got {}".format(
                type(df).__name__))

    if meta is None:
        meta = dfs[0].compute()
    meta = make_meta(meta)

    name = prefix + '-' + tokenize(*dfs)
    dsk = merge(df.dask for df in dfs)
    dsk.update({(name, i): (check_meta, df.key, meta, 'from_delayed')
                for (i, df) in enumerate(dfs)})

    divs = [None] * (len(dfs) + 1)
    df = new_dd_object(dsk, name, meta, divs)
    return df
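
# Example (sketch; assumes ``cudf`` is importable and each delayed call
# produces one cudf.DataFrame partition):
#
#   import cudf
#   from dask import delayed
#
#   @delayed
#   def make_part(start):
#       return cudf.DataFrame({'x': [start, start + 1, start + 2]})
#
#   parts = [make_part(0), make_part(3)]
#   meta = cudf.DataFrame({'x': [0]}).head(0)  # empty frame, matching schema
#   ddf = from_delayed(parts, meta=meta)
#   print(ddf.compute())
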
def reduction(args, chunk=None, aggregate=None, combine=None, meta=None,
              token=None, chunk_kwargs=None, aggregate_kwargs=None,
              combine_kwargs=None, split_every=None, **kwargs):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data.
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk.
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to
        ``aggregate``. Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``,
        and ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(arg.npartitions for arg in args
                      if isinstance(arg, _Frame))
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(token or (chunk, aggregate), meta, args,
                         chunk_kwargs, aggregate_kwargs, combine_kwargs,
                         split_every)

    # Chunk: apply `chunk` to every input partition.
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {(a, 0, i): (chunk, key)
               for i, key in enumerate(args[0].__dask_keys__())}
    else:
        dsk = {(a, 0, i): (apply, chunk,
                           [(x._name, i) if isinstance(x, _Frame) else x
                            for x in args],
                           chunk_kwargs)
               for i in range(args[0].npartitions)}

    # Combine: fold intermediate results `split_every` at a time, adding one
    # tree level per pass, until few enough remain for a single aggregate.
    b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs else (combine, conc))
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate: reduce the remaining results to a single output task.
    b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = make_meta(meta)

    for arg in args:
        if isinstance(arg, _Frame):
            dsk.update(arg.dask)

    return new_dd_object(dsk, b, meta, (None, None))
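
# Example (sketch): a sum expressed as a tree reduction. ``chunk`` runs on
# each partition, intermediate lists of chunk results are folded
# ``split_every`` at a time (``combine`` defaults to ``aggregate`` here),
# and ``aggregate`` produces the final value. ``ddf`` and the scalar meta
# are illustrative assumptions, not defined in this module:
#
#   total = reduction(ddf.x,
#                     chunk=lambda s: s.sum(),           # one value per partition
#                     aggregate=lambda vals: sum(vals),  # fold the list of values
#                     meta=ddf.x._meta.sum(),
#                     split_every=4)
#   print(total.compute())
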