Example #1
    def _assign_column(self, k, v):
        def assigner(df, k, v):
            out = df.copy()
            out[k] = v
            return out

        meta = assigner(self._meta, k, make_meta(v))
        return self.map_partitions(assigner, k, v, meta=meta)
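Example #1 is the private helper behind column assignment on a dask_cudf DataFrame: the assigner runs once against the empty `_meta` frame to derive the output schema, then lazily on every partition through `map_partitions`. A minimal usage sketch, assuming the standard `dask_cudf.from_cudf` constructor and that `__setitem__` delegates to `_assign_column` as in this class:

import cudf
import dask_cudf

# Two-partition dask_cudf DataFrame from an in-memory cudf frame.
gdf = cudf.DataFrame({'a': [1, 2, 3, 4]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# Column assignment routes through _assign_column: the column is first
# added to the empty meta frame, then to each partition lazily.
ddf['b'] = 10
print(ddf.compute())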
Example #2
def map_partitions(func, *args, **kwargs):
    """ Apply Python function on each DataFrame partition.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    args, kwargs :
        Arguments and keywords to pass to the function. At least one of the
        args should be a dask_cudf object.
    """
    meta = kwargs.pop('meta', None)
    if meta is not None:
        meta = make_meta(meta)

    if 'token' in kwargs:
        name = kwargs.pop('token')
        token = tokenize(meta, *args, **kwargs)
    else:
        name = funcname(func)
        token = tokenize(func, meta, *args, **kwargs)
    name = '{0}-{1}'.format(name, token)

    args = align_partitions(args)

    if meta is None:
        meta = _emulate(func, *args, **kwargs)
    meta = make_meta(meta)

    if all(isinstance(arg, Scalar) for arg in args):
        dask = {
            (name, 0):
            (apply, func, (tuple, [(x._name, 0) for x in args]), kwargs)
        }
        return Scalar(merge(dask, *[x.dask for x in args]), name, meta)

    dfs = [df for df in args if isinstance(df, _Frame)]
    dsk = {}
    for i in range(dfs[0].npartitions):
        values = [(x._name, i if isinstance(x, _Frame) else 0)
                  if isinstance(x, (_Frame, Scalar)) else x
                  for x in args]
        dsk[(name, i)] = (apply, func, values, kwargs)

    dasks = [arg.dask for arg in args if isinstance(arg, (_Frame, Scalar))]
    return new_dd_object(merge(dsk, *dasks), name, meta, args[0].divisions)
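A hedged usage sketch for the `map_partitions` documented above, assuming the method form seen in Example #1 wraps this function: `func` receives one concrete `cudf.DataFrame` per partition, and `meta` is optional because `_emulate` can infer it by running `func` on empty data.

import cudf
import dask_cudf

gdf = cudf.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

def double(df):
    # Called once per partition with a plain cudf.DataFrame.
    out = df.copy()
    out['x'] = out['x'] * 2
    return out

# meta is omitted here, so it is inferred by _emulate on empty data;
# passing meta= explicitly would skip that trial call.
result = ddf.map_partitions(double)
print(result.compute())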
Example #3
    def __init__(self, dsk, name, meta, divisions):
        self.dask = dsk
        self._name = name
        meta = make_meta(meta)
        if not isinstance(meta, self._partition_type):
            raise TypeError("Expected meta to specify type {0}, got type "
                            "{1}".format(self._partition_type.__name__,
                                         type(meta).__name__))
        self._meta = meta
        self.divisions = tuple(divisions)
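Example #3 is the `_Frame` constructor that `new_dd_object` and `map_partitions` ultimately call: it normalizes `meta`, rejects metadata of the wrong partition type, and stores the graph, key name, and division boundaries. A sketch of building a one-partition frame by hand, assuming the subclass is exported as `dask_cudf.DataFrame`:

import cudf
import dask_cudf

part = cudf.DataFrame({'a': [1, 2, 3]})
meta = part.head(0)              # empty frame carrying names and dtypes
name = 'manual-example'
dsk = {(name, 0): part}          # one graph key per partition

# divisions has npartitions + 1 entries; None marks unknown boundaries.
ddf = dask_cudf.DataFrame(dsk, name, meta, divisions=(None, None))
print(ddf.compute())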
Example #4
def from_delayed(dfs, meta=None, prefix='from_delayed'):
    """ Create dask_cudf DataFrame from many Dask Delayed objects
    Parameters
    ----------
    dfs : list of Delayed
        An iterable of ``dask.delayed.Delayed`` objects, such as come from
        ``dask.delayed`` These comprise the individual partitions of the
        resulting dataframe.
    meta : cudf.DataFrame, cudf.Series, or cudf.Index
        An empty cudf object with names, dtypes, and indices matching the
        expected output.
    prefix : str, optional
        Prefix to prepend to the keys.
    """
    from dask.delayed import Delayed, delayed

    if isinstance(dfs, Delayed):
        dfs = [dfs]

    dfs = [
        delayed(df)
        if not isinstance(df, Delayed) and hasattr(df, 'key') else df
        for df in dfs
    ]

    for df in dfs:
        if not isinstance(df, Delayed):
            raise TypeError("Expected Delayed object, got {}".format(
                type(df).__name__))

    if meta is None:
        meta = dfs[0].compute()
    meta = make_meta(meta)

    name = prefix + '-' + tokenize(*dfs)

    dsk = merge(df.dask for df in dfs)
    dsk.update({(name, i): (check_meta, df.key, meta, 'from_delayed')
                for (i, df) in enumerate(dfs)})

    divs = [None] * (len(dfs) + 1)
    df = new_dd_object(dsk, name, meta, divs)

    return df
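A hedged usage sketch for `from_delayed`: each delayed call produces one partition, and supplying `meta` avoids the eager `dfs[0].compute()` the function otherwise uses for inference.

import cudf
from dask.delayed import delayed
import dask_cudf

@delayed
def make_part(lo, hi):
    # Each delayed call builds one partition.
    return cudf.DataFrame({'a': list(range(lo, hi))})

parts = [make_part(0, 3), make_part(3, 6)]

# Passing meta avoids computing the first partition just to infer the schema.
meta = cudf.DataFrame({'a': [0]}).head(0)
ddf = dask_cudf.from_delayed(parts, meta=meta)
print(ddf.compute())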
Example #5
def reduction(args, chunk=None, aggregate=None, combine=None,
              meta=None, token=None, chunk_kwargs=None,
              aggregate_kwargs=None, combine_kwargs=None,
              split_every=None, **kwargs):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    meta : cudf.DataFrame, cudf.Series, or cudf.Index, optional
        An empty cudf object with names, dtypes, and indices matching the
        expected output.
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(arg.npartitions for arg in args
                      if isinstance(arg, _Frame))
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(token or (chunk, aggregate), meta, args,
                         chunk_kwargs, aggregate_kwargs, combine_kwargs,
                         split_every)

    # Chunk
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {(a, 0, i): (chunk, key)
               for i, key in enumerate(args[0].__dask_keys__())}
    else:
        dsk = {(a, 0, i): (apply, chunk,
                           [(x._name, i) if isinstance(x, _Frame)
                            else x for x in args], chunk_kwargs)
               for i in range(args[0].npartitions)}

    # Combine
    b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs else (combine, conc))
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]],
                        aggregate_kwargs)
    meta = make_meta(meta)

    for arg in args:
        if isinstance(arg, _Frame):
            dsk.update(arg.dask)

    return new_dd_object(dsk, b, meta, (None, None))
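A hedged sketch of calling `reduction` directly, assuming it is importable from the module containing this source (`dask_cudf.core` here, which is an assumption about the layout): `chunk` produces one partial result per partition, and with `split_every=4` the eight partials are merged in a tree before `aggregate` finishes the result.

import cudf
import dask_cudf
from dask_cudf.core import reduction  # assumed location of the function above

gdf = cudf.DataFrame({'x': list(range(16))})
ddf = dask_cudf.from_cudf(gdf, npartitions=8)

def chunk(df):
    # Per-partition partial result: the column sum as a scalar.
    return df['x'].sum()

def aggregate(parts):
    # parts arrives as a plain list of chunk/combine outputs.
    return sum(parts)

# split_every=4 groups the 8 partition sums into 2 combine groups
# (combine defaults to aggregate), then aggregate reduces those 2
# intermediates to the final total.
total = reduction(ddf, chunk=chunk, aggregate=aggregate, split_every=4)
print(total.compute())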