Example #1
def task_label(task):
    """Label for a task on a dot graph.

    Examples
    --------
    >>> from operator import add
    >>> task_label((add, 1, 2))
    'add'
    >>> task_label((add, (add, 1, 2), 3))
    'add(...)'
    """
    func = task[0]
    if func is apply:
        func = task[1]
    if hasattr(func, "funcs"):
        if len(func.funcs) > 1:
            return f"{funcname(func.funcs[0])}(...)"
        else:
            head = funcname(func.funcs[0])
    else:
        head = funcname(func)
    if any(has_sub_tasks(i) for i in task[1:]):
        return f"{head}(...)"
    else:
        return head
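These examples all revolve around dask.utils.funcname, which turns a callable into a short, human-readable label for graph keys and visualizations. A minimal sketch of its behaviour (assuming dask is installed), consistent with the tests further down:

from functools import partial
from dask.utils import funcname

def add(a, b):
    return a + b

print(funcname(add))              # 'add'
print(funcname(partial(add, 1)))  # partials unwrap to the wrapped name: 'add'
print(funcname(lambda x: x))      # lambdas collapse to 'lambda'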
Example #2
def test_funcname_toolz():
    @curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(foo(1)) == "foo"
Example #3
def test_funcname_toolz():
    toolz = pytest.importorskip('toolz')

    @toolz.curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == 'foo'
    assert funcname(foo(1)) == 'foo'
Example #4
def test_funcname_toolz():
    toolz = pytest.importorskip("toolz")

    @toolz.curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(foo(1)) == "foo"
Example #5
def test_funcname_multipledispatch():
    md = pytest.importorskip('multipledispatch')

    @md.dispatch(int, int, int)
    def foo(a, b, c):
        pass

    assert funcname(foo) == 'foo'
    assert funcname(functools.partial(foo, a=1)) == 'foo'
Example #6
def test_funcname_multipledispatch():
    md = pytest.importorskip("multipledispatch")

    @md.dispatch(int, int, int)
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(functools.partial(foo, a=1)) == "foo"
Example #7
def test_funcname_numpy_vectorize():
    np = pytest.importorskip("numpy")

    vfunc = np.vectorize(int)
    assert funcname(vfunc) == "vectorize_int"

    # Regression test for https://github.com/pydata/xarray/issues/3303
    # Partial functions don't have a __name__ attribute
    func = functools.partial(np.add, out=None)
    vfunc = np.vectorize(func)
    assert funcname(vfunc) == "vectorize_add"
Example #8
def _funcname(x):
    try:
        if isinstance(x, BatchedCalls):
            x = x.items[0][0]
    except Exception:
        pass
    return funcname(x)
Example #9
def test_dont_concatenate_single_chunks(shape, chunks):
    x = da.ones(shape, chunks=shape)
    y = x.rechunk(chunks)
    dsk = dict(y.dask)
    assert not any(funcname(task[0]).startswith('concat')
                   for task in dsk.values()
                   if dask.istask(task))
Example #10
def _get_worker_plugin_name(plugin) -> str:
    """Returns the worker plugin name. If plugin has no name attribute
    a random name is used."""
    if hasattr(plugin, "name"):
        return plugin.name
    else:
        return funcname(type(plugin)) + "-" + str(uuid.uuid4())
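A hypothetical illustration of the fallback branch above (MyPlugin is an assumed stand-in, not part of the original code): a plugin without a name attribute is labelled with its class name plus a random UUID.

import uuid
from dask.utils import funcname

class MyPlugin:  # hypothetical plugin that defines no `name` attribute
    pass

fallback = funcname(type(MyPlugin())) + "-" + str(uuid.uuid4())
print(fallback)  # e.g. 'MyPlugin-9d0c...'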
Example #11
def __init__(self, func, nin, nout):
    self._ufunc = np.frompyfunc(func, nin, nout)
    self._func = func
    self.nin = nin
    self.nout = nout
    self._name = funcname(func)
    self.__name__ = "frompyfunc-%s" % self._name
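A small sketch of the naming scheme used above (assuming numpy and dask are installed): the wrapper's __name__ is "frompyfunc-" plus the funcname of the wrapped Python function. clip10 is an assumed example function.

import numpy as np
from dask.utils import funcname

def clip10(x):
    return min(x, 10)

ufunc = np.frompyfunc(clip10, 1, 1)        # elementwise object ufunc
print(ufunc(3), ufunc(42))                 # 3 10
print("frompyfunc-%s" % funcname(clip10))  # 'frompyfunc-clip10'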
Example #12
def _parse_wrap_args(func, args, kwargs, shape):
    if isinstance(shape, np.ndarray):
        shape = shape.tolist()

    if not isinstance(shape, (tuple, list)):
        shape = (shape, )

    name = kwargs.pop("name", None)
    chunks = kwargs.pop("chunks", "auto")

    dtype = kwargs.pop("dtype", None)
    if dtype is None:
        dtype = func(shape, *args, **kwargs).dtype
    dtype = np.dtype(dtype)

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    name = name or funcname(func) + "-" + tokenize(func, shape, chunks, dtype,
                                                   args, kwargs)

    return {
        "shape": shape,
        "dtype": dtype,
        "kwargs": kwargs,
        "chunks": chunks,
        "name": name,
    }
Example #13
    def __init__(
        self,
        func,
        output_indices,
        indices,
        numblocks,
        feed_index=0,
        axis=None,
    ):
        self.func = func
        self.output_indices = tuple(output_indices)
        self.indices = tuple((name, tuple(ind) if ind is not None else ind)
                             for name, ind in indices)
        self.numblocks = numblocks

        if axis is None:
            raise ValueError("axis not set")

        if axis in self.output_indices:
            raise ValueError("axis in output_indices")

        self.feed_index = feed_index
        self.axis = axis

        token = tokenize(self.func,
                         self.output_indices,
                         self.indices,
                         self.numblocks,
                         self.feed_index,
                         self.axis)

        self.func_name = funcname(self.func)
        self.name = "-".join((self.func_name, token))
Example #14
def _funcname(x):
    try:
        if isinstance(x, list):
            x = x[0][0]
    except Exception:
        pass
    return funcname(x)
Example #15
def elemwise(op, *args, **kwargs):
    # See also da.core.elemwise. Note: dask seems to be able to convert Python
    # and numpy objects in this function, thus supporting operations between
    # dask objects and others. This would be useful for us as well.
    # Do not support mismatching chunking for now.
    n_chunk = None
    slice_dim = None
    for arg in args:
        if isinstance(arg, DatasetCollection):
            if n_chunk is not None:
                assert n_chunk == arg.n_chunk
                assert slice_dim == arg.slice_dim
            else:
                n_chunk = arg.n_chunk
                slice_dim = arg.slice_dim
    out = '{}-{}'.format(funcname(op), tokenize(op, *args))
    out_ind = (0, )
    # Handling only 1D chunking here, so everything is (0,).
    arginds = list(
        (a, (0, ) if isinstance(a, DatasetCollection) else None) for a in args)
    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(
        concat([(a if ind is None else a.name, ind) for a, ind in arginds]))
    dsk = top(op, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs)
    dsks = [a.dask for a, ind in arginds if ind is not None]
    return DatasetCollection(sharedict.merge((out, dsk), *dsks), out, n_chunk,
                             slice_dim)
Example #16
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop("dask_key_name", None)
    pure = kwargs.pop("pure", pure)

    if dask_key_name is None:
        name = "{}-{}".format(
            funcname(func),
            tokenize(func_token, *args, pure=pure, **kwargs),
        )
    else:
        name = dask_key_name

    args2, collections = unzip(map(unpack_collections, args), 2)
    collections = list(concat(collections))

    if kwargs:
        dask_kwargs, collections2 = unpack_collections(kwargs)
        collections.extend(collections2)
        task = (apply, func, list(args2), dask_kwargs)
    else:
        task = (func, ) + args2

    graph = HighLevelGraph.from_collections(name, {name: task},
                                            dependencies=collections)
    nout = nout if nout is not None else None
    return Delayed(name, graph, length=nout)
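The same "<funcname>-<token>" key naming is visible through the public dask.delayed API. A short usage sketch (assuming dask is installed; increment is an assumed example function):

import dask

def increment(x):
    return x + 1

d = dask.delayed(increment)(1)
print(d.key)        # e.g. 'increment-<hash>': the funcname prefix plus a tokenize hash
print(d.compute())  # 2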
Example #17
def test_dont_concatenate_single_chunks(shape, chunks):
    x = da.ones(shape, chunks=shape)
    y = x.rechunk(chunks)
    dsk = dict(y.dask)
    assert not any(
        funcname(task[0]).startswith("concat")
        for task in dsk.values() if dask.istask(task))
Example #18
def _emulate(func, *args, **kwargs):
    """
    Apply a function using args / kwargs. If arguments contain dd.DataFrame /
    dd.Series, use the internal cache (``_meta``) for the calculation.
    """
    with raise_on_meta_error(funcname(func)):
        return func(*_extract_meta(args), **_extract_meta(kwargs))
Example #19
def test_funcname_long():
    def a_long_function_name_11111111111111111111111111111111111111111111111():
        pass

    result = funcname(
        a_long_function_name_11111111111111111111111111111111111111111111111)
    assert "a_long_function_name" in result
    assert len(result) < 60
Example #20
    def map_partitions(self, func, *args, **kwargs):
        dtype = kwargs.pop('dtype', None)
        shape = kwargs.pop('shape', None)

        token = tokenize(self, func, args, kwargs, dtype, shape)
        new = '{0}-{1}'.format(funcname(func), token)
        old = self.name
        args = list(args)

        dsk = dict(((new, i), (apply, func, [(old, i)] + args, kwargs))
                   for i in range(self.npartitions))
        dsk.update(self.dask)
        return Matrix(dsk, new, self.npartitions, dtype, shape)
Example #21
def map_partitions(self, func, *args, **kwargs):
    example = func(self._example, *args, **kwargs)
    name = funcname(func) + '-' + tokenize(self, func, *args, **kwargs)
    if not args and not kwargs:
        dsk = {(name, i): (func, key) for i, key in enumerate(self.__dask_keys__())}
    else:
        dsk = {(name, i): (apply, func, list((key,) + args), kwargs)
               for i, key in enumerate(self.__dask_keys__())}
    if isinstance(example, gpd.base.GeoPandasBase):
        regions = self._regions
    else:
        regions = [None] * (self.npartitions + 1)
    return typeof(example)(merge(dsk, self.dask), name,
                           example, regions)
Example #22
def map_partitions(func, *args, **kwargs):
    """ Apply Python function on each DataFrame partition.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    args, kwargs :
        Arguments and keywords to pass to the function. At least one of the
        args should be a dask_gdf object.
    """
    meta = kwargs.pop('meta', None)
    if meta is not None:
        meta = make_meta(meta)

    if 'token' in kwargs:
        name = kwargs.pop('token')
        token = tokenize(meta, *args, **kwargs)
    else:
        name = funcname(func)
        token = tokenize(func, meta, *args, **kwargs)
    name = '{0}-{1}'.format(name, token)

    args = align_partitions(args)

    if meta is None:
        meta = _emulate(func, *args, **kwargs)
    meta = make_meta(meta)

    if all(isinstance(arg, Scalar) for arg in args):
        dask = {
            (name, 0):
            (apply, func, (tuple, [(x._name, 0) for x in args]), kwargs)
        }
        return Scalar(merge(dask, *[x.dask for x in args]), name, meta)

    dfs = [df for df in args if isinstance(df, _Frame)]
    dsk = {}
    for i in range(dfs[0].npartitions):
        values = [(x._name, i if isinstance(x, _Frame) else 0) if isinstance(
            x, (_Frame, Scalar)) else x for x in args]
        dsk[(name, i)] = (apply, func, values, kwargs)

    dasks = [arg.dask for arg in args if isinstance(arg, (_Frame, Scalar))]
    return new_dd_object(merge(dsk, *dasks), name, meta, args[0].divisions)
Example #23
def test_funcname():
    def foo(a, b, c):
        pass

    assert funcname(foo) == 'foo'
    assert funcname(functools.partial(foo, a=1)) == 'foo'
    assert funcname(M.sum) == 'sum'
    assert funcname(lambda: 1) == 'lambda'

    class Foo(object):
        pass

    assert funcname(Foo) == 'Foo'
    assert 'Foo' in funcname(Foo())
Example #24
def test_funcname():
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(functools.partial(foo, a=1)) == "foo"
    assert funcname(M.sum) == "sum"
    assert funcname(lambda: 1) == "lambda"

    class Foo:
        pass

    assert funcname(Foo) == "Foo"
    assert "Foo" in funcname(Foo())
Example #25
def reduction(args, chunk=None, aggregate=None, combine=None,
              meta=None, token=None, chunk_kwargs=None,
              aggregate_kwargs=None, combine_kwargs=None,
              split_every=None, **kwargs):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(arg.npartitions for arg in args
                      if isinstance(arg, _Frame))
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(token or (chunk, aggregate), meta, args,
                         chunk_kwargs, aggregate_kwargs, combine_kwargs,
                         split_every)

    # Chunk
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {(a, 0, i): (chunk, key)
               for i, key in enumerate(args[0].__dask_keys__())}
    else:
        dsk = {(a, 0, i): (apply, chunk,
                           [(x._name, i) if isinstance(x, _Frame)
                            else x for x in args], chunk_kwargs)
               for i in range(args[0].npartitions)}

    # Combine
    b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs else (combine, conc))
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]],
                        aggregate_kwargs)
    meta = make_meta(meta)

    for arg in args:
        if isinstance(arg, _Frame):
            dsk.update(arg.dask)

    return new_dd_object(dsk, b, meta, (None, None))
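A quick sketch of how this tree reduction surfaces through the public dask.dataframe API (assuming dask and pandas are installed); split_every controls the width of the chunk -> combine -> aggregate tree built above:

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(range(100)), npartitions=10)
total = s.sum(split_every=4)  # combine at most 4 partial results per tree level
print(total.compute())        # 4950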
Example #26
def __repr__(self):
    return '%s(%s)' % (
        funcname(self.func),
        ', '.join(map(repr, self.children)) +
        (', ' + ', '.join('%s=%s' % (k, v) for k, v in self.kwargs.items())
         if self.kwargs else ''))
Example #27
def test_name():
    assert funcname(sizeof) == 'sizeof'
Example #28
def map_overlap(
    func,
    df,
    before,
    after,
    *args,
    meta=no_default,
    enforce_metadata=True,
    transform_divisions=True,
    align_dataframes=True,
    **kwargs,
):
    """Apply a function to each partition, sharing rows with adjacent partitions.

    Parameters
    ----------
    func : function
        The function applied to each partition. If this function accepts
        the special ``partition_info`` keyword argument, it will receive
        information on the partition's relative location within the
        dataframe.
    df: dd.DataFrame, dd.Series
    args, kwargs :
        Positional and keyword arguments to pass to the function.
        Positional arguments are computed on a per-partition basis, while
        keyword arguments are shared across all partitions. The partition
        itself will be the first positional argument, with all other
        arguments passed *after*. Arguments can be ``Scalar``, ``Delayed``,
        or regular Python objects. DataFrame-like args (both dask and
        pandas) will be repartitioned to align (if necessary) before
        applying the function; see ``align_dataframes`` to control this
        behavior.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    before : int or timedelta
        The rows to prepend to partition ``i`` from the end of
        partition ``i - 1``.
    after : int or timedelta
        The rows to append to partition ``i`` from the beginning
        of partition ``i + 1``.
    transform_divisions : bool, default True
        Whether to apply the function onto the divisions and apply those
        transformed divisions to the output.
    align_dataframes : bool, default True
        Whether to repartition DataFrame- or Series-like args
        (both dask and pandas) so their divisions align before applying
        the function. This requires all inputs to have known divisions.
        Single-partition inputs will be split into multiple partitions.

        If False, all inputs must have either the same number of partitions
        or a single partition. Single-partition inputs will be broadcast to
        every partition of multi-partition inputs.
    $META

    See Also
    --------
    dd.DataFrame.map_overlap
    """
    args = (df, ) + args

    dfs = [df for df in args if isinstance(df, _Frame)]

    if isinstance(before, datetime.timedelta) or isinstance(
            after, datetime.timedelta):
        if not is_datetime64_any_dtype(
                dfs[0].index._meta_nonempty.inferred_type):
            raise TypeError(
                "Must have a `DatetimeIndex` when using string offset "
                "for `before` and `after`")
    else:
        if not (isinstance(before, Integral) and before >= 0
                and isinstance(after, Integral) and after >= 0):
            raise ValueError("before and after must be positive integers")

    name = kwargs.pop("token", None)
    parent_meta = kwargs.pop("parent_meta", None)

    assert callable(func)
    if name is not None:
        token = tokenize(meta, before, after, *args, **kwargs)
    else:
        name = "overlap-" + funcname(func)
        token = tokenize(func, meta, before, after, *args, **kwargs)
    name = f"{name}-{token}"

    if align_dataframes:
        args = _maybe_from_pandas(args)
        try:
            args = _maybe_align_partitions(args)
        except ValueError as e:
            raise ValueError(
                f"{e}. If you don't want the partitions to be aligned, and are "
                "calling `map_overlap` directly, pass `align_dataframes=False`."
            ) from e

    meta = _get_meta_map_partitions(args, dfs, func, kwargs, meta, parent_meta)

    if all(isinstance(arg, Scalar) for arg in args):
        layer = {
            (name, 0): (
                apply,
                func,
                (tuple, [(arg._name, 0) for arg in args]),
                kwargs,
            )
        }
        graph = HighLevelGraph.from_collections(name, layer, dependencies=args)
        return Scalar(graph, name, meta)

    args2 = []
    dependencies = []

    divisions = _get_divisions_map_partitions(align_dataframes,
                                              transform_divisions, dfs, func,
                                              args, kwargs)

    def _handle_frame_argument(arg):
        dsk = {}

        prevs_parts_dsk, prevs = _get_previous_partitions(arg, before)
        dsk.update(prevs_parts_dsk)

        nexts_parts_dsk, nexts = _get_nexts_partitions(arg, after)
        dsk.update(nexts_parts_dsk)

        name_a = "overlap-concat-" + tokenize(arg)
        for i, (prev, current,
                next) in enumerate(zip(prevs, arg.__dask_keys__(), nexts)):
            key = (name_a, i)
            dsk[key] = (_combined_parts, prev, current, next, before, after)

        graph = HighLevelGraph.from_collections(name_a,
                                                dsk,
                                                dependencies=[arg])
        return new_dd_object(graph, name_a, meta, divisions)

    for arg in args:
        if isinstance(arg, _Frame):
            arg = _handle_frame_argument(arg)
            args2.append(arg)
            dependencies.append(arg)
            continue
        arg = normalize_arg(arg)
        arg2, collections = unpack_collections(arg)
        if collections:
            args2.append(arg2)
            dependencies.extend(collections)
        else:
            args2.append(arg)

    kwargs3 = {}
    simple = True
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs3[k] = v
        if collections:
            simple = False

    if has_keyword(func, "partition_info"):
        partition_info = {(i, ): {
            "number": i,
            "division": division
        }
                          for i, division in enumerate(divisions[:-1])}

        args2.insert(0, BlockwiseDepDict(partition_info))
        orig_func = func

        def func(partition_info, *args, **kwargs):
            return orig_func(*args, **kwargs, partition_info=partition_info)

    if enforce_metadata:
        dsk = partitionwise_graph(
            apply_and_enforce,
            name,
            func,
            before,
            after,
            *args2,
            dependencies=dependencies,
            _func=overlap_chunk,
            _meta=meta,
            **kwargs3,
        )
    else:
        kwargs4 = kwargs if simple else kwargs3
        dsk = partitionwise_graph(
            overlap_chunk,
            name,
            func,
            before,
            after,
            *args2,
            **kwargs4,
            dependencies=dependencies,
        )

    graph = HighLevelGraph.from_collections(name,
                                            dsk,
                                            dependencies=dependencies)
    return new_dd_object(graph, name, meta, divisions)
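A short usage sketch of the behaviour implemented above, via the public DataFrame.map_overlap method (assuming dask.dataframe and pandas are installed): each partition is extended with `before`/`after` rows from its neighbours before func runs, and the overlap is trimmed from the result.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(10)})
ddf = dd.from_pandas(pdf, npartitions=2)

# Rolling sum over a window of 3; share 2 rows with the previous partition
# so windows that span the partition boundary are still complete.
result = ddf.map_overlap(lambda part: part.rolling(3).sum(), before=2, after=0)
print(result.compute())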
Example #29
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will
        be passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values.  Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    **kwargs:
        Key-word arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                        name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function
    to any number of iterable objects, it can be a very convenient
    means of implementing functionality that may be missing
    from other DataFrame-creation methods. For example, if you
    happen to have a priori knowledge about the number of rows
    in each of the files in a dataset, you can generate a
    DataFrame collection with a global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset+len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True")
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(func, meta, inputs, args, divisions,
                              enforce_metadata, **kwargs)
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(
        func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns.")
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
Example #30
def test_funcname_numpy_vectorize():
    np = pytest.importorskip("numpy")

    func = np.vectorize(int)

    assert funcname(func) == "vectorize_int"
Example #31
def blockwise(
    func,
    out_ind,
    *args,
    name=None,
    token=None,
    dtype=None,
    adjust_chunks=None,
    new_axes=None,
    align_arrays=True,
    concatenate=None,
    meta=None,
    **kwargs,
):
    """Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of blockwise including
    elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths
    align_arrays: bool
        Whether or not to align chunks along equally sized dimensions when
        multiple arrays are provided.  This allows for larger chunks in some
        arrays to be broken into smaller ones that match chunk sizes in other
        arrays such that they are compatible for block function mapping. If
        this is false, then an error will be thrown if arrays do not already
        have the same number of blocks in each dimension.

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> import operator, numpy as np, dask.array as da
    >>> x = da.from_array([[1, 2],
    ...                    [3, 4]], chunks=(1, 2))
    >>> y = da.from_array([[10, 20],
    ...                    [0, 0]])
    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')
    >>> z.compute()
    array([[11, 22],
           [ 3,  4]])

    Outer product multiplying a by b, two 1-d vectors

    >>> a = da.from_array([0, 1, 2], chunks=1)
    >>> b = da.from_array([10, 50, 100], chunks=1)
    >>> z = blockwise(np.outer, 'ij', a, 'i', b, 'j', dtype='f8')
    >>> z.compute()
    array([[  0,   0,   0],
           [ 10,  50, 100],
           [ 20, 100, 200]])

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)
    >>> z.compute()
    array([[1, 3],
           [2, 4]])

    The transpose case above is illustrative because it does transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')
    >>> z.compute()
    array([[11,  2],
           [23,  4]])

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying a by b, two 1-d vectors

    >>> def sequence_dot(a_blocks, b_blocks):
    ...     result = 0
    ...     for a, b in zip(a_blocks, b_blocks):
    ...         result += a.dot(b)
    ...     return result

    >>> z = blockwise(sequence_dot, '', a, 'i', b, 'i', dtype='f8')
    >>> z.compute()
    250

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(a):
    ...     return a[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': 5}, dtype=a.dtype)

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)
    >>> z.chunks
    ((1, 1, 1), (5, 5))

    If the applied function changes the size of each chunk you can specify this
    with a ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)
    >>> y.chunks
    ((2, 2), (2,))

    Include literals by indexing with None

    >>> z = blockwise(operator.add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)
    >>> z.compute()
    array([[1235, 1236],
           [1237, 1238]])
    """
    out = name
    new_axes = new_axes or {}

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError(
            "Repeated elements not allowed in output index",
            [k for k, v in toolz.frequencies(out_ind).items() if v > 1],
        )
    new = (set(out_ind) -
           {a
            for arg in args[1::2] if arg is not None
            for a in arg} - set(new_axes or ()))
    if new:
        raise ValueError("Unknown dimension", new)

    from dask.array.core import normalize_arg, unify_chunks

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args)
                   if i is not None]
        chunkss = {}
        # For each dimension, use the input chunking that has the most blocks;
        # this will ensure that broadcasting works as expected, and in
        # particular the number of blocks should be correct if the inputs are
        # consistent.
        for arg, ind in arginds:
            for c, i in zip(arg.chunks, ind):
                if i not in chunkss or len(c) > len(chunkss[i]):
                    chunkss[i] = c
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v, )
        chunkss[k] = v

    arginds = zip(arrays, args[1::2])
    numblocks = {}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []

    for arg, ind in arginds:
        if ind is None:
            arg = normalize_arg(arg)
            arg, collections = unpack_collections(arg)
            dependencies.extend(collections)
        else:
            if (hasattr(arg, "ndim") and hasattr(ind, "__len__")
                    and arg.ndim != len(ind)):
                raise ValueError(
                    "Index string %s does not match array dimension %d" %
                    (ind, arg.ndim))
            numblocks[arg.name] = arg.numblocks
            arrays.append(arg)
            arg = arg.name
        argindsstr.extend((arg, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = "{}-{}".format(
            token or utils.funcname(func).strip("_"),
            base.tokenize(func, out_ind, argindsstr, dtype, **kwargs),
        )

    graph = core_blockwise(
        func,
        out,
        out_ind,
        *argindsstr,
        numblocks=numblocks,
        dependencies=dependencies,
        new_axes=new_axes,
        concatenate=concatenate,
        **kwargs2,
    )
    graph = HighLevelGraph.from_collections(out,
                                            graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    if len(adjust_chunks[ind]) != len(chunks[i]):
                        raise ValueError(
                            f"Dimension {i} has {len(chunks[i])} blocks, adjust_chunks "
                            f"specified with {len(adjust_chunks[ind])} blocks")
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    if meta is None:
        from dask.array.utils import compute_meta

        meta = compute_meta(func, dtype, *args[::2], **kwargs)
    return new_da_object(graph, out, chunks, meta=meta, dtype=dtype)
Example #32
def map_overlap(func, df, before, after, *args, **kwargs):
    """Apply a function to each partition, sharing rows with adjacent partitions.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    df : dd.DataFrame, dd.Series
    before : int or timedelta
        The rows to prepend to partition ``i`` from the end of
        partition ``i - 1``.
    after : int or timedelta
        The rows to append to partition ``i`` from the beginning
        of partition ``i + 1``.
    args, kwargs :
        Arguments and keywords to pass to the function. The partition will
        be the first argument, and these will be passed *after*.

    See Also
    --------
    dd.DataFrame.map_overlap
    """
    if isinstance(before, datetime.timedelta) or isinstance(after, datetime.timedelta):
        if not is_datetime64_any_dtype(df.index._meta_nonempty.inferred_type):
            raise TypeError(
                "Must have a `DatetimeIndex` when using string offset "
                "for `before` and `after`"
            )
    else:
        if not (
            isinstance(before, Integral)
            and before >= 0
            and isinstance(after, Integral)
            and after >= 0
        ):
            raise ValueError("before and after must be positive integers")

    if "token" in kwargs:
        func_name = kwargs.pop("token")
        token = tokenize(df, before, after, *args, **kwargs)
    else:
        func_name = "overlap-" + funcname(func)
        token = tokenize(func, df, before, after, *args, **kwargs)

    if "meta" in kwargs:
        meta = kwargs.pop("meta")
    else:
        meta = _emulate(func, df, *args, **kwargs)
    meta = make_meta(meta, index=df._meta.index, parent_meta=df._meta)

    name = f"{func_name}-{token}"
    name_a = "overlap-prepend-" + tokenize(df, before)
    name_b = "overlap-append-" + tokenize(df, after)
    df_name = df._name

    dsk = {}

    timedelta_partition_message = (
        "Partition size is less than specified window. "
        "Try using ``df.repartition`` to increase the partition size"
    )

    if before and isinstance(before, Integral):

        prevs = [None]
        for i in range(df.npartitions - 1):
            key = (name_a, i)
            dsk[key] = (M.tail, (df_name, i), before)
            prevs.append(key)

    elif isinstance(before, datetime.timedelta):
        # Assumes monotonic (increasing?) index
        divs = pd.Series(df.divisions)
        deltas = divs.diff().iloc[1:-1]

        # In the first case window-size is larger than at least one partition, thus it is
        # necessary to calculate how many partitions must be used for each rolling task.
        # Otherwise, these calculations can be skipped (faster)

        if (before > deltas).any():
            pt_z = divs[0]
            prevs = [None]
            for i in range(df.npartitions - 1):
                # Select all indexes of relevant partitions between the current partition and
                # the partition with the highest division outside the rolling window (before)
                pt_i = divs[i + 1]

                # lower-bound the search to the first division
                lb = max(pt_i - before, pt_z)

                first, j = divs[i], i
                while first > lb and j > 0:
                    first = first - deltas[j]
                    j = j - 1

                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, k) for k in range(j, i + 1)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)

        else:
            prevs = [None]
            for i in range(df.npartitions - 1):
                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, i)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)
    else:
        prevs = [None] * df.npartitions

    if after and isinstance(after, Integral):
        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (M.head, (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    elif isinstance(after, datetime.timedelta):
        # TODO: Do we have a use-case for this? Pandas doesn't allow negative rolling windows
        deltas = pd.Series(df.divisions).diff().iloc[1:-1]
        if (after > deltas).any():
            raise ValueError(timedelta_partition_message)

        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (_head_timedelta, (df_name, i - 0), (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    else:
        nexts = [None] * df.npartitions

    for i, (prev, current, next) in enumerate(zip(prevs, df.__dask_keys__(), nexts)):
        dsk[(name, i)] = (
            overlap_chunk,
            func,
            prev,
            current,
            next,
            before,
            after,
            args,
            kwargs,
        )

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    return df._constructor(graph, name, meta, df.divisions)