Ejemplo n.º 1
0
def test_compute_divisions():
    """Check that ``compute_and_set_divisions`` yields known divisions.

    A collection built with ``sort=False`` starts out without known
    divisions; after passing a copy of it through
    ``compute_and_set_divisions`` the result must hold the same data and
    report ``known_divisions``.
    """
    from dask.dataframe.shuffle import compute_and_set_divisions

    columns = {
        "x": [1, 2, 3, 4],
        "y": [10, 20, 20, 40],
        "z": [4, 3, 2, 1],
    }
    pdf = pd.DataFrame(columns, index=[1, 3, 10, 20])

    # sort=False leaves the divisions of the collection unknown.
    unsorted_ddf = dd.from_pandas(pdf, 2, sort=False)
    assert not unsorted_ddf.known_divisions

    # Operate on a copy so the original collection stays available for
    # the data comparison below.
    duplicate = copy(unsorted_ddf)
    with_divisions = compute_and_set_divisions(duplicate)

    # Same data; divisions are deliberately excluded from the comparison.
    assert_eq(unsorted_ddf, with_divisions, check_divisions=False)
    assert with_divisions.known_divisions
Ejemplo n.º 2
0
def from_delayed(
    dfs,
    meta=None,
    divisions=None,
    prefix="from-delayed",
    verify_meta=True,
):
    """Create Dask DataFrame from many Dask Delayed objects

    Parameters
    ----------
    dfs : list of Delayed or Future
        An iterable of ``dask.delayed.Delayed`` objects, such as come from
        ``dask.delayed`` or an iterable of ``distributed.Future`` objects,
        such as come from ``client.submit`` interface. These comprise the individual
        partitions of the resulting dataframe.
        A single ``Delayed`` object is also accepted and treated as a
        one-element list.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values.  Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    prefix : str, optional
        Prefix to prepend to the keys.
    verify_meta : bool, optional
        If True check that the partitions have consistent metadata, defaults to True.

    Returns
    -------
    The collection built by ``new_dd_object`` over the given partitions.
    When ``divisions == "sorted"`` it is additionally passed through
    ``compute_and_set_divisions`` before being returned.

    Raises
    ------
    TypeError
        If any element of ``dfs`` is not a ``Delayed`` object (after
        wrapping objects that expose a ``key`` attribute).
    ValueError
        If ``dfs`` is empty and no ``meta`` was given, or if explicit
        ``divisions`` do not have ``len(dfs) + 1`` entries.
    """
    from dask.delayed import Delayed

    if isinstance(dfs, Delayed):
        dfs = [dfs]
    # Objects that are not Delayed but expose a ``key`` attribute (per the
    # docstring: futures) are wrapped so everything below sees Delayed objects.
    dfs = [
        delayed(df)
        if not isinstance(df, Delayed) and hasattr(df, "key") else df
        for df in dfs
    ]

    for df in dfs:
        if not isinstance(df, Delayed):
            raise TypeError("Expected Delayed object, got %s" %
                            type(df).__name__)

    if meta is None:
        # Metadata is inferred from the first partition, so there must be
        # one; fail with a clear message rather than an IndexError.
        if not dfs:
            raise ValueError(
                "Cannot infer meta from an empty collection of delayed "
                "objects; pass `meta` explicitly")
        meta = delayed(make_meta)(dfs[0]).compute()
    else:
        meta = make_meta(meta)

    if not dfs:
        # With explicit meta and no inputs, use a single partition built
        # from the metadata itself.
        dfs = [delayed(make_meta)(meta)]

    # Only compare against the "sorted" keyword when divisions is a string;
    # comparing an array-like with a string may yield an element-wise result
    # whose truth value is ambiguous.
    sort_requested = isinstance(divisions, str) and divisions == "sorted"

    if divisions is None or sort_requested:
        # Unknown divisions for now; for "sorted" they are computed below.
        divs = [None] * (len(dfs) + 1)
    else:
        divs = tuple(divisions)
        if len(divs) != len(dfs) + 1:
            raise ValueError("divisions should be a tuple of len(dfs) + 1")

    name = prefix + "-" + tokenize(*dfs)
    layer = DataFrameIOLayer(
        name=name,
        columns=None,
        # Output partition i maps onto the key of the i-th delayed input.
        inputs=BlockwiseDepDict(
            {(i, ): inp.key
             for i, inp in enumerate(dfs)},
            produces_keys=True,
        ),
        # Either validate each partition against meta or pass it through.
        io_func=partial(check_meta, meta=meta, funcname="from_delayed")
        if verify_meta else lambda x: x,
    )
    df = new_dd_object(HighLevelGraph.from_collections(name, layer, dfs), name,
                       meta, divs)

    if sort_requested:
        from dask.dataframe.shuffle import compute_and_set_divisions

        df = compute_and_set_divisions(df)

    return df