def test_compute_divisions():
    """Check that compute_and_set_divisions fills in divisions on an
    unsorted-construction dataframe without altering its data."""
    from dask.dataframe.shuffle import compute_and_set_divisions

    frame = pd.DataFrame(
        {"x": [1, 2, 3, 4], "y": [10, 20, 20, 40], "z": [4, 3, 2, 1]},
        index=[1, 3, 10, 20],
    )
    # sort=False leaves the collection without known divisions
    ddf = dd.from_pandas(frame, 2, sort=False)
    assert not ddf.known_divisions

    result = compute_and_set_divisions(copy(ddf))

    # Data is unchanged; only the divisions metadata was computed.
    assert_eq(ddf, result, check_divisions=False)
    assert result.known_divisions
def from_delayed(
    dfs,
    meta=None,
    divisions=None,
    prefix="from-delayed",
    verify_meta=True,
):
    """Create Dask DataFrame from many Dask Delayed objects

    Parameters
    ----------
    dfs : list of Delayed or Future
        An iterable of ``dask.delayed.Delayed`` objects, such as come from
        ``dask.delayed`` or an iterable of ``distributed.Future`` objects,
        such as come from ``client.submit`` interface. These comprise the
        individual partitions of the resulting dataframe.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see
        https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values.  Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    prefix : str, optional
        Prefix to prepend to the keys.
    verify_meta : bool, optional
        If True check that the partitions have consistent metadata, defaults
        to True.

    Raises
    ------
    TypeError
        If any element of ``dfs`` is neither a Delayed nor a Future-like
        object (something exposing a ``.key``).
    ValueError
        If ``dfs`` is empty and no ``meta`` was provided, or if ``divisions``
        has the wrong length.
    """
    from dask.delayed import Delayed

    if isinstance(dfs, Delayed):
        dfs = [dfs]
    # Wrap Future-like objects (anything with a ``.key`` that is not already
    # a Delayed) so everything downstream is uniformly Delayed.
    dfs = [
        delayed(df) if not isinstance(df, Delayed) and hasattr(df, "key") else df
        for df in dfs
    ]
    for df in dfs:
        if not isinstance(df, Delayed):
            raise TypeError("Expected Delayed object, got %s" % type(df).__name__)

    if meta is None:
        # Guard before indexing dfs[0]: an empty list used to surface as a
        # cryptic IndexError here; fail with an actionable message instead.
        if not dfs:
            raise ValueError(
                "Cannot infer metadata from an empty sequence of delayed "
                "objects; please provide `meta`."
            )
        meta = delayed(make_meta)(dfs[0]).compute()
    else:
        meta = make_meta(meta)

    if not dfs:
        # Empty input with explicit meta: build one empty partition from it.
        dfs = [delayed(make_meta)(meta)]

    if divisions is None or divisions == "sorted":
        divs = [None] * (len(dfs) + 1)
    else:
        divs = tuple(divisions)
        if len(divs) != len(dfs) + 1:
            raise ValueError("divisions should be a tuple of len(dfs) + 1")

    name = prefix + "-" + tokenize(*dfs)
    layer = DataFrameIOLayer(
        name=name,
        columns=None,
        inputs=BlockwiseDepDict(
            # Map partition index -> delayed key; produces_keys=True tells the
            # layer these values are task keys, not literal data.
            {(i,): inp.key for i, inp in enumerate(dfs)},
            produces_keys=True,
        ),
        # Optionally verify each partition against meta as it is materialized.
        io_func=partial(check_meta, meta=meta, funcname="from_delayed")
        if verify_meta
        else lambda x: x,
    )
    df = new_dd_object(
        HighLevelGraph.from_collections(name, layer, dfs), name, meta, divs
    )

    if divisions == "sorted":
        # Compute index min/max of each partition to set real divisions.
        from dask.dataframe.shuffle import compute_and_set_divisions

        df = compute_and_set_divisions(df)

    return df