def test_set_index_does_not_repeat_work_due_to_optimizations(npartitions):
    # Atomic counter
    count = itertools.count()

    def increment():
        next(count)

    def make_part(dummy, n):
        return pd.DataFrame({"x": np.random.random(n), "y": np.random.random(n)})

    nbytes = 1e6
    nparts = 50
    n = int(nbytes / (nparts * 8))

    dsk = {("inc", i): (increment,) for i in range(nparts)}
    dsk.update({("x", i): (make_part, ("inc", i), n) for i in range(nparts)})
    ddf = dd.DataFrame(dsk, "x", make_part(None, 1), [None] * (nparts + 1))

    ddf.set_index("x", npartitions=npartitions)
    ntimes = next(count)
    assert ntimes == nparts
def test_set_index():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 2, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 5, 8]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [9, 1, 8]},
                                  index=[9, 9, 9])}
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    full = d.compute()

    d2 = d.set_index('b', npartitions=3)
    assert d2.npartitions == 3
    # assert eq(d2, full.set_index('b').sort())
    assert str(d2.compute().sort(['a'])) == str(full.set_index('b').sort(['a']))

    d3 = d.set_index(d.b, npartitions=3)
    assert d3.npartitions == 3
    # assert eq(d3, full.set_index(full.b).sort())
    assert str(d3.compute().sort(['a'])) == str(
        full.set_index(full.b).sort(['a']))

    d2 = d.set_index('b')
    assert str(d2.compute().sort(['a'])) == str(full.set_index('b').sort(['a']))
def test_set_index(engine):
    if engine == "cudf":
        # NOTE: engine == "cudf" requires cudf/dask_cudf,
        # will be skipped by non-GPU CI.
        dask_cudf = pytest.importorskip("dask_cudf")

    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 2, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 5, 8]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [9, 1, 8]}, index=[9, 9, 9]),
    }
    d = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])

    if engine == "cudf":
        d = dask_cudf.from_dask_dataframe(d)

    full = d.compute()

    d2 = d.set_index("b", npartitions=3)
    assert d2.npartitions == 3
    assert d2.index.name == "b"
    assert_eq(d2, full.set_index("b"))

    d3 = d.set_index(d.b, npartitions=3)
    assert d3.npartitions == 3
    assert d3.index.name == "b"
    assert_eq(d3, full.set_index(full.b))

    d4 = d.set_index("b")
    assert d4.index.name == "b"
    assert_eq(d4, full.set_index("b"))

    d5 = d.set_index(["b"])
    assert d5.index.name == "b"
    assert_eq(d5, full.set_index(["b"]))
def _futures_to_dask_dataframe(futures, divisions=None, client=None):
    import dask.dataframe as dd

    client = default_client(client)
    f = yield _first_completed(futures)
    empty = client.submit(get_empty, f)
    if divisions is True:
        divisions = client.map(index_min, futures)
        divisions.append(client.submit(index_max, futures[-1]))
        divisions2 = yield client._gather(divisions)
        if sorted(divisions2) != divisions2:
            divisions2 = [None] * (len(futures) + 1)
    elif divisions in (None, False):
        divisions2 = [None] * (len(futures) + 1)
    else:
        raise NotImplementedError()
    empty = yield empty

    name = 'distributed-pandas-to-dask-' + tokenize(*futures)
    dsk = {(name, i): f for i, f in enumerate(futures)}

    ensure_default_get(client)

    raise gen.Return(dd.DataFrame(dsk, name, empty, divisions2))
def test_reductions(split_every):
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every),
                      'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every),
              pd.notnull(pdf1.index).sum())
def test_reductions_frame(split_every):
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every),
                  pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every),
                  pdf1.sem(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.sem(axis=axis, ddof=0, split_every=split_every),
                  pdf1.sem(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.prod(split_every=split_every), 'dataframe-prod')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')

    # std, var, sem, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every),
                      'dataframe-prod')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every),
                      'dataframe-sem')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
dsk = { ("x", 0): pd.DataFrame({ "a": [1, 2, 3], "b": [4, 5, 6] }, index=[0, 1, 3]), ("x", 1): pd.DataFrame({ "a": [4, 5, 6], "b": [3, 2, 1] }, index=[5, 6, 8]), ("x", 2): pd.DataFrame({ "a": [7, 8, 9], "b": [0, 0, 0] }, index=[9, 9, 9]), } meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8")) d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9]) full = d.compute() CHECK_FREQ = {} if dd._compat.PANDAS_GT_110: CHECK_FREQ["check_freq"] = False def test_loc(): assert d.loc[3:8].divisions[0] == 3 assert d.loc[3:8].divisions[-1] == 8 assert d.loc[5].divisions == (5, 5) assert_eq(d.loc[5], full.loc[5:5]) assert_eq(d.loc[3:8], full.loc[3:8]) assert_eq(d.loc[:8], full.loc[:8])
def test_get_numeric_data_unknown_part():
    df = pd.DataFrame({'a': range(5), 'b': range(5), 'c': list('abcde')})
    ddf = dd.from_pandas(df, 3)
    # Drop dtype information
    ddf = dd.DataFrame(ddf.dask, ddf._name, ['a', 'b', 'c'], ddf.divisions)
    assert eq(ddf._get_numeric_data(), df._get_numeric_data())
def to_dask(self, pages=None, persist=False, progress=True):
    try:
        import dask
    except ImportError:
        raise RuntimeError("Dask is not installed.")

    if progress:
        from dask.diagnostics import ProgressBar

        ProgressBar().register()

    if pages is None:
        pages = self.page_numbers

    columns = [(k, DASK_TYPE_MAPPING[v.get("type", 'string')])
               for k, v in self.schema.items()
               if k in self.fields and not k.startswith("_")]
    column_types = dict(columns)
    url = self._url
    client_kwargs = self.session.get_client_kwargs()
    if client_kwargs["app"] is not None:
        client_kwargs["app"] = dict(client_kwargs["app"].config)

    def get_data(params):
        import httpx

        if client_kwargs["app"] is not None:
            from eve import Eve

            client_kwargs["app"] = Eve(settings=client_kwargs["app"])
        items = []
        with httpx.Client(**client_kwargs) as client:
            # Ignore failed requests and fall back to an empty page
            try:
                resp = client.get(url, params=params)
                items = resp.json().get("_items", [])
            except Exception:
                pass
        data = [{k: column_types[k](v)
                 for k, v in item.items()
                 if k in column_types}
                for item in items]
        return data

    if not self.is_tabular:
        import dask.bag as db

        return db.from_sequence(
            [self.get_page_kwargs(i) for i in pages]).map(get_data).flatten()

    import dask.dataframe as dd
    import pandas as pd

    def get_df(params):
        data = get_data(params)
        return pd.DataFrame(data, columns=list(column_types))

    dask_name = str(
        hash((self.name, ) + tuple(self.get_page_kwargs(1).values())))
    dsk = {(dask_name, i - 1): (get_df, self.get_page_kwargs(i))
           for i in pages}
    nitems = self.nitems
    divisions = list(range(0, nitems, self.items_per_page))
    if nitems not in divisions:
        divisions = divisions + [nitems]
    df = dd.DataFrame(dsk, dask_name, columns, divisions)
    if persist:
        return df.persist()
    return df
def test_reductions(split_every):
    dsk = {
        ("x", 0): pd.DataFrame(
            {"a": [1, 2, 3], "b": [4, 5, 6], "c": [True, True, False]}, index=[0, 1, 3]
        ),
        ("x", 1): pd.DataFrame(
            {"a": [4, 5, 6], "b": [3, 2, 1], "c": [False, False, False]},
            index=[5, 6, 8],
        ),
        ("x", 2): pd.DataFrame(
            {
                "a": [13094304034, 3489385935, 100006774],
                "b": [0, 0, 0],
                "c": [True, True, True],
            },
            index=[9, 9, 9],
        ),
    }
    meta = make_meta({"a": "i8", "b": "i8", "c": "bool"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [
        (ddf1.a, pdf1.a),
        (ddf1.b, pdf1.b),
        (ddf1.c, pdf1.c),
        (ddf1["a"], pdf1["a"]),
        (ddf1["b"], pdf1["b"]),
        (nands1, nans1),
        (nands2, nans2),
        (nands3, nans3),
        (boolds, bools),
    ]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())

        if scipy:
            # pandas uses unbiased skew, need to correct for that
            n = pds.shape[0]
            bias_factor = (n * (n - 1)) ** 0.5 / (n - 2)
            assert_eq(dds.skew(), pds.skew() / bias_factor)

        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())

        with warnings.catch_warnings():
            # dask.dataframe should probably filter this, to match pandas, but
            # it seems quite difficult.
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
            assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
            assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))

        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False))
        assert_eq(
            dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)
        )
        assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False))
        assert_eq(
            dds.std(skipna=False, ddof=0, split_every=split_every),
            pds.std(skipna=False, ddof=0),
        )
        assert_eq(
            dds.var(skipna=False, ddof=0, split_every=split_every),
            pds.var(skipna=False, ddof=0),
        )
        assert_eq(
            dds.sem(skipna=False, ddof=0, split_every=split_every),
            pds.sem(skipna=False, ddof=0),
        )
        assert_eq(
            dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)
        )

    assert_dask_graph(ddf1.b.sum(split_every=split_every), "series-sum")
    assert_dask_graph(ddf1.b.prod(split_every=split_every), "series-prod")
    assert_dask_graph(ddf1.b.min(split_every=split_every), "series-min")
    assert_dask_graph(ddf1.b.max(split_every=split_every), "series-max")
    assert_dask_graph(ddf1.b.count(split_every=split_every), "series-count")
    assert_dask_graph(ddf1.b.std(split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.mean(split_every=split_every), "series-mean")
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), "drop-duplicates")

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
def test_concat2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    a = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    c = dd.DataFrame(dsk, 'y', ['b', 'c'], [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60],
                                   'd': [70, 80, 90]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10],
                                   'd': [90, 80, 70]},
                                  index=[3, 4, 5])}
    d = dd.DataFrame(dsk, 'y', ['b', 'c', 'd'], [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [c.compute() for c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase), result)
        assert result.dask == dd.concat(case).dask

        result = dd.concat(case, join='inner')
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase, join='inner'), result)
        assert result.dask == dd.concat(case, join='inner').dask

        msg = ('Unable to concatenate DataFrame with unknown division '
               'specifying axis=1')
        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case, axis=1)
def test_append2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])

    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    ddf2 = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    ddf3 = dd.DataFrame(dsk, 'y', ['b', 'c'], [None, None])

    assert eq(ddf1.append(ddf2), ddf1.compute().append(ddf2.compute()))
    assert eq(ddf2.append(ddf1), ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf2), ddf1.a.compute().append(ddf2.compute()))
    assert eq(ddf2.a.append(ddf1), ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert eq(ddf1.append(ddf3), ddf1.compute().append(ddf3.compute()))
    assert eq(ddf3.append(ddf1), ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf3), ddf1.a.compute().append(ddf3.compute()))
    assert eq(ddf3.b.append(ddf1), ddf3.b.compute().append(ddf1.compute()))

    # Dask + pandas
    assert eq(ddf1.append(ddf2.compute()),
              ddf1.compute().append(ddf2.compute()))
    assert eq(ddf2.append(ddf1.compute()),
              ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf2.compute()),
              ddf1.a.compute().append(ddf2.compute()))
    assert eq(ddf2.a.append(ddf1.compute()),
              ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert eq(ddf1.append(ddf3.compute()),
              ddf1.compute().append(ddf3.compute()))
    assert eq(ddf3.append(ddf1.compute()),
              ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    assert eq(ddf1.a.append(ddf3.compute()),
              ddf1.a.compute().append(ddf3.compute()))
    assert eq(ddf3.b.append(ddf1.compute()),
              ddf3.b.compute().append(ddf1.compute()))
def dataframe_factory(out_ind, *arginds, columns=None):
    """
    Creates a dask Dataframe by broadcasting *arginds
    against each other and then ravelling them.

    .. code-block:: python

        df = dataframe_factory(("row", "chan"),
                               x, ("row",),
                               y, ("chan",))

    Parameters
    ----------
    out_ind : sequence
        Output dimensions.
        e.g. :code:`(row, chan)`
    *arginds : sequence
        Alternating sequence of (:class:`dask.array.Array`, index)
        pairs, where ``index`` names the dimensions of the array.
    columns : sequence, optional
        Dataframe column names.
        Defaults to :code:`[x, y]`
    """
    if not len(arginds) % 2 == 0:
        raise ValueError("Must supply an index for each argument")

    args = arginds[::2]
    inds = arginds[1::2]

    if columns is None:
        columns = ['x', 'y'] + ["c%d" % i for i in range(len(args) - 2)]
    else:
        if (not isinstance(columns, (tuple, list)) or
                len(columns) != len(args)):
            raise ValueError("Columns must be a tuple/list of columns "
                             "matching the number of arrays")

    have_nan_chunks = False

    new_args = []

    for a, (arg, ind) in enumerate(zip(args, inds)):
        if not all(i in out_ind for i in ind):
            raise ValueError("Argument %d dimensions not in out_ind" % a)

        if not len(ind) == arg.ndim:
            raise ValueError("Argument %d len(ind) != arg.ndim" % a)

        have_nan_chunks = (any(np.isnan(c) for dc in arg.chunks for c in dc)
                           or have_nan_chunks)

        # Generate slicing tuple that will expand arg up to full resolution
        expand = tuple(slice(None) if i in ind else None for i in out_ind)
        new_args.append(arg[expand])

    # Create meta data so that blockwise doesn't call
    # np.broadcast_arrays and fall over on the tuple
    # of arrays that it returns
    dtype = np.result_type(*args)
    meta = np.empty((0, ) * len(out_ind), dtype=dtype)

    blockargs = (v for pair in ((a, out_ind) for a in new_args) for v in pair)

    bcast = da.blockwise(np.broadcast_arrays, out_ind,
                         *blockargs,
                         subok=True,
                         align_arrays=not have_nan_chunks,
                         meta=meta,
                         dtype=dtype)

    # Now create a dataframe from the broadcasted arrays
    # with lower-level dask graph API

    # Flattened list of broadcast array keys
    # We'll use this to generate a 1D (ravelled) dataframe
    keys = product((bcast.name, ), *(range(b) for b in bcast.numblocks))
    name = "dataframe-" + tokenize(bcast)

    # dictionary defining the graph for this part of the operation
    layers = {}

    if have_nan_chunks:
        # We can't create proper indices if we don't know our chunk sizes
        divisions = [None]

        for i, key in enumerate(keys):
            layers[(name, i)] = (_create_dataframe, key, None, None, columns)
            divisions.append(None)
    else:
        # We do know all our chunk sizes, create reasonable dataframe indices
        start_idx = 0
        divisions = [0]

        expr = ((e - s for s, e in start_ends(dim_chunks))
                for dim_chunks in bcast.chunks)
        chunk_sizes = (reduce(mul, shape, 1) for shape in product(*expr))
        chunk_ranges = start_ends(chunk_sizes)

        for i, (key, (start, end)) in enumerate(zip(keys, chunk_ranges)):
            layers[(name, i)] = (_create_dataframe, key, start, end, columns)
            start_idx += end - start
            divisions.append(start_idx)

    assert len(layers) == bcast.npartitions
    assert len(divisions) == bcast.npartitions + 1

    # Create the HighLevelGraph
    graph = HighLevelGraph.from_collections(name, layers, [bcast])

    # Metadata representing the broadcasted and ravelled data
    meta = pd.DataFrame(data={k: np.empty((0, ), dtype=a.dtype)
                              for k, a in zip(columns, args)},
                        columns=columns)

    # Create the actual Dataframe
    return dd.DataFrame(graph, name, meta=meta, divisions=divisions)
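# A minimal usage sketch for dataframe_factory above, assuming its helper
# functions (`_create_dataframe`, `start_ends`) are defined alongside it:
# two 1-D dask arrays are broadcast against the ("row", "chan") grid and
# ravelled into the default 'x' and 'y' columns of the resulting dataframe.
import dask.array as da

x = da.arange(4, chunks=2)  # varies along "row"
y = da.arange(3, chunks=3)  # varies along "chan"

df = dataframe_factory(("row", "chan"), x, ("row",), y, ("chan",))
assert df.npartitions == 2           # one partition per ("row", "chan") block
assert len(df.compute()) == 4 * 3    # full broadcast: 12 ravelled rows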
def multicol_dataframe_factory(out_ind, arrays, array_dims):
    """
    Creates a dask Dataframe by broadcasting arrays
    (given by the arrays dict-like object) against each other
    and then ravelling them. The array_dims mapping specifies
    which dimensions the arrays have.

    .. code-block:: python

        df = multicol_dataframe_factory(("row", "chan"),
                                        {'x': x, 'y': y},
                                        {'x': ("row",), 'y': ("chan",)})

    Parameters
    ----------
    out_ind : sequence
        Output dimensions.
        e.g. :code:`(row, chan)`
    """
    columns = list(arrays.keys())
    have_nan_chunks = None
    expand = {}
    barr = {}

    # build up list of arguments for blockwise call below
    blockwise_args = [np.broadcast_arrays, out_ind]
    for col, arr in arrays.items():
        if col not in array_dims:
            raise ValueError(f"{col} dimensions not specified")
        arr_ind = array_dims[col]

        if not all(i in out_ind for i in arr_ind):
            raise ValueError(f"{col} dimensions not in out_ind")

        if not len(arr_ind) == arr.ndim:
            raise ValueError(f"len({col}_ind) != {col}.ndim")

        have_nan_chunks = have_nan_chunks or any(
            np.isnan(c) for dc in arr.chunks for c in dc)

        # Generate slicing tuples that will expand arr up to the full
        # resolution
        expand[col] = tuple(slice(None) if i in arr_ind else None
                            for i in out_ind)

        # broadcast version of array
        barr[col] = arr[expand[col]]
        blockwise_args += [barr[col], out_ind]

    # Create meta data so that blockwise doesn't call
    # np.broadcast_arrays and fall over on the tuple
    # of arrays that it returns
    dtype = np.result_type(*arrays.values())
    meta = np.empty((0, ) * len(out_ind), dtype=dtype)

    bcast = da.blockwise(*blockwise_args,
                         align_arrays=not have_nan_chunks,
                         meta=meta,
                         dtype=dtype)

    # Now create a dataframe from the broadcasted arrays
    # with lower-level dask graph API

    # Flattened list of broadcast array keys
    # We'll use this to generate a 1D (ravelled) dataframe
    keys = product((bcast.name, ), *(range(b) for b in bcast.numblocks))
    name = "dataframe-" + tokenize(bcast)

    # dictionary defining the graph for this part of the operation
    layers = {}

    if have_nan_chunks:
        # We can't create proper indices if we don't know our chunk sizes
        divisions = [None]

        for i, key in enumerate(keys):
            layers[(name, i)] = (_create_dataframe, key, None, None)
            divisions.append(None)
    else:
        # We do know all our chunk sizes, create reasonable dataframe indices
        start_idx = 0
        divisions = [0]

        expr = ((e - s for s, e in start_ends(dim_chunks))
                for dim_chunks in bcast.chunks)
        chunk_sizes = (reduce(mul, shape, 1) for shape in product(*expr))
        chunk_ranges = start_ends(chunk_sizes)

        for i, (key, (start, end)) in enumerate(zip(keys, chunk_ranges)):
            layers[(name, i)] = (_create_dataframe, key, start, end)
            start_idx += end - start
            divisions.append(start_idx)

    assert len(layers) == bcast.npartitions
    assert len(divisions) == bcast.npartitions + 1

    # Create the HighLevelGraph
    graph = HighLevelGraph.from_collections(name, layers, [bcast])

    # Metadata representing the broadcasted and ravelled data
    meta = pd.DataFrame(data={col: np.empty((0, ), dtype=arr.dtype)
                              for col, arr in arrays.items()},
                        columns=columns)

    # Create the actual Dataframe
    return dd.DataFrame(graph, name, meta=meta, divisions=divisions)
def test_reductions_frame(split_every):
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, "index", "columns"]:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every), pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every), pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every), pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every), pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every), pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every), pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every), pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every), pdf1.sem(axis=axis))
        assert_eq(
            ddf1.std(axis=axis, ddof=0, split_every=split_every),
            pdf1.std(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.var(axis=axis, ddof=0, split_every=split_every),
            pdf1.var(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.sem(axis=axis, ddof=0, split_every=split_every),
            pdf1.sem(axis=axis, ddof=0),
        )
        assert_eq(ddf1.mean(axis=axis, split_every=split_every), pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis="incorrect").compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(split_every=split_every), "dataframe-count")

    # std, var, sem, and mean consist of moment_* operations
    assert_dask_graph(ddf1.std(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.std(split_every=split_every), "values")

    assert_dask_graph(ddf1.var(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.var(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.var(split_every=split_every), "values")

    assert_dask_graph(ddf1.sem(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.sem(split_every=split_every), "values")

    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-count")

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every), "dataframe-count")
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every), "dataframe-std")
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every), "dataframe-sem")
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every), "dataframe-mean")
def test_arithmetics():
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()
    pdf2 = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]})
    pdf3 = pd.DataFrame({"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]})
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    dsk4 = {
        ("y", 0): pd.DataFrame({"a": [3, 2, 1], "b": [7, 8, 9]}, index=[0, 1, 3]),
        ("y", 1): pd.DataFrame({"a": [5, 2, 8], "b": [4, 2, 3]}, index=[5, 6, 8]),
        ("y", 2): pd.DataFrame({"a": [1, 4, 10], "b": [1, 0, 5]}, index=[9, 9, 9]),
    }
    ddf4 = dd.DataFrame(dsk4, "y", meta, [0, 4, 9, 9])
    pdf4 = ddf4.compute()

    # Arithmetics
    cases = [
        (ddf1, ddf1, pdf1, pdf1),
        (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1),
        (ddf2, ddf3, pdf2, pdf3),
        (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3),
        (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3),
        (ddf1, ddf4, pdf1, pdf4),
        (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4),
        (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4),
        # dask + pandas
        (ddf1, pdf4, pdf1, pdf4),
        (ddf2, pdf3, pdf2, pdf3),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b)
        check_frame_arithmetics(l, r, el, er)

    # different index, pandas raises ValueError in comparison ops
    pdf5 = pd.DataFrame(
        {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 4, 2, 3, 1, 0, 5]},
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
    )
    ddf5 = dd.from_pandas(pdf5, 2)

    pdf6 = pd.DataFrame(
        {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 5, 7, 8, 4, 2, 5]},
        index=[0, 1, 2, 3, 4, 5, 6, 7, 9],
    )
    ddf6 = dd.from_pandas(pdf6, 4)

    pdf7 = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]},
        index=list("aaabcdeh"),
    )
    pdf8 = pd.DataFrame(
        {"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]},
        index=list("abcdefgh"),
    )
    ddf7 = dd.from_pandas(pdf7, 3)
    ddf8 = dd.from_pandas(pdf8, 4)

    pdf9 = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8],
            "b": [5, 6, 7, 8, 1, 2, 3, 4],
            "c": [5, 6, 7, 8, 1, 2, 3, 4],
        },
        index=list("aaabcdeh"),
    )
    pdf10 = pd.DataFrame(
        {
            "b": [5, 6, 7, 8, 4, 3, 2, 1],
            "c": [2, 4, 5, 3, 4, 2, 1, 0],
            "d": [2, 4, 5, 3, 4, 2, 1, 0],
        },
        index=list("abcdefgh"),
    )
    ddf9 = dd.from_pandas(pdf9, 3)
    ddf10 = dd.from_pandas(pdf10, 4)

    # Arithmetics with different index
    cases = [
        (ddf5, ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6),
        (ddf7, ddf8, pdf7, pdf8),
        (ddf7.repartition(["a", "c", "h"]), ddf8.repartition(["a", "h"]), pdf7, pdf8),
        (
            ddf7.repartition(["a", "b", "e", "h"]),
            ddf8.repartition(["a", "e", "h"]),
            pdf7,
            pdf8,
        ),
        (ddf9, ddf10, pdf9, pdf10),
        (ddf9.repartition(["a", "c", "h"]), ddf10.repartition(["a", "h"]), pdf9, pdf10),
        # dask + pandas
        (ddf5, pdf6, pdf5, pdf6),
        (ddf7, pdf8, pdf7, pdf8),
        (ddf9, pdf10, pdf9, pdf10),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b, allow_comparison_ops=False)
        check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
# -*- coding:utf-8 -*-
"""
Author : 'longguangbin'
Contact : [email protected]
Date : 2018/11/23
Usage :
"""

import pandas as pd
import dask.dataframe as dd

# dd.DataFrame cannot be built directly from a list of rows; construct a
# pandas DataFrame first and partition it with from_pandas.
df = dd.from_pandas(
    pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5], [5, 5, 6]]),
    npartitions=2)
df.head()
dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]},
                              index=[0, 1, 3]),
       ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]},
                              index=[5, 6, 8]),
       ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]},
                              index=[9, 9, 9])}
d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
full = d.compute()


def test_shuffle():
    s = shuffle(d, d.b, npartitions=2)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == 2

    x = get_sync(s.dask, (s._name, 0))
    y = get_sync(s.dask, (s._name, 1))

    assert not (set(x.b) & set(y.b))  # disjoint

    assert (shuffle(d, d.b, npartitions=2)._name ==
            shuffle(d, d.b, npartitions=2)._name)
def test_fast_functions():
    df = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None, None, None])
    e = df.a + df.b
    assert len(e.dask) > 6

    assert len(dd.optimize(e.dask, e._keys())) == 6
def convert(self, rel: "org.apache.calcite.rel.RelNode",
            context: "dask_sql.Context") -> DataContainer:
    # Joining is a bit more complicated, so let's do it in steps:

    # 1. We now have two inputs (from left and right), so we fetch them both
    dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
    cc_lhs = dc_lhs.column_container
    cc_rhs = dc_rhs.column_container

    # 2. dask's merge will do some smart things with columns that have the
    # same name on lhs and rhs (which also includes reordering).
    # However, that will confuse our column numbering in SQL.
    # So we make our life easier by converting the column names into
    # unique names. We will convert back in the end.
    cc_lhs_renamed = cc_lhs.make_unique("lhs")
    cc_rhs_renamed = cc_rhs.make_unique("rhs")

    dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
    dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

    df_lhs_renamed = dc_lhs_renamed.assign()
    df_rhs_renamed = dc_rhs_renamed.assign()

    join_type = rel.getJoinType()
    join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

    # 3. The join condition can have two forms that we can understand:
    # (a) a = b
    # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
    # The first case is very simple and we do not need any additional filter.
    # In the second case we do a merge on all the a = b,
    # and then apply a filter using the other expressions.
    # In all other cases, we need to do a full table cross join and filter
    # afterwards. This is probably nonsensical for large tables, but there
    # is no other known solution so far.
    join_condition = rel.getCondition()
    lhs_on, rhs_on, filter_condition = self._split_join_condition(
        join_condition)

    logger.debug(
        f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

    # lhs_on and rhs_on are the indices of the columns to merge on.
    # The given column indices are for the full, merged table which consists
    # of lhs and rhs put side-by-side (in this order).
    # We therefore need to normalize the rhs indices relative to the rhs table.
    rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

    # 4. dask can only merge on the same column names.
    # We therefore create new columns on purpose, which have a distinct name.
    assert len(lhs_on) == len(rhs_on)
    if lhs_on:
        # 5. Now we can finally merge on these columns.
        # The resulting dataframe will contain all (renamed) columns from
        # the lhs and rhs plus the added columns.
        df = self._join_on_columns(
            df_lhs_renamed,
            df_rhs_renamed,
            lhs_on,
            rhs_on,
            join_type,
        )
    else:
        # 5. We are in the complex join case
        # where we have no column to merge on.
        # This means we have no other chance than to merge
        # everything with everything...

        # TODO: we should implement a shortcut
        # for filter conditions that are always false

        def merge_single_partitions(lhs_partition, rhs_partition):
            # Do a cross join with the two partitions
            # TODO: it would be nice to apply the filter already here
            # problem: this would mean we need to ship the rex to the
            # workers (as this is executed on the workers),
            # which is definitely not possible (java dependency, JVM start...)
            lhs_partition = lhs_partition.assign(common=1)
            rhs_partition = rhs_partition.assign(common=1)

            return lhs_partition.merge(
                rhs_partition, on="common").drop(columns="common")

        # Iterate nested over all partitions from lhs and rhs and merge them
        name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
        dsk = {(name, i * df_rhs_renamed.npartitions + j): (
                   merge_single_partitions,
                   (df_lhs_renamed._name, i),
                   (df_rhs_renamed._name, j),
               )
               for i in range(df_lhs_renamed.npartitions)
               for j in range(df_rhs_renamed.npartitions)}

        graph = HighLevelGraph.from_collections(
            name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed])

        meta = dd.dispatch.concat(
            [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty],
            axis=1)

        # TODO: Do we know the divisions in any way here?
        divisions = [None] * (len(dsk) + 1)
        df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

        warnings.warn(
            "Need to do a cross-join, which is typically very resource heavy",
            ResourceWarning,
        )

    # 6. So the next step is to make sure we have the correct column order
    # (and to remove the temporary join columns)
    correct_column_order = (list(df_lhs_renamed.columns) +
                            list(df_rhs_renamed.columns))
    cc = ColumnContainer(df.columns).limit_to(correct_column_order)

    # and to rename them like the rel specifies
    row_type = rel.getRowType()
    field_specifications = [str(f) for f in row_type.getFieldNames()]
    cc = cc.rename({
        from_col: to_col
        for from_col, to_col in zip(cc.columns, field_specifications)
    })
    cc = self.fix_column_to_row_type(cc, row_type)
    dc = DataContainer(df, cc)

    # 7. Last but not least we apply any filters by and-chaining together
    # the filters
    if filter_condition:
        # This line is a bit of code duplication with RexCallPlugin - but I
        # guess it is worth it to keep it separate
        filter_condition = reduce(
            operator.and_,
            [
                RexConverter.convert(rex, dc, context=context)
                for rex in filter_condition
            ],
        )
        logger.debug(f"Additionally applying filter {filter_condition}")
        df = filter_or_scalar(df, filter_condition)
        dc = DataContainer(df, cc)

    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())

    return dc
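# The cross-join fallback above relies on a standard trick: merge on a
# constant dummy column so every lhs row pairs with every rhs row. A
# self-contained sketch of that trick in plain pandas (the frames and the
# column name `common` mirror the code above; data here is illustrative):
import pandas as pd

lhs = pd.DataFrame({"a": [1, 2]})
rhs = pd.DataFrame({"b": ["x", "y", "z"]})

# Every row receives the same key, so the inner merge produces the full
# Cartesian product (2 * 3 = 6 rows); the key is then dropped.
cross = (lhs.assign(common=1)
            .merge(rhs.assign(common=1), on="common")
            .drop(columns="common"))
assert len(cross) == len(lhs) * len(rhs)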
def fit(self, X, y=None):
    # dd.DataFrame is a low-level graph-based constructor and cannot wrap a
    # pandas DataFrame directly; use dd.from_pandas instead.
    # npartitions=1 is an assumed minimal default here.
    if isinstance(X, pd.DataFrame):
        X = dd.from_pandas(X, npartitions=1)
    return self
def test_categorize():
    # rename y to y_ to avoid pandas future warning about ambiguous
    # levels
    meta = clear_known_categories(frames4[0]).rename(columns={"y": "y_"})
    ddf = dd.DataFrame(
        {("unknown", i): df for (i, df) in enumerate(frames3)},
        "unknown",
        meta,
        [None] * 4,
    ).rename(columns={"y": "y_"})
    ddf = ddf.assign(w=ddf.w.cat.set_categories(["x", "y", "z"]))
    assert ddf.w.cat.known
    assert not ddf.y_.cat.known
    assert not ddf.index.cat.known

    df = ddf.compute()

    for index in [None, True, False]:
        known_index = index is not False
        # By default categorize object and unknown cat columns
        ddf2 = ddf.categorize(index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        # Specifying split_every works
        ddf2 = ddf.categorize(index=index, split_every=2)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        # Specifying one column doesn't affect others
        ddf2 = ddf.categorize("v", index=index)
        assert not ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({"v": "category"}), check_categorical=False)

        ddf2 = ddf.categorize("y_", index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.dtype == "object"
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, df)

    # Specifying known categorical or no columns is a no-op:
    assert ddf.categorize(["w"], index=False) is ddf
    assert ddf.categorize([], index=False) is ddf
    assert ddf_known_index.categorize(["w"]) is ddf_known_index
    assert ddf_known_index.categorize([]) is ddf_known_index

    # Bad split_every fails
    with pytest.raises(ValueError):
        ddf.categorize(split_every=1)

    with pytest.raises(ValueError):
        ddf.categorize(split_every="foo")
def read_pairix_block(
    filepath,
    block,
    names=None,
    dtypes=None,
    usecols=None,
    chromsizes=None,
    chunk_level=0,
):
    if chromsizes is None:
        f = pypairix.open(filepath)
        cs = f.get_chromsize()
        if not len(cs):
            raise ValueError(
                "No chromsize headers found in file. "
                "They must be provided explicitly."
            )
        chromsizes = pd.Series(dict([(c, int(s)) for c, s in cs]))
        del f
    chrom1, chrom2 = block
    nrows = chromsizes[chrom1]

    meta = (
        pd.read_csv(
            filepath,
            sep="\t",
            comment="#",
            header=None,
            names=names,
            dtype=dtypes,
            usecols=usecols,
            iterator=True,
        )
        .read(1024)
        .iloc[0:0]
    )

    # Make a unique task name
    token = tokenize(filepath, chromsizes, block, names, dtypes, usecols, chunk_level)
    task_name = "read-pairix-block-" + token

    # Build the task graph
    divisions = []
    dsk = {}
    edges = LEVEL[chunk_level]
    edges = edges[: np.searchsorted(edges, nrows)]
    if edges[-1] != nrows:
        edges = np.r_[edges, nrows]
    spans = zip(edges[:-1], edges[1:])
    for i, (lo, hi) in enumerate(spans):
        if i == 0:
            divisions.append(lo)
        divisions.append(hi - 1)
        slc = slice(lo, hi)
        dsk[task_name, i] = (
            _fetch_region,
            filepath,
            chromsizes,
            slc,
            block,
            names,
            usecols,
            meta,
        )

    # Generate ddf from dask graph
    return dd.DataFrame(dsk, task_name, meta, tuple(divisions))
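# A hypothetical invocation of read_pairix_block above (the file path,
# block, and column names are illustrative, not from the source; pypairix
# must be installed and the indexed .pairs file must carry chromsize
# headers, or chromsizes must be passed explicitly):
names = ["readID", "chrom1", "pos1", "chrom2", "pos2", "strand1", "strand2"]

ddf = read_pairix_block("data.pairs.gz", ("chr1", "chr2"), names=names)
print(ddf.head())  # lazily fetches only the first chunk of the block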
from dask.dataframe.shuffle import (shuffle,
                                    partitioning_index,
                                    rearrange_by_column,
                                    rearrange_by_divisions,
                                    maybe_buffered_partd,
                                    remove_nans)
from dask.dataframe.utils import assert_eq, make_meta

dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]},
                              index=[0, 1, 3]),
       ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]},
                              index=[5, 6, 8]),
       ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]},
                              index=[9, 9, 9])}
meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
full = d.compute()

shuffle_func = shuffle  # conflicts with keyword argument


@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])
def test_shuffle(shuffle):
    s = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == d.npartitions

    x = dask.get(s.dask, (s._name, 0))
    y = dask.get(s.dask, (s._name, 1))
def test_split_apply_combine_on_series():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 6], 'b': [4, 2, 7]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 4, 6], 'b': [3, 3, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [4, 3, 7], 'b': [1, 1, 3]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)

    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())

        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))

    assert sorted(ddf1.groupby('b').a.sum().dask) == \
        sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
        sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
def test_arithmetics():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()
    pdf2 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4]})
    pdf3 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1],
                         'b': [2, 4, 5, 3, 4, 2, 1, 0]})
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    dsk4 = {('y', 0): pd.DataFrame({'a': [3, 2, 1], 'b': [7, 8, 9]},
                                   index=[0, 1, 3]),
            ('y', 1): pd.DataFrame({'a': [5, 2, 8], 'b': [4, 2, 3]},
                                   index=[5, 6, 8]),
            ('y', 2): pd.DataFrame({'a': [1, 4, 10], 'b': [1, 0, 5]},
                                   index=[9, 9, 9])}
    ddf4 = dd.DataFrame(dsk4, 'y', meta, [0, 4, 9, 9])
    pdf4 = ddf4.compute()

    # Arithmetics
    cases = [
        (ddf1, ddf1, pdf1, pdf1),
        (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1),
        (ddf2, ddf3, pdf2, pdf3),
        (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]),
         pdf2, pdf3),
        (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]),
         pdf2, pdf3),
        (ddf1, ddf4, pdf1, pdf4),
        (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4),
        (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]),
         pdf1, pdf4),
        # dask + pandas
        (ddf1, pdf4, pdf1, pdf4),
        (ddf2, pdf3, pdf2, pdf3),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b)
        check_frame_arithmetics(l, r, el, er)

    # different index, pandas raises ValueError in comparison ops
    pdf5 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
                         'b': [7, 8, 9, 4, 2, 3, 1, 0, 5]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf5 = dd.from_pandas(pdf5, 2)

    pdf6 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
                         'b': [7, 8, 9, 5, 7, 8, 4, 2, 5]},
                        index=[0, 1, 2, 3, 4, 5, 6, 7, 9])
    ddf6 = dd.from_pandas(pdf6, 4)

    pdf7 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4]},
                        index=list('aaabcdeh'))
    pdf8 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1],
                         'b': [2, 4, 5, 3, 4, 2, 1, 0]},
                        index=list('abcdefgh'))
    ddf7 = dd.from_pandas(pdf7, 3)
    ddf8 = dd.from_pandas(pdf8, 4)

    pdf9 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4],
                         'c': [5, 6, 7, 8, 1, 2, 3, 4]},
                        index=list('aaabcdeh'))
    pdf10 = pd.DataFrame({'b': [5, 6, 7, 8, 4, 3, 2, 1],
                          'c': [2, 4, 5, 3, 4, 2, 1, 0],
                          'd': [2, 4, 5, 3, 4, 2, 1, 0]},
                         index=list('abcdefgh'))
    ddf9 = dd.from_pandas(pdf9, 3)
    ddf10 = dd.from_pandas(pdf10, 4)

    # Arithmetics with different index
    cases = [
        (ddf5, ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6),
        (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]),
         pdf5, pdf6),
        (ddf7, ddf8, pdf7, pdf8),
        (ddf7.repartition(['a', 'c', 'h']), ddf8.repartition(['a', 'h']),
         pdf7, pdf8),
        (ddf7.repartition(['a', 'b', 'e', 'h']),
         ddf8.repartition(['a', 'e', 'h']), pdf7, pdf8),
        (ddf9, ddf10, pdf9, pdf10),
        (ddf9.repartition(['a', 'c', 'h']), ddf10.repartition(['a', 'h']),
         pdf9, pdf10),
        # dask + pandas
        (ddf5, pdf6, pdf5, pdf6),
        (ddf7, pdf8, pdf7, pdf8),
        (ddf9, pdf10, pdf9, pdf10),
    ]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b,
                                 allow_comparison_ops=False)
        check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
def test_reductions_frame():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean')
sns.barplot(data=hour_no, x="mnth", y="cnt", ax=ax1)
ax1.set(xlabel="Month", ylabel="Average Count", title="Average Count By Month")

# Looking at the distribution of the rentals on a daily basis, there are
# clearly periods of higher demand during a day, i.e. the moments people
# have to get home from / go to work or school, and the other moments of
# the day. Of course there is a big seasonal effect, as well as an effect
# of whether it is a working day or not.

# In[69]:

# Plot hourly distributions regarding season, day of week, workday
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
fig.set_size_inches(15, 18)

# groupby(...).mean() returns a pandas Series; wrap it in a pandas
# DataFrame (dd.DataFrame cannot be constructed this way).
hourAggregated = pd.DataFrame(
    hour_no.groupby(["hr", "season"], sort=True)["cnt"].mean()
).reset_index()
sns.pointplot(
    x=hourAggregated["hr"],
    y=hourAggregated["cnt"],
    hue=hourAggregated["season"],
    data=hourAggregated,
    join=True,
    ax=ax1,
)
ax2.set(
    xlabel="Hour Of The Day",
    ylabel="Users Count",
    title="Average Users Count By Hour Of The Day Across Season",
    label="big",
)
def test_reductions():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std')
    assert_dask_graph(ddf1.b.var(), 'series-var')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')

    eq(ddf1.index.min(), pdf1.index.min())
    eq(ddf1.index.max(), pdf1.index.max())