def test_reductions(split_every):
    # Series reductions on a hand-built dask DataFrame must match pandas,
    # for every value of the parametrized ``split_every`` tree-reduce width.
    # Three partitions with known contents; the last partition has a
    # duplicated index value (9), and divisions are [0, 4, 9, 9].
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Series with NaNs in various positions, an all-NaN series, and a boolean
    # series — exercises skipna handling and dtype-specific reductions.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)
    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        # Default (skipna=True) reductions.
        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        # skipna=False variants must also agree with pandas.
        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    # Graph node labels: each reduction should show up in the task graph
    # under the expected key prefix.
    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every),
                      'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every),
                      'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every),
                      'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every),
                      'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every),
              pd.notnull(pdf1.index).sum())
def test_split_apply_combine_on_series():
    # Groupby aggregations on a dask DataFrame must match pandas for column
    # names, Series keys, derived Series keys, and boolean Series keys.
    pdf1 = pd.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
                         'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    # Key given as a column name, a Series, and an arithmetic-derived Series.
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).a.var(ddof),
                      pdf1.groupby(pdkey).a.var(ddof))
            assert eq(ddf1.groupby(ddkey).a.std(ddof),
                      pdf1.groupby(pdkey).a.std(ddof))

        # Whole-frame aggregations with the same keys.
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof), check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof), check_dtype=False)

    # Series.groupby with a Series key (result names differ between dask
    # and pandas here, hence check_names=False).
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(ddf1.a.groupby(ddkey).var(ddof),
                      pdf1.a.groupby(pdkey).var(ddof))
            assert eq(ddf1.a.groupby(ddkey).std(ddof),
                      pdf1.a.groupby(pdkey).std(ddof))

    # Boolean Series keys derived from each column, over a range of cutoffs.
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())
        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddf1.b > i).std(ddof),
                      pdf1.groupby(pdf1.b > i).std(ddof))

    # Keys on column 'a': name, Series, derived Series, boolean Series.
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        # cast the pandas mean to float so the dtypes line up for comparison
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).b.std(ddof),
                      pdf1.groupby(pdkey).b.std(ddof))

    # Building the same groupby expression twice must produce identical
    # graphs (deterministic task-name tokenization).
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
        sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
        sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean is composed of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean is composed of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
def test_reductions_frame(split_every):
    # Frame-level reductions (both axes) on a hand-built dask DataFrame must
    # match pandas for every parametrized ``split_every`` value.
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Default-axis reductions.
    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    # Explicit axis: numeric and string spellings of both axes.
    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every),
                  pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every),
                  pdf1.sem(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.sem(axis=axis, ddof=0, split_every=split_every),
                  pdf1.sem(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    # An invalid axis name should raise.
    pytest.raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.prod(split_every=split_every), 'dataframe-prod')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')
    # std, var, sem, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every),
                      'dataframe-prod')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every),
                      'dataframe-sem')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
def test_reductions():
    # Series reductions on a hand-built dask DataFrame must match pandas.
    # Three partitions with known contents; divisions are [0, 4, 9, 9].
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # NaN-bearing, all-NaN, and boolean series to exercise skipna handling
    # and dtype-specific reductions.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)
    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        # Default (skipna=True) reductions.
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        # skipna=False variants.
        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    # Graph node labels for each reduction.
    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std')
    assert_dask_graph(ddf1.b.var(), 'series-var')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')

    # Index reductions.  Use ``assert`` like every other check in this test:
    # previously the results of these two ``eq`` calls were silently
    # discarded, so a False return would not have failed the test.
    assert eq(ddf1.index.min(), pdf1.index.min())
    assert eq(ddf1.index.max(), pdf1.index.max())
def test_reductions_frame():
    # Frame-level reductions (default axis and every axis spelling) on a
    # hand-built 3-partition dask DataFrame must match pandas.
    partitions = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                         index=[0, 1, 3]),
                  ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                         index=[5, 6, 8]),
                  ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                         index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(partitions, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Every (method, extra-kwargs) pair checked below, both with the default
    # axis and with each explicit axis spelling.
    cases = [('sum', {}), ('min', {}), ('max', {}), ('count', {}),
             ('std', {}), ('var', {}),
             ('std', {'ddof': 0}), ('var', {'ddof': 0}), ('mean', {})]

    def compare(method, **kwargs):
        # The dask reduction must agree with the pandas reduction.
        assert eq(getattr(ddf1, method)(**kwargs),
                  getattr(pdf1, method)(**kwargs))

    for method, extra in cases:
        compare(method, **extra)
        for axis in [0, 1, 'index', 'columns']:
            compare(method, axis=axis, **extra)

    # An invalid axis name should raise at compute time.
    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0 graph labels: each reduction shows up under its own task-name
    # prefix; std/var/mean are built from sum and count tasks.
    for op in ['sum', 'min', 'max', 'count']:
        assert_dask_graph(getattr(ddf1, op)(), 'dataframe-' + op)
    for op in ['std', 'var', 'mean']:
        assert_dask_graph(getattr(ddf1, op)(), 'dataframe-sum')
        assert_dask_graph(getattr(ddf1, op)(), 'dataframe-count')

    # axis=1 graph labels: each reduction carries its own name.
    for op in ['sum', 'min', 'max', 'count', 'std', 'var', 'mean']:
        assert_dask_graph(getattr(ddf1, op)(axis=1), 'dataframe-' + op)
def test_reductions(split_every):
    # Duplicate variant of the split_every-parametrized reduction test.
    # Series reductions on a hand-built dask DataFrame must match pandas.
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # NaN-bearing, all-NaN, and boolean series for skipna/dtype coverage.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)
    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        # skipna=True (default) reductions.
        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        # skipna=False variants.
        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    # Graph node labels for each reduction.
    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every),
                      'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every),
                      'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every),
                      'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every),
                      'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every),
              pd.notnull(pdf1.index).sum())
def test_split_apply_combine_on_series():
    # Variant of the groupby test that additionally checks ``size()``.
    pdf1 = pd.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
                         'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    # Key given as a column name, a Series, and an arithmetic-derived Series.
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).a.size(), pdf1.groupby(pdkey).a.size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).a.var(ddof),
                      pdf1.groupby(pdkey).a.var(ddof))
            assert eq(ddf1.groupby(ddkey).a.std(ddof),
                      pdf1.groupby(pdkey).a.std(ddof))

        # Whole-frame aggregations with the same keys.
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof), check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof), check_dtype=False)

    # Series.groupby with a Series key (names differ → check_names=False).
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(ddf1.a.groupby(ddkey).var(ddof),
                      pdf1.a.groupby(pdkey).var(ddof))
            assert eq(ddf1.a.groupby(ddkey).std(ddof),
                      pdf1.a.groupby(pdkey).std(ddof))

    # Boolean Series keys derived from each column, over a range of cutoffs.
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(ddf1.groupby(ddf1.b > i).a.size(),
                  pdf1.groupby(pdf1.b > i).a.size())
        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())
        # NOTE(review): this size() check groups by ``b > i`` while its
        # neighbours above use ``a > i`` — possibly intentional, verify.
        assert eq(ddf1.groupby(ddf1.b > i).b.size(),
                  pdf1.groupby(pdf1.b > i).b.size())

        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())
        assert eq(ddf1.groupby(ddf1.b > i).size(),
                  pdf1.groupby(pdf1.b > i).size())
        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())
        assert eq(ddf1.groupby(ddf1.a > i).size(),
                  pdf1.groupby(pdf1.a > i).size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddf1.b > i).std(ddof),
                      pdf1.groupby(pdf1.b > i).std(ddof))

    # Keys on column 'a': name, Series, derived Series, boolean Series.
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).b.size(), pdf1.groupby(pdkey).b.size())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        # cast the pandas mean to float so the dtypes line up for comparison
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).b.std(ddof),
                      pdf1.groupby(pdkey).b.std(ddof))

    # Building the same groupby expression twice must produce identical
    # graphs (deterministic task-name tokenization).
    assert (sorted(ddf1.groupby('b').a.sum().dask) ==
            sorted(ddf1.groupby('b').a.sum().dask))
    assert (sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) ==
            sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask))

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean is composed of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')
    assert_dask_graph(ddf1.groupby('b').a.size(), 'series-groupby-size')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean is composed of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
    assert_dask_graph(ddf1.groupby('b').size(), 'dataframe-groupby-size')
def test_reductions():
    """Compare dask Series reductions against the same pandas Series.

    Builds a 3-partition dask DataFrame from an explicit task graph, plus
    NaN-containing and boolean Series, then checks that each dask reduction
    (sum/min/max/count/std/var/mean/nunique/nbytes, with and without
    ``skipna``/``ddof``) matches the pandas result, and that graph node
    labels carry the expected names.

    NOTE(review): a later definition of ``test_reductions`` appears in this
    module; under pytest that later definition shadows this one — confirm
    whether this copy is intentionally kept.
    """
    # Hand-built task graph: three partitions with explicit indexes matching
    # the divisions passed to dd.DataFrame below.
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Series with NaNs in various positions to exercise skipna behaviour,
    # including an all-NaN series.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    # Boolean dtype series.
    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        # Default (skipna=True) reductions.
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)
        # skipna=False variants.
        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    # Graph node labels: std/var keys embed the ddof value here.
    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std(ddof=1)')
    assert_dask_graph(ddf1.b.var(), 'series-var(ddof=1)')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std(ddof=0)')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var(ddof=0)')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')
def test_reductions_frame():
    """Compare dask DataFrame reductions against the computed pandas frame.

    Checks whole-frame reductions for the default axis and for
    ``axis in (0, 1, 'index', 'columns')``, that an invalid axis raises
    ``ValueError``, and that graph node labels have the expected names.

    NOTE(review): a later definition of ``test_reductions_frame`` appears in
    this module and shadows this one under pytest — confirm intent.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Default-axis reductions.
    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    # Both numeric and string axis spellings must agree with pandas.
    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    # Invalid axis should surface as ValueError at compute time.
    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, mean consists from sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')
    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum(axis=1)')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min(axis=1)')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max(axis=1)')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count(axis=1)')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std(axis=1, ddof=1)')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var(axis=1, ddof=1)')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean(axis=1)')
def test_split_apply_combine_on_series():
    """Compare dask groupby aggregations against pandas on the same data.

    Covers grouping by a column name, a Series, a derived Series, and a
    boolean predicate; Series-level and frame-level aggregations; graph-key
    determinism; KeyError on bad keys; and graph node labels.

    NOTE(review): other definitions of this test appear later in the module
    and shadow this one under pytest — confirm intent.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 6], 'b': [4, 2, 7]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 4, 6], 'b': [3, 3, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [4, 3, 7], 'b': [1, 1, 3]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Group by column name, by the Series itself, and by a derived Series.
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    # Series.groupby(Series): names differ from pandas, hence check_names=False.
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)

    # Group by boolean predicates over a range of thresholds.
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())
        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())

    # Same set of checks grouping on column 'a' in its several forms.
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        # dask mean is float; cast the pandas side for comparison.
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))

    # Graph keys must be deterministic across identical expressions.
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
        sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
        sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
def test_reductions(split_every):
    """Compare dask Series reductions against pandas, parameterized by
    ``split_every`` (tree-reduction fan-in; presumably supplied by a pytest
    fixture/parametrize outside this view — confirm).

    Exercises int, bool, large-int, and NaN-containing Series; skew (when
    scipy is available), std/var/sem with warnings handled, skipna=False
    variants, graph node labels, and index-level reductions.
    """
    # Hand-built task graph; column 'a' of the last partition uses values
    # beyond 32-bit range.
    dsk = {
        ("x", 0): pd.DataFrame(
            {"a": [1, 2, 3], "b": [4, 5, 6], "c": [True, True, False]}, index=[0, 1, 3]
        ),
        ("x", 1): pd.DataFrame(
            {"a": [4, 5, 6], "b": [3, 2, 1], "c": [False, False, False]},
            index=[5, 6, 8],
        ),
        ("x", 2): pd.DataFrame(
            {
                "a": [13094304034, 3489385935, 100006774],
                "b": [0, 0, 0],
                "c": [True, True, True],
            },
            index=[9, 9, 9],
        ),
    }
    meta = make_meta({"a": "i8", "b": "i8", "c": "bool"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # NaN layouts: scattered NaNs, mostly-NaN, and all-NaN.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [
        (ddf1.a, pdf1.a),
        (ddf1.b, pdf1.b),
        (ddf1.c, pdf1.c),
        (ddf1["a"], pdf1["a"]),
        (ddf1["b"], pdf1["b"]),
        (nands1, nans1),
        (nands2, nans2),
        (nands3, nans3),
        (boolds, bools),
    ]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())

        if scipy:
            # pandas uses unbiased skew, need to correct for that
            n = pds.shape[0]
            bias_factor = (n * (n - 1)) ** 0.5 / (n - 2)
            assert_eq(dds.skew(), pds.skew() / bias_factor)

        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())

        with warnings.catch_warnings():
            # dask.dataframe should probably filter this, to match pandas, but
            # it seems quite difficult.
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
            assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
            assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))

        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        # skipna=False variants.
        assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False))
        assert_eq(
            dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)
        )
        assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False))
        assert_eq(
            dds.std(skipna=False, ddof=0, split_every=split_every),
            pds.std(skipna=False, ddof=0),
        )
        assert_eq(
            dds.var(skipna=False, ddof=0, split_every=split_every),
            pds.var(skipna=False, ddof=0),
        )
        assert_eq(
            dds.sem(skipna=False, ddof=0, split_every=split_every),
            pds.sem(skipna=False, ddof=0),
        )
        assert_eq(
            dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)
        )

    # Graph node labels.
    assert_dask_graph(ddf1.b.sum(split_every=split_every), "series-sum")
    assert_dask_graph(ddf1.b.prod(split_every=split_every), "series-prod")
    assert_dask_graph(ddf1.b.min(split_every=split_every), "series-min")
    assert_dask_graph(ddf1.b.max(split_every=split_every), "series-max")
    assert_dask_graph(ddf1.b.count(split_every=split_every), "series-count")
    assert_dask_graph(ddf1.b.std(split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.mean(split_every=split_every), "series-mean")
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), "drop-duplicates")

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
def test_reductions_frame(split_every):
    """Compare dask DataFrame reductions against pandas, parameterized by
    ``split_every`` (tree-reduction fan-in; presumably supplied by a pytest
    fixture/parametrize outside this view — confirm).

    Covers default-axis and per-axis reductions, the ValueError for an
    invalid axis, and the expected graph node labels for axis=0 and axis=1.
    """
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Default-axis reductions.
    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    # Numeric and string axis spellings must agree with pandas.
    for axis in [0, 1, "index", "columns"]:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every), pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every), pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every), pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every), pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every), pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every), pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every), pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every), pdf1.sem(axis=axis))
        assert_eq(
            ddf1.std(axis=axis, ddof=0, split_every=split_every),
            pdf1.std(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.var(axis=axis, ddof=0, split_every=split_every),
            pdf1.var(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.sem(axis=axis, ddof=0, split_every=split_every),
            pdf1.sem(axis=axis, ddof=0),
        )
        assert_eq(ddf1.mean(axis=axis, split_every=split_every), pdf1.mean(axis=axis))

    # Invalid axis should surface as ValueError at compute time.
    pytest.raises(ValueError, lambda: ddf1.sum(axis="incorrect").compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(split_every=split_every), "dataframe-count")

    # std, var, sem, and mean consist of moment_* operations
    assert_dask_graph(ddf1.std(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.std(split_every=split_every), "values")

    assert_dask_graph(ddf1.var(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.var(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.var(split_every=split_every), "values")

    assert_dask_graph(ddf1.sem(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.sem(split_every=split_every), "values")

    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-count")

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every), "dataframe-count")
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every), "dataframe-std")
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every), "dataframe-sem")
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every), "dataframe-mean")
def test_split_apply_combine_on_series():
    """Compare dask groupby aggregations against pandas on the same data.

    Covers grouping by a column name, a Series, a derived Series, and a
    boolean predicate; Series-level and frame-level aggregations; graph-key
    determinism; KeyError on bad keys; and graph node labels.

    NOTE(review): this module contains multiple definitions of this test
    (apparently concatenated revisions); only the last one runs under
    pytest — confirm intent.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 6], 'b': [4, 2, 7]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 4, 6], 'b': [3, 3, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [4, 3, 7], 'b': [1, 1, 3]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Group by column name, the Series itself, and a derived Series.
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    # Series.groupby(Series): names differ from pandas, hence check_names=False.
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)

    # Group by boolean predicates over a range of thresholds.
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())
        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())

    # Same set of checks grouping on column 'a' in its several forms.
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        # dask mean is float; cast the pandas side for comparison.
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))

    # Graph keys must be deterministic across identical expressions.
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
        sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
        sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
def test_split_apply_combine_on_series():
    """Compare dask groupby aggregations against pandas on the same data.

    Covers grouping by a column name, a Series, a derived Series, and a
    boolean predicate; Series-level and frame-level aggregations; graph-key
    determinism; KeyError on bad keys; and graph node labels.

    NOTE(review): earlier definitions of this test appear above in the
    module (apparently concatenated revisions); this last one is the one
    pytest actually runs — confirm intent.
    """
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 6], "b": [4, 2, 7]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 4, 6], "b": [3, 3, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [4, 3, 7], "b": [1, 1, 3]}, index=[9, 9, 9]),
    }
    ddf1 = dd.DataFrame(dsk, "x", ["a", "b"], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Group by column name, the Series itself, and a derived Series.
    for ddkey, pdkey in [("b", "b"), (ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(), pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    # Series.groupby(Series): names differ from pandas, hence check_names=False.
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(), pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)

    # Group by boolean predicates over a range of thresholds.
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(), pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(), pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(), pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(), pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(ddf1.groupby(ddf1.a > i).b.sum(), pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(), pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(), pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(), pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(ddf1.groupby(ddf1.b > i).sum(), pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(), pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(), pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(), pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(), pdf1.groupby(pdf1.b > i).mean())
        assert eq(ddf1.groupby(ddf1.a > i).sum(), pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(), pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(), pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(), pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(), pdf1.groupby(pdf1.a > i).mean())

    # Same set of checks grouping on column "a" in its several forms.
    for ddkey, pdkey in [("a", "a"), (ddf1.a, pdf1.a), (ddf1.a + 1, pdf1.a + 1),
                         (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(), pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        # dask mean is float; cast the pandas side for comparison.
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean().astype(float))

    # Graph keys must be deterministic across identical expressions.
    assert sorted(ddf1.groupby("b").a.sum().dask) == sorted(ddf1.groupby("b").a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby("x"))
    assert raises(KeyError, lambda: ddf1.groupby(["a", "x"]))
    assert raises(KeyError, lambda: ddf1.groupby("a")["x"])
    assert raises(KeyError, lambda: ddf1.groupby("a")["b", "x"])
    assert raises(KeyError, lambda: ddf1.groupby("a")[["b", "x"]])

    # test graph node labels
    assert_dask_graph(ddf1.groupby("b").a.sum(), "series-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").a.min(), "series-groupby-min")
    assert_dask_graph(ddf1.groupby("b").a.max(), "series-groupby-max")
    assert_dask_graph(ddf1.groupby("b").a.count(), "series-groupby-count")
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby("b").a.mean(), "series-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").a.mean(), "series-groupby-count")
    assert_dask_graph(ddf1.groupby("b").a.nunique(), "series-groupby-nunique")

    assert_dask_graph(ddf1.groupby("b").sum(), "dataframe-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").min(), "dataframe-groupby-min")
    assert_dask_graph(ddf1.groupby("b").max(), "dataframe-groupby-max")
    assert_dask_graph(ddf1.groupby("b").count(), "dataframe-groupby-count")
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby("b").mean(), "dataframe-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").mean(), "dataframe-groupby-count")