def test_add():
    """Check that distributed ``add`` matches the in-memory ``SparseFrame.add``.

    Two 12x12 identity frames are added, the second with its index
    shifted by one, and the dense results are compared.
    """
    base = pd.DataFrame(np.identity(12))
    shifted = base.copy()
    shifted.index += 1

    # Reference result computed with the non-distributed SparseFrame.
    expected = sp.SparseFrame(base).add(sp.SparseFrame(shifted)).todense()

    lhs = dsp.from_pandas(base, npartitions=4)
    rhs = dsp.from_pandas(shifted, npartitions=4)
    actual = lhs.add(rhs).compute().todense()

    pdt.assert_frame_equal(actual, expected)
def test_loc(iindexer, correct_shape):
    """Check that ``.loc`` computes to a ``SparseFrame`` of the expected shape.

    ``iindexer`` is the object passed to ``.loc`` and ``correct_shape`` is
    the shape the computed result must have. Both are supplied by the
    caller (presumably a parametrization that is not visible here).
    """
    labels = list('ABCDEFGHIJ')
    frame = pd.DataFrame(np.random.rand(10, 2), index=labels)
    distributed = dsp.from_pandas(frame, npartitions=2)

    selected = distributed.loc[iindexer].compute()

    assert isinstance(selected, sp.SparseFrame)
    assert selected.shape == correct_shape
def test_todense_series():
    """Check that ``todense`` on a single column gives a matching dask Series."""
    source = pd.DataFrame(np.random.rand(10, 2))
    column = dsp.from_pandas(source, npartitions=3)[0]

    dense = column.todense()
    assert isinstance(dense, dd.Series)

    # dtype is deliberately not compared, only the values and index.
    pdt.assert_series_equal(dense.compute(), source[0], check_dtype=False)
def test_distributed_join(how):
    """Check that a distributed join agrees with the pandas join for ``how``.

    The two frames have disjoint column labels and partially overlapping
    indices (0-9 and 5-14). Missing cells in the pandas reference are
    filled with 0 before comparison.
    """
    lhs = pd.DataFrame(
        np.identity(10),
        index=np.arange(10),
        columns=list('ABCDEFGHIJ'),
    )
    rhs = pd.DataFrame(
        np.identity(10),
        index=np.arange(5, 15),
        columns=list('KLMNOPQRST'),
    )
    expected = lhs.join(rhs, how=how).fillna(0)

    dist_lhs = dsp.from_pandas(lhs, npartitions=2)
    dist_rhs = dsp.from_pandas(rhs, npartitions=2)
    actual = dist_lhs.join(dist_rhs, how=how).compute().todense()

    pdt.assert_frame_equal(expected, actual)
def test_map_partitions():
    """Check that mapping an identity function keeps result type and shape."""
    frame = pd.DataFrame(np.random.rand(10, 2))
    distributed = dsp.from_pandas(frame, npartitions=3)

    mapped = distributed.map_partitions(lambda part: part, distributed._meta)
    computed = mapped.compute()

    assert isinstance(computed, sp.SparseFrame)
    assert computed.shape == (10, 2)
def test_repartition_n_divisions(start_part, end_part):
    """Check repartitioning from ``start_part`` to ``end_part`` partitions.

    The repartitioned object must still be a distributed ``SparseFrame``,
    report ``end_part`` partitions, and compute back to the original data.
    """
    original = pd.DataFrame(np.identity(10))
    distributed = dsp.from_pandas(original, npartitions=start_part)

    repartitioned = distributed.repartition(npartitions=end_part)

    assert isinstance(repartitioned, dsp.SparseFrame)
    assert repartitioned.npartitions == end_part

    round_tripped = repartitioned.compute().todense()
    pdt.assert_frame_equal(original, round_tripped)
def test_assign_column():
    """Check that assigning a dask Series as column ``new`` gives correct data.

    Also verifies that the meta object of the result is still empty.
    """
    values = pd.Series(np.arange(10))
    lazy_values = dd.from_pandas(values, npartitions=2)
    frame = pd.DataFrame(np.random.rand(10, 2), columns=['a', 'b'])

    assigned = dsp.from_pandas(frame, npartitions=2).assign(new=lazy_values)
    assert assigned._meta.empty

    dense = assigned.compute().todense()
    expected = frame.assign(new=values)
    assert np.all((dense == expected).values)
def test_repartition_divisions(arg_dict):
    """Check ``repartition`` called with the keyword arguments in ``arg_dict``.

    When ``arg_dict`` contains ``'divisions'``, the resulting divisions must
    equal the requested ones. In every case the data must round-trip
    unchanged.
    """
    original = pd.DataFrame(np.identity(100))
    distributed = dsp.from_pandas(original, npartitions=4)

    repartitioned = distributed.repartition(**arg_dict)
    assert isinstance(repartitioned, dsp.SparseFrame)

    if 'divisions' in arg_dict:
        requested = tuple(arg_dict['divisions'])
        assert tuple(repartitioned.divisions) == requested

    round_tripped = repartitioned.compute().todense()
    pdt.assert_frame_equal(original, round_tripped)
def test_map_partitions_mappable():
    """Check ``map_partitions`` with an iterable keyword and a scalar keyword.

    ``x`` is passed as a generator yielding 2 then 3, and ``y`` is the
    constant 2. The assertions expect the first five rows to equal 4 and
    the remaining rows to equal 6.
    """
    ones = pd.DataFrame(np.ones((10, 2)))
    distributed = dsp.from_pandas(ones, chunksize=5)

    def foo(sf, x, y):
        scaled = sf.data * x * y
        return sp.SparseFrame(scaled, index=sf.index, columns=sf.columns)

    mapped = distributed.map_partitions(
        foo,
        distributed._meta,
        x=(i for i in range(2, 4)),
        y=2,
    )
    dense = mapped.compute().todense()

    assert dense.shape == (10, 2)
    head, tail = dense.iloc[:5, :], dense.iloc[5:, :]
    assert (head == 4).all().all()
    assert (tail == 6).all().all()
def test_groupby_sum(idx, sorted):
    """Check ``groupby_sum`` against a pandas group-by on index level 0.

    ``idx`` is used as the index of a 100-row frame of ones. When
    ``sorted`` is falsy, the divisions are replaced with ``None`` entries
    before grouping. Both results are sorted by index before comparison.
    """
    ones = np.ones(100)
    frame = pd.DataFrame(dict(A=ones, B=np.ones(100)), index=idx)
    expected = frame.groupby(level=0).sum().sort_index()

    distributed = dsp.from_pandas(frame, npartitions=2)
    if not sorted:
        # Replace every division boundary with None.
        distributed.divisions = [None] * (distributed.npartitions + 1)
    assert distributed.npartitions == 2

    grouped = distributed.groupby_sum(split_out=3)
    assert grouped.npartitions == 3

    actual = grouped.compute().todense().sort_index()
    pdt.assert_frame_equal(actual, expected)
def test_getitem(item, raises):
    """Check column selection with ``[]`` on a distributed frame.

    When ``raises`` is truthy, selecting ``item`` must raise ``KeyError``.
    Otherwise the selection must expose the expected column labels both
    before and after computing, and match the pandas selection: compared
    as a frame when ``item`` is a list, as a series otherwise.
    """
    frame = pd.DataFrame(
        np.random.rand(10, 3),
        columns=list('XYZ'),
        index=list('ABCDEFGHIJ'),
    )
    distributed = dsp.from_pandas(frame, npartitions=2)

    if raises:
        with pytest.raises(KeyError):
            distributed[item]
        return

    is_list = isinstance(item, list)
    expected_cols = item if is_list else [item]

    selected = distributed[item]
    assert selected.columns.tolist() == expected_cols

    computed = selected.compute()
    assert computed.columns.tolist() == expected_cols

    compare = pdt.assert_frame_equal if is_list else pdt.assert_series_equal
    compare(frame[item], computed.todense())
def test_repr():
    """Check that ``__repr__`` returns a string for 2-column and 100-column frames."""
    for n_columns in (2, 100):
        frame = pd.DataFrame(np.random.rand(10, n_columns))
        distributed = dsp.from_pandas(frame, npartitions=3)
        assert isinstance(distributed.__repr__(), str)
def test_from_pandas():
    """Check that a frame built by ``from_pandas`` computes to a (10, 2) ``SparseFrame``."""
    frame = pd.DataFrame(np.random.rand(10, 2))
    computed = dsp.from_pandas(frame, npartitions=3).compute()

    assert isinstance(computed, sp.SparseFrame)
    assert computed.shape == (10, 2)
def dsf_arange(sf_arange):
    """Build a distributed frame from the dense form of ``sf_arange``.

    The frame is created with ``chunksize=5``.
    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view -- confirm.
    """
    dense = sf_arange.todense()
    return dsp.from_pandas(dense, chunksize=5)
def dsf():
    """Build a distributed frame from random 10x2 data with columns ``A`` and ``B``.

    The frame is created with ``npartitions=3``.
    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view -- confirm.
    """
    frame = pd.DataFrame(np.random.rand(10, 2), columns=['A', 'B'])
    return dsp.from_pandas(frame, npartitions=3)