def test_pluck():
    """Check that ``Bag.pluck`` extracts the requested field(s) from each element.

    The bag is built by hand from a graph with two keys, ``('x', 0)`` and
    ``('x', 1)``, each holding a list of 2-tuples.  Plucking a single index
    should yield that field of every tuple; plucking a list of indices should
    yield tuples of the fields in the requested order.
    """
    d = {('x', 0): [(1, 10), (2, 20)], ('x', 1): [(3, 30), (4, 40)]}
    b = Bag(d, 'x', 2)

    # Results are compared as sets so the assertions do not depend on the
    # order in which elements come back.
    assert set(b.pluck(0)) == {1, 2, 3, 4}
    assert set(b.pluck(1)) == {10, 20, 30, 40}
    assert set(b.pluck([1, 0])) == {(10, 1), (20, 2), (30, 3), (40, 4)}
def test_pluck():
    """Check ``Bag.pluck`` results and that repeated calls share a name.

    The bag is built by hand from a graph with two keys, ``('x', 0)`` and
    ``('x', 1)``, each holding a list of 2-tuples.  Plucking a single index
    should yield that field of every tuple; plucking a list of indices should
    yield tuples of the fields in the requested order.  Two identical
    ``pluck`` calls must also produce bags whose ``name`` attributes are equal.
    """
    d = {('x', 0): [(1, 10), (2, 20)], ('x', 1): [(3, 30), (4, 40)]}
    b = Bag(d, 'x', 2)

    # Results are compared as sets so the assertions do not depend on the
    # order in which elements come back.
    assert set(b.pluck(0)) == {1, 2, 3, 4}
    assert set(b.pluck(1)) == {10, 20, 30, 40}
    assert set(b.pluck([1, 0])) == {(10, 1), (20, 2), (30, 3), (40, 4)}

    # The same pluck issued twice must be given the same name.
    first = b.pluck([1, 0])
    second = b.pluck([1, 0])
    assert first.name == second.name
def test_pluck():
    """Verify ``Bag.pluck`` for single keys, key lists, and name stability.

    A bag is assembled directly from a graph whose two keys each map to a
    list of 2-tuples.  Each case pairs a pluck key with the set of values
    expected back; the final check confirms that repeating the same pluck
    yields bags with equal ``name`` attributes.
    """
    graph = {
        ("x", 0): [(1, 10), (2, 20)],
        ("x", 1): [(3, 30), (4, 40)],
    }
    bag = Bag(graph, "x", 2)

    # (pluck key, expected elements) -- compared as sets, so order is ignored.
    cases = [
        (0, {1, 2, 3, 4}),
        (1, {10, 20, 30, 40}),
        ([1, 0], {(10, 1), (20, 2), (30, 3), (40, 4)}),
    ]
    for key, expected in cases:
        assert set(bag.pluck(key)) == expected

    # Issuing the identical pluck twice must give matching names.
    once, again = bag.pluck([1, 0]), bag.pluck([1, 0])
    assert once.name == again.name
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` across element kinds and metadata options.

    Covers bags of tuples (default columns, explicit ``columns``, explicit
    ``meta``), bags of dicts, the error raised when both ``columns`` and
    ``meta`` are given, the error raised when there is nothing to infer
    columns from, single-column bags, and partitions that are iterators or
    tuples.  The test is skipped if ``dask.dataframe`` or ``pandas`` cannot
    be imported.
    """
    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")

    def check_parts(frame, expected):
        # Each computed partition must have the same dtypes as the expected
        # frame.
        for part in dask.compute(*frame.to_delayed()):
            assert (part.dtypes == expected.dtypes).all()

    def assert_frame(frame, expected):
        # Values must match (index ignored) and so must per-partition dtypes.
        dd.utils.assert_eq(frame, expected, check_index=False)
        check_parts(frame, expected)

    # Three partitions, the middle one empty.
    graph = {
        ("test", 0): [(1, 2)],
        ("test", 1): [],
        ("test", 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, "test", 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=["a", "b"])

    # --- Elements are tuples ---
    # Without column names the resulting columns are 0 and 1.
    positional = expected.rename(columns={"a": 0, "b": 1})
    dd.utils.assert_eq(tuple_bag.to_dataframe(), positional, check_index=False)
    assert_frame(tuple_bag.to_dataframe(columns=["a", "b"]), expected)
    assert_frame(tuple_bag.to_dataframe(meta=[("a", "i8"), ("b", "i8")]), expected)

    # --- Elements are dictionaries ---
    dict_bag = tuple_bag.map(lambda row: dict(zip(["a", "b"], row)))
    inferred = dict_bag.to_dataframe()
    assert_frame(inferred, expected)
    # Converting the same bag again must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # --- With metadata specified, as a DataFrame or as (name, dtype) pairs ---
    for spec in [expected, [("a", "i8"), ("b", "i8")]]:
        assert_frame(dict_bag.to_dataframe(meta=spec), expected)

    # --- Specifying both columns and meta is an error ---
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=["a", "b"], meta=expected)

    # --- Inference fails when the first partition is empty ---
    # No element has "a" above 200, so the filter drops everything.
    emptied = dict_bag.filter(lambda row: row["a"] > 200)
    with pytest.raises(ValueError):
        emptied.to_dataframe()

    # --- Single column ---
    column_bag = dict_bag.pluck("a")
    expected_a = expected[["a"]]
    assert_frame(column_bag.to_dataframe(meta=expected_a), expected_a)

    # --- Partitions given as iterators and as tuples ---
    expected_range = pd.DataFrame({"a": range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in [iter, tuple]:
        converted = seq_bag.map_partitions(wrap).to_dataframe(meta=expected_range)
        assert_frame(converted, expected_range)
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` across element kinds and metadata options.

    Covers bags of tuples (default columns, explicit ``columns``, explicit
    ``meta``), bags of dicts, the error raised when both ``columns`` and
    ``meta`` are given, single-column bags, and partitions that are iterators
    or tuples.  The test is skipped if ``dask.dataframe`` or ``pandas`` cannot
    be imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(frame, expected):
        # Each computed partition must have the same dtypes as the expected
        # frame.
        for part in dask.compute(*frame.to_delayed()):
            assert (part.dtypes == expected.dtypes).all()

    def assert_frame(frame, expected):
        # Values must match (index ignored) and so must per-partition dtypes.
        dd.utils.assert_eq(frame, expected, check_index=False)
        check_parts(frame, expected)

    # Three partitions, the middle one empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # --- Elements are tuples ---
    # Without column names the resulting columns are 0 and 1.
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(tuple_bag.to_dataframe(), positional, check_index=False)
    assert_frame(tuple_bag.to_dataframe(columns=['a', 'b']), expected)
    assert_frame(tuple_bag.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')]), expected)

    # --- Elements are dictionaries ---
    dict_bag = tuple_bag.map(lambda row: dict(zip(['a', 'b'], row)))
    inferred = dict_bag.to_dataframe()
    assert_frame(inferred, expected)
    # Converting the same bag again must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # --- With metadata specified, as a DataFrame or as (name, dtype) pairs ---
    for spec in [expected, [('a', 'i8'), ('b', 'i8')]]:
        assert_frame(dict_bag.to_dataframe(meta=spec), expected)

    # --- Specifying both columns and meta is an error ---
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=['a', 'b'], meta=expected)

    # --- Single column ---
    column_bag = dict_bag.pluck('a')
    expected_a = expected[['a']]
    assert_frame(column_bag.to_dataframe(meta=expected_a), expected_a)

    # --- Partitions given as iterators and as tuples ---
    expected_range = pd.DataFrame({'a': range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in [iter, tuple]:
        converted = seq_bag.map_partitions(wrap).to_dataframe(meta=expected_range)
        assert_frame(converted, expected_range)
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` across element kinds and metadata options.

    Covers bags of tuples (default columns, explicit ``columns``, explicit
    ``meta``), bags of dicts, the error raised when both ``columns`` and
    ``meta`` are given, single-column bags, and partitions that are iterators
    or tuples.  The test is skipped if ``dask.dataframe`` or ``pandas`` cannot
    be imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(frame, expected):
        # Each computed partition must have the same dtypes as the expected
        # frame.
        for part in dask.compute(*frame.to_delayed()):
            assert (part.dtypes == expected.dtypes).all()

    def assert_frame(frame, expected):
        # Values must match (index ignored) and so must per-partition dtypes.
        dd.utils.assert_eq(frame, expected, check_index=False)
        check_parts(frame, expected)

    # Three partitions, the middle one empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # --- Elements are tuples ---
    # Without column names the resulting columns are 0 and 1.
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(tuple_bag.to_dataframe(), positional, check_index=False)
    assert_frame(tuple_bag.to_dataframe(columns=['a', 'b']), expected)
    assert_frame(tuple_bag.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')]), expected)

    # --- Elements are dictionaries ---
    dict_bag = tuple_bag.map(lambda row: dict(zip(['a', 'b'], row)))
    inferred = dict_bag.to_dataframe()
    assert_frame(inferred, expected)
    # Converting the same bag again must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # --- With metadata specified, as a DataFrame or as (name, dtype) pairs ---
    for spec in [expected, [('a', 'i8'), ('b', 'i8')]]:
        assert_frame(dict_bag.to_dataframe(meta=spec), expected)

    # --- Specifying both columns and meta is an error ---
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=['a', 'b'], meta=expected)

    # --- Single column ---
    column_bag = dict_bag.pluck('a')
    expected_a = expected[['a']]
    assert_frame(column_bag.to_dataframe(meta=expected_a), expected_a)

    # --- Partitions given as iterators and as tuples ---
    expected_range = pd.DataFrame({'a': range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in [iter, tuple]:
        converted = seq_bag.map_partitions(wrap).to_dataframe(meta=expected_range)
        assert_frame(converted, expected_range)
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` for tuple, dict and single-field elements.

    Covers bags of tuples (default columns and explicit ``columns``), bags of
    dicts, passing an expected DataFrame through ``columns``, and a
    single-column bag.  The test is skipped if ``dask.dataframe`` or
    ``pandas`` cannot be imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(frame, expected):
        # Each computed partition must have the same dtypes as the expected
        # frame.
        for part in dask.compute(*frame.to_delayed()):
            assert (part.dtypes == expected.dtypes).all()

    def assert_frame(frame, expected):
        # Values must match (index ignored) and so must per-partition dtypes.
        dd.utils.assert_eq(frame, expected, check_index=False)
        check_parts(frame, expected)

    # Three partitions, the middle one empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # --- Elements are tuples ---
    # Without column names the resulting columns are 0 and 1.
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(tuple_bag.to_dataframe(), positional, check_index=False)
    assert_frame(tuple_bag.to_dataframe(columns=['a', 'b']), expected)

    # --- Elements are dictionaries ---
    dict_bag = tuple_bag.map(lambda row: dict(zip(['a', 'b'], row)))
    inferred = dict_bag.to_dataframe()
    assert_frame(inferred, expected)
    # Converting the same bag again must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # --- With metadata specified (the expected frame passed as ``columns``) ---
    assert_frame(dict_bag.to_dataframe(columns=expected), expected)

    # --- Single column ---
    column_bag = dict_bag.pluck('a')
    expected_a = expected[['a']]
    assert_frame(column_bag.to_dataframe(columns=expected_a), expected_a)