Ejemplo n.º 1
0
def test_pluck():
    """Check that ``Bag.pluck`` pulls one field, or several, out of each element."""
    # Two partitions, each holding two 2-tuples.
    graph = {
        ('x', 0): [(1, 10), (2, 20)],
        ('x', 1): [(3, 30), (4, 40)],
    }
    bag = Bag(graph, 'x', 2)

    # A scalar index selects that position from every tuple.
    firsts = set(bag.pluck(0))
    assert firsts == {1, 2, 3, 4}
    seconds = set(bag.pluck(1))
    assert seconds == {10, 20, 30, 40}

    # A list of indices yields tuples in the requested order.
    swapped = set(bag.pluck([1, 0]))
    assert swapped == {(10, 1), (20, 2), (30, 3), (40, 4)}
Ejemplo n.º 2
0
def test_pluck():
    """Check ``Bag.pluck`` results and that repeated calls share a name."""
    # Two partitions, each holding two 2-tuples.
    graph = {
        ('x', 0): [(1, 10), (2, 20)],
        ('x', 1): [(3, 30), (4, 40)],
    }
    bag = Bag(graph, 'x', 2)

    # Scalar index: one field per element.
    assert set(bag.pluck(0)) == {1, 2, 3, 4}
    assert set(bag.pluck(1)) == {10, 20, 30, 40}

    # List index: a tuple of fields per element, in the requested order.
    expected_pairs = {(10, 1), (20, 2), (30, 3), (40, 4)}
    assert set(bag.pluck([1, 0])) == expected_pairs

    # Plucking with the same key twice must produce the same name.
    first_call = bag.pluck([1, 0])
    second_call = bag.pluck([1, 0])
    assert first_call.name == second_call.name
Ejemplo n.º 3
0
def test_pluck():
    """Check ``Bag.pluck`` for scalar and list keys, plus name stability."""
    # Two partitions, each holding two 2-tuples.
    graph = {
        ("x", 0): [(1, 10), (2, 20)],
        ("x", 1): [(3, 30), (4, 40)],
    }
    bag = Bag(graph, "x", 2)

    # (key passed to pluck, set of elements the resulting bag must contain)
    cases = [
        (0, {1, 2, 3, 4}),
        (1, {10, 20, 30, 40}),
        ([1, 0], {(10, 1), (20, 2), (30, 3), (40, 4)}),
    ]
    for key, expected in cases:
        assert set(bag.pluck(key)) == expected

    # Plucking with the same key twice must produce the same name.
    first_call = bag.pluck([1, 0])
    second_call = bag.pluck([1, 0])
    assert first_call.name == second_call.name
Ejemplo n.º 4
0
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` with tuple, dict and scalar elements.

    The test is skipped when ``dask.dataframe`` or ``pandas`` cannot be
    imported.
    """
    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")

    def check_parts(df, sol):
        # Every computed partition must have the same dtypes as ``sol``.
        parts = dask.compute(*df.to_delayed())
        assert all((part.dtypes == sol.dtypes).all() for part in parts)

    def check_frame(df, sol):
        # Same contents (index ignored) and matching per-partition dtypes.
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Three partitions; the middle one is empty.
    graph = {
        ("test", 0): [(1, 2)],
        ("test", 1): [],
        ("test", 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, "test", 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=["a", "b"])

    # Elements are tuples
    frame = tuple_bag.to_dataframe()
    positional = expected.rename(columns={"a": 0, "b": 1})
    dd.utils.assert_eq(frame, positional, check_index=False)
    check_frame(tuple_bag.to_dataframe(columns=["a", "b"]), expected)
    check_frame(tuple_bag.to_dataframe(meta=[("a", "i8"), ("b", "i8")]), expected)

    # Elements are dictionaries
    dict_bag = tuple_bag.map(lambda x: dict(zip(["a", "b"], x)))
    frame = dict_bag.to_dataframe()
    check_frame(frame, expected)
    # Converting the same bag twice must give the same name.
    assert frame._name == dict_bag.to_dataframe()._name

    # With metadata specified
    for meta in (expected, [("a", "i8"), ("b", "i8")]):
        check_frame(dict_bag.to_dataframe(meta=meta), expected)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=["a", "b"], meta=expected)

    # Inference fails if empty first partition
    filtered_out = dict_bag.filter(lambda x: x["a"] > 200)
    with pytest.raises(ValueError):
        filtered_out.to_dataframe()

    # Single column
    column_bag = dict_bag.pluck("a")
    single = expected[["a"]]
    check_frame(column_bag.to_dataframe(meta=single), single)

    # Works with iterators and tuples
    counting = pd.DataFrame({"a": range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in (iter, tuple):
        wrapped = seq_bag.map_partitions(wrap)
        check_frame(wrapped.to_dataframe(meta=counting), counting)
Ejemplo n.º 5
0
def test_to_dataframe():
    """Exercise ``Bag.to_dataframe`` with tuple, dict and scalar elements.

    The test is skipped when ``dask.dataframe`` or ``pandas`` cannot be
    imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        # Every computed partition must have the same dtypes as ``sol``.
        parts = dask.compute(*df.to_delayed())
        assert all((part.dtypes == sol.dtypes).all() for part in parts)

    def check_frame(df, sol):
        # Same contents (index ignored) and matching per-partition dtypes.
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Three partitions; the middle one is empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # Elements are tuples
    frame = tuple_bag.to_dataframe()
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(frame, positional, check_index=False)
    check_frame(tuple_bag.to_dataframe(columns=['a', 'b']), expected)
    check_frame(tuple_bag.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')]),
                expected)

    # Elements are dictionaries
    dict_bag = tuple_bag.map(lambda x: dict(zip(['a', 'b'], x)))
    frame = dict_bag.to_dataframe()
    check_frame(frame, expected)
    # Converting the same bag twice must give the same name.
    assert frame._name == dict_bag.to_dataframe()._name

    # With metadata specified
    for meta in (expected, [('a', 'i8'), ('b', 'i8')]):
        check_frame(dict_bag.to_dataframe(meta=meta), expected)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=['a', 'b'], meta=expected)

    # Single column
    column_bag = dict_bag.pluck('a')
    single = expected[['a']]
    check_frame(column_bag.to_dataframe(meta=single), single)

    # Works with iterators and tuples
    counting = pd.DataFrame({'a': range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in (iter, tuple):
        wrapped = seq_bag.map_partitions(wrap)
        check_frame(wrapped.to_dataframe(meta=counting), counting)
Ejemplo n.º 6
0
def test_to_dataframe():
    """Verify ``Bag.to_dataframe`` output for several element kinds.

    Covers tuple elements (default, ``columns=`` and ``meta=``), dict
    elements, the ``columns`` + ``meta`` conflict, a single plucked column,
    and partitions that are iterators or tuples.  Skipped when
    ``dask.dataframe`` or ``pandas`` cannot be imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        # Each computed partition must carry the dtypes of the expected frame.
        for part in dask.compute(*df.to_delayed()):
            assert (part.dtypes == sol.dtypes).all()

    # Three partitions; the middle one is empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # Elements are tuples
    unnamed = tuple_bag.to_dataframe()
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(unnamed, positional, check_index=False)

    named = tuple_bag.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(named, expected, check_index=False)
    check_parts(named, expected)

    typed = tuple_bag.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')])
    dd.utils.assert_eq(typed, expected, check_index=False)
    check_parts(typed, expected)

    # Elements are dictionaries
    dict_bag = tuple_bag.map(lambda x: dict(zip(['a', 'b'], x)))
    inferred = dict_bag.to_dataframe()
    dd.utils.assert_eq(inferred, expected, check_index=False)
    check_parts(inferred, expected)
    # Converting the same bag twice must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # With metadata specified
    for meta in (expected, [('a', 'i8'), ('b', 'i8')]):
        with_meta = dict_bag.to_dataframe(meta=meta)
        dd.utils.assert_eq(with_meta, expected, check_index=False)
        check_parts(with_meta, expected)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        dict_bag.to_dataframe(columns=['a', 'b'], meta=expected)

    # Single column
    column_bag = dict_bag.pluck('a')
    single = expected[['a']]
    one_column = column_bag.to_dataframe(meta=single)
    dd.utils.assert_eq(one_column, single, check_index=False)
    check_parts(one_column, single)

    # Works with iterators and tuples
    counting = pd.DataFrame({'a': range(100)})
    seq_bag = db.from_sequence(range(100), npartitions=5)
    for wrap in (iter, tuple):
        converted = seq_bag.map_partitions(wrap).to_dataframe(meta=counting)
        dd.utils.assert_eq(converted, counting, check_index=False)
        check_parts(converted, counting)
Ejemplo n.º 7
0
def test_to_dataframe():
    """Verify ``Bag.to_dataframe`` for tuple, dict and single-column bags.

    In this variant the expected frame is supplied through the ``columns``
    argument.  Skipped when ``dask.dataframe`` or ``pandas`` cannot be
    imported.
    """
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        # Each computed partition must carry the dtypes of the expected frame.
        for part in dask.compute(*df.to_delayed()):
            assert (part.dtypes == sol.dtypes).all()

    # Three partitions; the middle one is empty.
    graph = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)],
    }
    tuple_bag = Bag(graph, 'test', 3)
    expected = pd.DataFrame(tuple_bag.compute(), columns=['a', 'b'])

    # Elements are tuples
    unnamed = tuple_bag.to_dataframe()
    positional = expected.rename(columns={'a': 0, 'b': 1})
    dd.utils.assert_eq(unnamed, positional, check_index=False)

    named = tuple_bag.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(named, expected, check_index=False)
    check_parts(named, expected)

    # Elements are dictionaries
    dict_bag = tuple_bag.map(lambda x: dict(zip(['a', 'b'], x)))
    inferred = dict_bag.to_dataframe()
    dd.utils.assert_eq(inferred, expected, check_index=False)
    check_parts(inferred, expected)
    # Converting the same bag twice must give the same name.
    assert inferred._name == dict_bag.to_dataframe()._name

    # With metadata specified (a DataFrame passed via ``columns``)
    from_frame = dict_bag.to_dataframe(columns=expected)
    dd.utils.assert_eq(from_frame, expected, check_index=False)
    check_parts(from_frame, expected)

    # Single column
    column_bag = dict_bag.pluck('a')
    single = expected[['a']]
    one_column = column_bag.to_dataframe(columns=single)
    dd.utils.assert_eq(one_column, single, check_index=False)
    check_parts(one_column, single)