Example No. 1
def test_pluck():
    d = {('x', 0): [(1, 10), (2, 20)],
         ('x', 1): [(3, 30), (4, 40)]}
    b = Bag(d, 'x', 2)
    assert set(b.pluck(0)) == set([1, 2, 3, 4])
    assert set(b.pluck(1)) == set([10, 20, 30, 40])
    assert set(b.pluck([1, 0])) == set([(10, 1), (20, 2), (30, 3), (40, 4)])
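
For readers trying this outside the test harness, the same pluck behavior is reachable through the public constructor. A minimal sketch mirroring the fixture above:

import dask.bag as db

b = db.from_sequence([(1, 10), (2, 20), (3, 30), (4, 40)], npartitions=2)
print(b.pluck(0).compute())       # [1, 2, 3, 4]
print(b.pluck([1, 0]).compute())  # [(10, 1), (20, 2), (30, 3), (40, 4)]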
Example No. 2
def to_bag(df, index=False, format="tuple"):
    """Create Dask Bag from a Dask DataFrame

    Parameters
    ----------
    index : bool, optional
        If True, the elements are tuples of ``(index, value)``, otherwise
        they're just the ``value``.  Default is False.
    format : {"tuple", "dict", "frame"}, optional
        Whether to return a bag of tuples, dictionaries, or
        dataframe-like objects. Default is "tuple". If "frame",
        the original partitions of ``df`` will not be transformed
        in any way.

    Examples
    --------
    >>> bag = df.to_bag()  # doctest: +SKIP
    """
    from dask.bag.core import Bag

    if not isinstance(df, (DataFrame, Series)):
        raise TypeError("df must be either DataFrame or Series")
    name = "to_bag-" + tokenize(df, index, format)
    if format == "frame":
        # Use existing graph and name of df, but
        # drop meta to produce a Bag collection
        dsk = df.dask
        name = df._name
    else:
        dsk = {(name, i): (_df_to_bag, block, index, format)
               for (i, block) in enumerate(df.__dask_keys__())}
        dsk.update(
            df.__dask_optimize__(df.__dask_graph__(), df.__dask_keys__()))
    return Bag(dsk, name, df.npartitions)
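
A minimal usage sketch beyond the doctest above (the tiny pandas frame is illustrative; the same options are available through the DataFrame method):

import pandas as pd

import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": [10, 20]}), npartitions=1)
print(ddf.to_bag().compute())               # [(1, 10), (2, 20)]
print(ddf.to_bag(format="dict").compute())  # [{'a': 1, 'b': 10}, {'a': 2, 'b': 20}]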
Example No. 3
from math import ceil

from elasticsearch import Elasticsearch

from dask.bag.core import Bag


# ``tokens`` (a fresh-name counter) and ``get_results`` (a search helper) are
# module-level utilities defined elsewhere in the original source.
def from_elasticsearch(host, index, query, port=9200, pagination=100):
    """ Create Bag from Elasticsearch Query

    >>> b = from_elasticsearch(host='hostname', index='reddit',
    ...                        query={"match": {'body':'Python'}})
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    count = es.count(index=index, body={'query': query})['count']

    npartitions = int(ceil(count / pagination))
    name = 'elasticsearch' + next(tokens)

    dsk = dict()
    for i in range(npartitions):
        kwargs = {
            'index': index,
            'body': {
                'query': query,
                'from': pagination * i,
                'size': pagination
            }
        }
        dsk[(name, i)] = (get_results, es, kwargs)

    return Bag(dsk, name, npartitions)
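
The partitioning arithmetic is worth spelling out: each partition issues one search over its own from/size window. A standalone sketch with made-up numbers:

from math import ceil

count, pagination = 250, 100
npartitions = int(ceil(count / pagination))  # 3
windows = [(pagination * i, pagination) for i in range(npartitions)]
print(windows)  # [(0, 100), (100, 100), (200, 100)] -- one (from, size) pair per partition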
Example No. 4
def test_reductions_are_lazy():
    current = [None]

    def part():
        for i in range(10):
            current[0] = i
            yield i

    def func(part):
        assert current[0] == 0
        return sum(part)

    b = Bag({('foo', 0): part()}, 'foo', 1)

    res = b.reduction(func, sum)

    assert res.compute(get=dask.get) == sum(range(10))
Example No. 5
def test_reductions_are_lazy():
    current = [None]

    def part():
        for i in range(10):
            current[0] = i
            yield i

    def func(part):
        assert current[0] == 0
        return sum(part)

    b = Bag({("foo", 0): part()}, "foo", 1)

    res = b.reduction(func, sum)

    assert_eq(res, sum(range(10)))
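
What both variants of this test pin down is that ``Bag.reduction`` only builds a task graph; no partition is iterated until compute time. A minimal sketch with the public API (the sum/sum pair is illustrative):

import dask.bag as db

b = db.from_sequence(range(10), npartitions=1)
total = b.reduction(sum, sum)  # graph construction only; the data is untouched
print(total.compute())         # 45 -- the work happens here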
Example No. 6
def test_pluck():
    d = {("x", 0): [(1, 10), (2, 20)], ("x", 1): [(3, 30), (4, 40)]}
    b = Bag(d, "x", 2)
    assert set(b.pluck(0)) == {1, 2, 3, 4}
    assert set(b.pluck(1)) == {10, 20, 30, 40}
    assert set(b.pluck([1, 0])) == {(10, 1), (20, 2), (30, 3), (40, 4)}
    assert b.pluck([1, 0]).name == b.pluck([1, 0]).name
Example No. 7
def bag_to_iterator(x, **kwargs):
    return Bag.from_filenames([tf.path for tf in x])
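
``Bag.from_filenames`` comes from older dask releases; current versions expose the equivalent functionality as ``dask.bag.read_text``. A rough modern rendering of the same adapter (``x`` is assumed, as above, to be a collection of temp-file objects carrying a ``.path`` attribute):

import dask.bag as db


def bag_to_iterator(x, **kwargs):
    # read_text replaced Bag.from_filenames in later dask releases
    return db.read_text([tf.path for tf in x])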
Example No. 8
def test_args():
    c = b.map(lambda x: x + 1)
    d = Bag(*c._args)

    assert list(c) == list(d)
    assert c.npartitions == d.npartitions
Example No. 9
def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {
        ('test', 0): [(1, 2)],
        ('test', 1): [],
        ('test', 2): [(10, 20), (100, 200)]
    }
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    df = b.to_dataframe(columns=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(columns=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
Example No. 10
import dask
import dask.bag as db
from dask.bag.core import (Bag, lazify, lazify_task, map, collect, reduceby,
                           reify, partition, inline_singleton_lists, optimize,
                           from_delayed)
from dask.async import get_sync  # dask.async was later renamed dask.local
from dask.compatibility import BZ2File, GzipFile, PY2
from dask.utils import filetexts, tmpfile, tmpdir, open
from dask.utils_test import inc, add

dsk = {('x', 0): (range, 5), ('x', 1): (range, 5), ('x', 2): (range, 5)}

L = list(range(5)) * 3

b = Bag(dsk, 'x', 3)


def iseven(x):
    return x % 2 == 0


def isodd(x):
    return x % 2 == 1


def test_Bag():
    assert b.name == 'x'
    assert b.npartitions == 3

Example No. 11
def bag_to_iterator(x, **kwargs):
    keys = keywords(Bag.from_sequence)
    kwargs2 = {k: v for k, v in kwargs.items() if k in keys}
    return Bag.from_sequence(x, **kwargs2)
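
The ``keywords`` helper is not shown in this snippet; presumably it returns the parameter names a function accepts, so that only applicable entries of ``kwargs`` are forwarded. A stand-in using only the standard library might look like:

import inspect


def keywords(func):
    # parameter names that ``func`` accepts, used above to filter **kwargs
    return set(inspect.signature(func).parameters)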
Example No. 12
import pytest

from dask.bag.core import (
    Bag,
    optimize,
    from_delayed,
)
from dask.bag.utils import assert_eq
from dask.delayed import Delayed
from dask.utils import filetexts, tmpfile, tmpdir
from dask.utils_test import inc, add

# Needed to pickle the lambda functions used in this test suite
pytest.importorskip("cloudpickle")

dsk = {("x", 0): (range, 5), ("x", 1): (range, 5), ("x", 2): (range, 5)}

L = list(range(5)) * 3

b = Bag(dsk, "x", 3)


def iseven(x):
    return x % 2 == 0


def isodd(x):
    return x % 2 == 1


def test_Bag():
    assert b.name == "x"
    assert b.npartitions == 3

Example No. 13
def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all() for p in
                   dask.compute(*df.to_delayed()))

    dsk = {('test', 0): [(1, 2)],
           ('test', 1): [],
           ('test', 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [('a', 'i8'), ('b', 'i8')]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=['a', 'b'], meta=sol)

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({'a': range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)
Example No. 14
def test_to_dataframe():
    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {
        ("test", 0): [(1, 2)],
        ("test", 1): [],
        ("test", 2): [(10, 20), (100, 200)]
    }
    b = Bag(dsk, "test", 3)
    sol = pd.DataFrame(b.compute(), columns=["a", "b"])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={"a": 0, "b": 1}),
                       check_index=False)
    df = b.to_dataframe(columns=["a", "b"])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[("a", "i8"), ("b", "i8")])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(["a", "b"], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [("a", "i8"), ("b", "i8")]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=["a", "b"], meta=sol)

    # Inference fails if empty first partition
    b2 = b.filter(lambda x: x["a"] > 200)
    with pytest.raises(ValueError):
        b2.to_dataframe()

    # Single column
    b = b.pluck("a")
    sol = sol[["a"]]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({"a": range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)
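
As the empty-first-partition case above shows, ``to_dataframe`` infers dtypes from the first partition unless ``meta`` is given. A minimal sketch of that escape hatch:

import dask.bag as db

b = db.from_sequence([{"a": 1, "b": 10}, {"a": 2, "b": 20}], npartitions=1)
df = b.to_dataframe(meta=[("a", "i8"), ("b", "i8")])  # dtypes supplied, no inference
print(df.compute())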