def test_pluck():
    d = {('x', 0): [(1, 10), (2, 20)],
         ('x', 1): [(3, 30), (4, 40)]}
    b = Bag(d, 'x', 2)
    assert set(b.pluck(0)) == set([1, 2, 3, 4])
    assert set(b.pluck(1)) == set([10, 20, 30, 40])
    assert set(b.pluck([1, 0])) == set([(10, 1), (20, 2), (30, 3), (40, 4)])
def to_bag(df, index=False, format="tuple"):
    """Create Dask Bag from a Dask DataFrame

    Parameters
    ----------
    index : bool, optional
        If True, the elements are tuples of ``(index, value)``, otherwise
        they're just the ``value``.  Default is False.
    format : {"tuple", "dict", "frame"}, optional
        Whether to return a bag of tuples, dictionaries, or
        dataframe-like objects. Default is "tuple". If "frame", the
        original partitions of ``df`` will not be transformed in any way.

    Examples
    --------
    >>> bag = df.to_bag()  # doctest: +SKIP
    """
    from dask.bag.core import Bag

    if not isinstance(df, (DataFrame, Series)):
        raise TypeError("df must be either DataFrame or Series")
    name = "to_bag-" + tokenize(df, index, format)
    if format == "frame":
        # Use existing graph and name of df, but
        # drop meta to produce a Bag collection
        dsk = df.dask
        name = df._name
    else:
        dsk = {(name, i): (_df_to_bag, block, index, format)
               for (i, block) in enumerate(df.__dask_keys__())}
        dsk.update(df.__dask_optimize__(df.__dask_graph__(), df.__dask_keys__()))
    return Bag(dsk, name, df.npartitions)
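# Usage sketch (not part of the original source; assumes pandas and
# dask.dataframe are installed, and the small frame below is illustrative).
def _to_bag_demo():
    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": [10, 20]}),
                         npartitions=2)
    assert ddf.to_bag().compute() == [(1, 10), (2, 20)]
    assert ddf.to_bag(format="dict").compute() == [{"a": 1, "b": 10},
                                                   {"a": 2, "b": 20}]
    # index=True also includes each row's index value in the element
    ddf.to_bag(index=True).compute()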
def from_elasticsearch(host, index, query, port=9200, pagination=100):
    """Create Bag from Elasticsearch Query

    >>> b = from_elasticsearch(host='hostname', index='reddit',
    ...                        query={"match": {'body': 'Python'}})
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    count = es.count(index=index, body={'query': query})['count']
    npartitions = int(ceil(count / pagination))
    name = 'elasticsearch' + next(tokens)
    dsk = dict()
    for i in range(npartitions):
        kwargs = {'index': index,
                  'body': {'query': query,
                           'from': pagination * i,
                           'size': pagination}}
        dsk[(name, i)] = (get_results, es, kwargs)
    return Bag(dsk, name, npartitions)
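# The helpers `tokens` and `get_results` are referenced above but not defined
# in this snippet. A plausible sketch under stated assumptions: `tokens` is
# assumed to be a stream of unique task-name suffixes, and `get_results` is
# assumed to run one paginated search and return the matching documents.
from itertools import count

tokens = ('-%d' % i for i in count(1))  # assumption: unique name suffixes


def get_results(es, kwargs):
    # Assumed body: fetch one page of hits, unwrapped to their source docs.
    return [hit['_source'] for hit in es.search(**kwargs)['hits']['hits']]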
def test_reductions_are_lazy():
    current = [None]

    def part():
        for i in range(10):
            current[0] = i
            yield i

    def func(part):
        assert current[0] == 0
        return sum(part)

    b = Bag({('foo', 0): part()}, 'foo', 1)
    res = b.reduction(func, sum)
    assert res.compute(get=dask.get) == sum(range(10))
def test_reductions_are_lazy():
    current = [None]

    def part():
        for i in range(10):
            current[0] = i
            yield i

    def func(part):
        # Only the first element has been drawn from the generator when the
        # per-partition function runs, so the reduction has not materialized
        # the whole partition up front.
        assert current[0] == 0
        return sum(part)

    b = Bag({("foo", 0): part()}, "foo", 1)
    res = b.reduction(func, sum)
    assert_eq(res, sum(range(10)))
def test_pluck():
    d = {("x", 0): [(1, 10), (2, 20)], ("x", 1): [(3, 30), (4, 40)]}
    b = Bag(d, "x", 2)
    assert set(b.pluck(0)) == {1, 2, 3, 4}
    assert set(b.pluck(1)) == {10, 20, 30, 40}
    assert set(b.pluck([1, 0])) == {(10, 1), (20, 2), (30, 3), (40, 4)}
    # Identical pluck calls should tokenize to the same deterministic name
    assert b.pluck([1, 0]).name == b.pluck([1, 0]).name
def bag_to_iterator(x, **kwargs):
    return Bag.from_filenames([tf.path for tf in x])
def test_args():
    c = b.map(lambda x: x + 1)
    d = Bag(*c._args)
    assert list(c) == list(d)
    assert c.npartitions == d.npartitions
def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {('test', 0): [(1, 2)],
           ('test', 1): [],
           ('test', 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    df = b.to_dataframe(columns=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(columns=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
import dask
import dask.bag as db
from dask.bag.core import (Bag, lazify, lazify_task, map, collect, reduceby,
                           reify, partition, inline_singleton_lists, optimize,
                           from_delayed)
from dask.async import get_sync  # note: this module was later renamed dask.local
from dask.compatibility import BZ2File, GzipFile, PY2
from dask.utils import filetexts, tmpfile, tmpdir, open
from dask.utils_test import inc, add

dsk = {('x', 0): (range, 5),
       ('x', 1): (range, 5),
       ('x', 2): (range, 5)}

L = list(range(5)) * 3

b = Bag(dsk, 'x', 3)


def iseven(x):
    return x % 2 == 0


def isodd(x):
    return x % 2 == 1


def test_Bag():
    assert b.name == 'x'
    assert b.npartitions == 3
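# Illustration (not part of the original source): the `dsk` dict above is a
# raw dask task graph. Keys are (name, partition_index) pairs and values are
# task tuples that a scheduler evaluates; each key materializes one partition
# of the bag `b`. Using the synchronous scheduler referenced in these tests:
def _graph_structure_demo():
    assert list(dask.get(dsk, ('x', 0))) == [0, 1, 2, 3, 4]
    assert sorted(b.compute()) == sorted(L)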
def bag_to_iterator(x, **kwargs):
    keys = keywords(Bag.from_sequence)
    kwargs2 = dict((k, v) for k, v in kwargs.items() if k in keys)
    return Bag.from_sequence(x, **kwargs2)
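# `keywords` is used above but not defined in this snippet. A minimal sketch
# of such a helper using the standard library (the name comes from the
# snippet; this body is an assumption):
import inspect


def keywords(func):
    # Parameter names that ``func`` accepts, so unsupported kwargs can be
    # filtered out before calling it.
    return set(inspect.signature(func).parameters)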
    optimize,
    from_delayed,
)
from dask.bag.utils import assert_eq
from dask.delayed import Delayed
from dask.utils import filetexts, tmpfile, tmpdir
from dask.utils_test import inc, add

# Needed to pickle the lambda functions used in this test suite
pytest.importorskip("cloudpickle")

dsk = {("x", 0): (range, 5), ("x", 1): (range, 5), ("x", 2): (range, 5)}

L = list(range(5)) * 3

b = Bag(dsk, "x", 3)


def iseven(x):
    return x % 2 == 0


def isodd(x):
    return x % 2 == 1


def test_Bag():
    assert b.name == "x"
    assert b.npartitions == 3
def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {('test', 0): [(1, 2)],
           ('test', 1): [],
           ('test', 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [('a', 'i8'), ('b', 'i8')]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=['a', 'b'], meta=sol)

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({'a': range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)
def test_to_dataframe():
    dd = pytest.importorskip("dask.dataframe")
    pd = pytest.importorskip("pandas")

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {("test", 0): [(1, 2)],
           ("test", 1): [],
           ("test", 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, "test", 3)
    sol = pd.DataFrame(b.compute(), columns=["a", "b"])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={"a": 0, "b": 1}),
                       check_index=False)
    df = b.to_dataframe(columns=["a", "b"])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[("a", "i8"), ("b", "i8")])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(["a", "b"], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [("a", "i8"), ("b", "i8")]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=["a", "b"], meta=sol)

    # Inference fails if empty first partition
    b2 = b.filter(lambda x: x["a"] > 200)
    with pytest.raises(ValueError):
        b2.to_dataframe()

    # Single column
    b = b.pluck("a")
    sol = sol[["a"]]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({"a": range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)
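# Compact usage sketch of the ``meta=`` pattern the test exercises (the data
# below is illustrative; the call itself mirrors the test above).
def _to_dataframe_meta_demo():
    import dask.bag as db

    b = db.from_sequence([{"a": 1, "b": 10}, {"a": 2, "b": 20}],
                         npartitions=2)
    # Declaring the schema up front skips dtype inference on the
    # first partition.
    df = b.to_dataframe(meta=[("a", "i8"), ("b", "i8")])
    return df.compute()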