def test__futures_to_dask_bag(s, a, b):
    """Scattered lists round-trip into a dask Bag that computes the same
    results as a locally constructed bag."""
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(data)
    remote_bag = yield _futures_to_dask_bag(futures)
    # One partition per scattered sub-list.
    assert isinstance(remote_bag, db.Bag)
    assert remote_bag.npartitions == len(data)

    local_bag = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    # Inner lambda params renamed so they no longer shadow the outer arg.
    exprs = [lambda bag: bag.map(lambda n: n + 1).sum(),
             lambda bag: bag.filter(lambda n: n % 2)]
    for expr in exprs:
        expected = expr(local_bag).compute(get=dask.get)
        future = e.compute(expr(remote_bag))
        actual = yield future._result()
        assert expected == actual

    yield e._shutdown()
def test__futures_to_collection(s, a, b):
    """_futures_to_collection dispatches by payload type (dataframe,
    array, bag) and builds the same graph as the type-specific helper."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    # DataFrame futures -> dd.DataFrame, identical graph either way.
    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    # Array futures -> dask array.
    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    # Plain list futures -> dask bag.
    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bag1 = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)
    assert type(bag1) == type(bag2)
    # BUG FIX: original asserted b.dask == b.dask (always true); compare
    # the generic collection's graph against the bag-specific one.
    assert bag1.dask == bag2.dask

    yield e._shutdown()
def test__futures_to_dask_bag(s, a, b):
    """A bag reconstructed from scattered futures evaluates expressions
    to the same values as an equivalent local bag."""
    import dask.bag as db
    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    partitions = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield executor._scatter(partitions)
    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(partitions)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    for expr in (lambda bag: bag.map(lambda n: n + 1).sum(),
                 lambda bag: bag.filter(lambda n: n % 2)):
        local = expr(lb).compute(get=dask.get)
        remote_future = executor.compute(expr(rb))
        remote = yield remote_future._result()
        assert local == remote

    yield executor._shutdown()
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """Read text file(s) from HDFS, split into lines.

    Globs ``fn``, reads each file's bytes split on ``lineterminator``,
    decodes with ``encoding``/``errors``, and splits into lines.

    Returns (via ``gen.Return``):
        collection=True,  lazy=True  -> a lazy dask Bag of lines
        collection=True,  lazy=False -> a dask Bag backed by futures
        collection=False, lazy=True  -> a list of delayed values
        collection=False, lazy=False -> a list of futures
    """
    from hdfs3 import HDFileSystem
    from dask import do
    # NOTE(review): removed unused ``import pandas as pd`` — nothing in
    # this function references pandas.
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = sorted(hdfs.glob(fn))
    blocks = [block
              for fn in filenames
              for block in read_bytes(fn, executor, hdfs, lazy=True,
                                      delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    # ``unicode`` is Python 2 only — presumably this module targets py2;
    # TODO confirm before porting.
    lines = [do(unicode.split)(s, lineterminator) for s in strings]
    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)
    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)
def test__futures_to_collection(c, s, a, b):
    """_futures_to_collection infers the collection type (dataframe,
    array, bag) and produces the same graph as the dedicated helper."""
    # DataFrame futures -> dd.DataFrame.
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    # Array futures -> dask array.
    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    # Plain list futures -> dask bag.  Locals renamed so they no longer
    # shadow the ``c`` (client) and ``b`` (worker) fixtures.
    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bag1 = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)
    assert type(bag1) == type(bag2)
    # BUG FIX: original asserted b.dask == b.dask (always true); compare
    # the two bags' graphs against each other.
    assert bag1.dask == bag2.dask
def test__futures_to_dask_bag(c, s, a, b):
    """Bag built from scattered futures matches a local bag's results."""
    import dask.bag as db

    partitions = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield c._scatter(partitions)
    remote_bag = yield _futures_to_dask_bag(futures)
    assert isinstance(remote_bag, db.Bag)
    assert remote_bag.npartitions == len(partitions)

    local_bag = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    # Inner lambda params renamed to avoid shadowing the outer argument.
    for expr in (lambda bag: bag.map(lambda n: n + 1).sum(),
                 lambda bag: bag.filter(lambda n: n % 2)):
        expected = expr(local_bag).compute(get=dask.get)
        actual = yield c.compute(expr(remote_bag))
        assert expected == actual
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """Read text file(s) from HDFS and split them into lines.

    Globs ``fn`` on HDFS, reads byte blocks delimited by
    ``lineterminator``, decodes them, and splits each block into lines.

    Returns (via ``gen.Return``):
        collection=True,  lazy=True  -> a lazy dask Bag of lines
        collection=True,  lazy=False -> a dask Bag backed by futures
        collection=False, lazy=True  -> a list of delayed values
        collection=False, lazy=False -> a list of futures
    """
    from hdfs3 import HDFileSystem
    from dask import do
    # NOTE(review): removed unused ``import pandas as pd`` — pandas is
    # never referenced in this function.
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = sorted(hdfs.glob(fn))
    blocks = [block
              for fn in filenames
              for block in read_bytes(fn, executor, hdfs, lazy=True,
                                      delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    # ``unicode`` exists only on Python 2 — presumably this module
    # targets py2; TODO confirm before porting.
    lines = [do(unicode.split)(s, lineterminator) for s in strings]
    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)
    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)