def test_from_imperative():
    """``dd.from_imperative`` should wrap lazy pandas objects as dask collections.

    Builds four lazy time-indexed frames, wraps them as a dask dataframe and
    as a dask series, and checks the metadata and per-partition lengths.
    """
    from dask import do

    expected_lengths = [1, 2, 3, 4]

    # One lazy frame per partition; the length assertions below rely on the
    # i-th frame having i rows.
    lazy_frames = [do(tm.makeTimeDataFrame)(nrows) for nrows in range(1, 5)]

    # DataFrame case: declared columns must match the computed result.
    ddf = dd.from_imperative(lazy_frames, columns=['A', 'B', 'C', 'D'])
    computed_frame = ddf.compute()
    assert (computed_frame.columns == ddf.columns).all()
    frame_lengths = ddf.map_partitions(len).compute()
    assert list(frame_lengths) == expected_lengths

    # Series case: select column A lazily from each frame.
    lazy_series = [frame.A for frame in lazy_frames]
    dseries = dd.from_imperative(lazy_series, columns='A')
    computed_series = dseries.compute()
    assert computed_series.name == dseries.name
    series_lengths = dseries.map_partitions(len).compute()
    assert list(series_lengths) == expected_lengths
def test_from_imperative():
    """Exercise ``dd.from_imperative`` for both dataframe and series inputs."""
    from dask import do

    # Lazy pandas frames; the i-th one is expected to hold i rows.
    delayed_frames = []
    for size in range(1, 5):
        delayed_frames.append(do(tm.makeTimeDataFrame)(size))

    frame = dd.from_imperative(delayed_frames, columns=["A", "B", "C", "D"])
    columns_match = frame.compute().columns == frame.columns
    assert columns_match.all()
    assert list(frame.map_partitions(len).compute()) == [1, 2, 3, 4]

    # Attribute access on a lazy frame gives a lazy series for column A.
    delayed_columns = [lazy.A for lazy in delayed_frames]
    series = dd.from_imperative(delayed_columns, columns="A")
    assert series.compute().name == series.name
    assert list(series.map_partitions(len).compute()) == [1, 2, 3, 4]
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
              header=True, names=None, **kwargs):
    """Read the CSV files matching ``fn`` on HDFS into a dask dataframe.

    This is a generator-style coroutine: it yields on the future-to-dataframe
    conversion and delivers its result by raising ``gen.Return``
    (presumably ``tornado.gen`` -- confirm against the file's imports).

    Parameters
    ----------
    fn: string
        Glob pattern passed to ``hdfs.glob``.
    executor: optional
        Passed through ``default_executor`` to resolve the executor to use.
    hdfs: optional
        File system object; a new ``HDFileSystem()`` is created when falsy.
    lazy: boolean, optional (defaults to False)
        If true, return a dask dataframe built from lazy values.  Otherwise
        submit the work to the executor and build the dataframe from the
        resulting futures.
    lineterminator: string, optional (defaults to a newline)
        Used both as the block delimiter and as the ``lineterminator``
        keyword forwarded to the CSV parser.
    header: boolean, optional (defaults to True)
        Whether each file starts with a header row.  When true, the first
        row of each file's first block is skipped.
    names: optional
        Column names.  When ``None`` and ``header`` is true they are taken
        from the first five rows of the first matching file.
    **kwargs:
        Extra keyword arguments forwarded to ``pd.read_csv`` and
        ``buffer_to_csv``.

    Returns
    -------
    A dask dataframe (via ``gen.Return``).
    """
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd

    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator

    # Sort so partition order is deterministic, matching _read_text.
    filenames = sorted(hdfs.glob(fn))
    blockss = [read_binary(path, executor, hdfs, lazy=True,
                           delimiter=lineterminator)
               for path in filenames]

    if names is None and header:
        # Peek at the first file to learn the column names.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
        names = head.columns

    dfs = []
    for blocks in blockss:
        if not blocks:
            # A file that produced no blocks contributes no partitions.
            continue
        if header:
            # Only the first block of a file carries the header row.
            first = do(buffer_to_csv)(blocks[0], names=names, skiprows=1,
                                      **kwargs)
        else:
            # No header row: skipping a row here would drop real data.
            first = do(buffer_to_csv)(blocks[0], names=names, **kwargs)
        dfs.append(first)
        for block in blocks[1:]:
            dfs.append(do(buffer_to_csv)(block, names=names, **kwargs))

    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs, columns=names))
    else:
        futures = executor.compute(*dfs)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """Read the text files matching ``fn`` on HDFS as lists of lines.

    This is a generator-style coroutine: it yields on the future-to-bag
    conversion and delivers its result by raising ``gen.Return``
    (presumably ``tornado.gen`` -- confirm against the file's imports).

    Parameters
    ----------
    fn: string
        Glob pattern passed to ``hdfs.glob``; matches are read in sorted
        order.
    encoding, errors: string, optional
        Passed to ``bytes.decode`` for every block.
    lineterminator: string, optional (defaults to a newline)
        Encoded and used as the block delimiter, and used to split each
        decoded block into lines.
    executor: optional
        Passed through ``default_executor`` to resolve the executor to use.
    hdfs: optional
        File system object; a new ``HDFileSystem()`` is created when falsy.
    lazy: boolean, optional (defaults to True)
        If false, the work is submitted with ``executor.compute``.
    collection: boolean, optional (defaults to True)
        If true, wrap the result in a dask bag; otherwise return the list
        of lazy values (``lazy=True``) or the futures (``lazy=False``).

    Returns
    -------
    A dask bag, a list of lazy values, or futures (via ``gen.Return``),
    depending on ``lazy`` and ``collection``.
    """
    from hdfs3 import HDFileSystem
    from dask import do

    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    delimiter = lineterminator.encode()
    blocks = []
    for path in sorted(hdfs.glob(fn)):
        blocks.extend(read_bytes(path, executor, hdfs, lazy=True,
                                 delimiter=delimiter))

    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    if lazy:
        if not collection:
            raise gen.Return(lines)
        from dask.bag import from_imperative
        ensure_default_get(executor)
        raise gen.Return(from_imperative(lines))

    futures = executor.compute(lines)
    if not collection:
        raise gen.Return(futures)

    from distributed.collections import _futures_to_dask_bag
    ensure_default_get(executor)
    bag = yield _futures_to_dask_bag(futures)
    raise gen.Return(bag)
def read_csv(block_lists, header, head, kwargs, lazy=True, collection=True,
             executor=None):
    """ Turn per-file lists of byte blocks into a dask.dataframe or raw parts

    Every inner list of ``block_lists`` holds the futures/values of bytes of
    one logical file, in order.  The first block of each file is parsed with
    an empty prefix; every later block is parsed with ``header`` as prefix.

    Parameters
    ----------
    block_lists: list of lists of futures of bytes
        One inner list per logical file
    header: bytestring
        Header taken from the front of the first file, supplied to every
        block except the first of each file
    head: pd.DataFrame
        Example Pandas DataFrame used as metadata for the collection
    kwargs: dict
        Keyword arguments handed down to ``pd.read_csv``
    lazy: boolean, optional (defaults to True)
        If false, the result is handed to the executor before returning
    collection: boolean, optional (defaults to True)
        If true, build a dask.dataframe; otherwise keep the flat list

    Returns
    -------
    A dask.dataframe, or list of futures or values, depending on the value
    of lazy and collection.
    """
    from dask.dataframe import from_imperative

    executor = default_executor(executor)

    # Flatten file by file, keeping block order within each file.
    parts = []
    for file_blocks in block_lists:
        parts.append(do(bytes_read_csv)(file_blocks[0], '', kwargs))
        for later_block in file_blocks[1:]:
            parts.append(do(bytes_read_csv)(later_block, header, kwargs))

    ensure_default_get(executor)

    result = from_imperative(parts, head) if collection else parts

    if lazy:
        return result
    if collection:
        return executor.persist(result)
    return executor.compute(result)
def read_csv(block_lists, header, head, kwargs, lazy=True, collection=True,
             executor=None):
    """ Build a dask.dataframe (or its raw pieces) from blocks of CSV bytes

    ``block_lists`` contains one list per logical file; each list holds the
    futures/values of bytes that concatenate, in order, to that file.

    Parameters
    ----------
    block_lists: list of lists of futures of bytes
        The bytestrings of each file, one list per file
    header: bytestring
        The header found at the front of the first file; passed along with
        every block that is not the first block of its file
    head: pd.DataFrame
        Sample Pandas DataFrame providing the metadata
    kwargs: dict
        Keyword arguments forwarded to ``pd.read_csv``
    lazy: boolean, optional (defaults to True)
    collection: boolean, optional (defaults to True)

    Returns
    -------
    A dask.dataframe, or list of futures or values, depending on the value
    of lazy and collection.
    """
    executor = default_executor(executor)

    pieces = []
    for blocks in block_lists:
        # The leading block already starts with the header, so it gets an
        # empty prefix; the remaining blocks get the header.
        leading, trailing = blocks[0], blocks[1:]
        pieces.append(do(bytes_read_csv)(leading, '', kwargs))
        pieces.extend([do(bytes_read_csv)(chunk, header, kwargs)
                       for chunk in trailing])

    ensure_default_get(executor)

    if collection:
        result = from_imperative(pieces, head)
        if not lazy:
            result = executor.persist(result)
    else:
        result = pieces
        if not lazy:
            result = executor.compute(result)
    return result
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """Load text files matching ``fn`` from HDFS as per-block lists of lines.

    Written as a generator-style coroutine: the non-lazy collection path
    yields on ``_futures_to_dask_bag`` and every path hands back its result
    by raising ``gen.Return`` (presumably ``tornado.gen`` -- confirm against
    the file's imports).

    Parameters
    ----------
    fn: string
        Glob pattern for ``hdfs.glob``; matching files are processed in
        sorted order.
    encoding, errors: string, optional
        Arguments for ``bytes.decode``, applied to each block.
    lineterminator: string, optional (defaults to a newline)
        Its encoded form delimits blocks; the string itself splits each
        decoded block into lines.
    executor: optional
        Resolved through ``default_executor``.
    hdfs: optional
        File system object; defaults to a new ``HDFileSystem()``.
    lazy: boolean, optional (defaults to True)
    collection: boolean, optional (defaults to True)

    Returns
    -------
    Via ``gen.Return``: a dask bag when ``collection`` is true, otherwise
    the list of lazy values (``lazy=True``) or the result of
    ``executor.compute`` (``lazy=False``).
    """
    from hdfs3 import HDFileSystem
    from dask import do

    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = sorted(hdfs.glob(fn))
    blocks = [
        block
        for path in filenames
        for block in read_bytes(
            path, executor, hdfs, lazy=True,
            delimiter=lineterminator.encode())
    ]

    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    if lazy:
        if collection:
            from dask.bag import from_imperative
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)
    else:
        futures = executor.compute(lines)
        if collection:
            from distributed.collections import _futures_to_dask_bag
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)