def test_ensure():
    """ensure_bytes/ensure_string coerce scalars and dict values correctly."""
    # Both the text and the bytes flavour of an empty string must coerce.
    for sample in ('', b''):
        assert isinstance(ensure_bytes(sample), bytes)
        assert isinstance(ensure_string(sample), unicode)
    # Dicts are converted value-by-value, keys untouched.
    mixed = {'x': b'', 'y': ''}
    assert ensure_string(mixed) == {'x': '', 'y': ''}
    assert ensure_bytes(mixed) == {'x': b'', 'y': b''}
def _read_csv(path, executor=None, hdfs=None, lazy=True, lineterminator='\n',
              header=True, names=None, collection=True, **kwargs):
    """Read CSV files on HDFS into dask dataframes (coroutine body).

    Parameters
    ----------
    path : string
        Glob-style path on HDFS; expanded via ``walk_glob``.
    executor : Executor, optional
        Defaults to the result of ``default_executor()``.
    hdfs : HDFileSystem, optional
        Created on demand when not supplied.
    lazy : bool
        When True return dask (imperative) values; otherwise futures.
    header : bool
        Whether the first line of each file is a header row.
    names : sequence, optional
        Column names; inferred from a 5-row sample when absent and
        ``header`` is True.
    collection : bool
        When True wrap the pieces into a single dask DataFrame.
    **kwargs
        Passed through to ``pandas.read_csv``.

    Returns (via ``gen.Return``) a dask DataFrame, a list of dask values,
    or a list of futures depending on ``lazy``/``collection``.
    """
    from hdfs3 import HDFileSystem
    from hdfs3.core import ensure_bytes
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = walk_glob(hdfs, path)
    # One list of byte-blocks per file, split on line boundaries so no
    # CSV record straddles two blocks.
    blockss = [read_bytes(fn, executor, hdfs, lazy=True,
                          delimiter=ensure_bytes(lineterminator))
               for fn in filenames]
    head = None
    if names is None and header:
        # Sample a few rows from the first file to learn the columns.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns
    # BUGFIX: only skip the first row of each file's first block when that
    # row actually is a header line; the previous unconditional skiprows=1
    # silently dropped one data row per file when header=False.
    first_skiprows = 1 if header else 0
    dfs1 = [[do(bytes_read_csv)(blocks[0], names=names,
                                skiprows=first_skiprows, **kwargs)] +
            [do(bytes_read_csv)(b, names=names, **kwargs)
             for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        if collection:
            ensure_default_get(executor)
            if head is None:
                # BUGFIX: `head` was previously unbound here whenever names
                # were given explicitly or header=False (NameError).  Read a
                # small sample now so from_imperative gets its metadata.
                with hdfs.open(filenames[0]) as f:
                    head = pd.read_csv(f, nrows=5, names=names,
                                       skiprows=first_skiprows, **kwargs)
            raise gen.Return(from_imperative(dfs2, head))
        else:
            raise gen.Return(dfs2)
    else:
        futures = executor.compute(dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        if collection:
            ensure_default_get(executor)
            df = yield _futures_to_dask_dataframe(futures)
            raise gen.Return(df)
        else:
            raise gen.Return(futures)
def _read_csv(path, executor=None, hdfs=None, lazy=True, lineterminator='\n',
              header=True, names=None, collection=True, **kwargs):
    """Read CSV files on HDFS into dask dataframes.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    ``_read_csv`` definition in this file; being defined second, it shadows
    the first.  One of the two should be removed — confirm which is current.

    NOTE(review): body uses ``yield`` / ``raise gen.Return`` — presumably a
    tornado ``@gen.coroutine``; the decorator is not visible here, confirm
    it is applied at the definition site.

    Splits each matched file into line-aligned byte blocks, parses each
    block with ``bytes_read_csv``, and returns (via ``gen.Return``) either
    a dask DataFrame, a list of dask values, or a list of futures depending
    on ``lazy`` and ``collection``.  ``**kwargs`` pass through to
    ``pandas.read_csv``.
    """
    from hdfs3 import HDFileSystem
    from hdfs3.core import ensure_bytes
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = walk_glob(hdfs, path)
    # One list of byte-blocks per file; delimiting on the line terminator
    # keeps CSV records whole within a block.
    blockss = [read_bytes(fn, executor, hdfs, lazy=True,
                          delimiter=ensure_bytes(lineterminator))
               for fn in filenames]
    if names is None and header:
        # Sample the first file to infer column names.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns
    # NOTE(review): skiprows=1 is applied to each file's first block even
    # when header=False, which would drop one data row per file — confirm.
    dfs1 = [[do(bytes_read_csv)(blocks[0], names=names, skiprows=1,
                                **kwargs)] +
            [do(bytes_read_csv)(b, names=names, **kwargs)
             for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        if collection:
            ensure_default_get(executor)
            # NOTE(review): `head` is only bound when names is None and
            # header is truthy; this line raises NameError otherwise.
            raise gen.Return(from_imperative(dfs2, head))
        else:
            raise gen.Return(dfs2)
    else:
        futures = executor.compute(dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        if collection:
            ensure_default_get(executor)
            # Block (coroutine-style) until the distributed futures are
            # assembled into a single dask DataFrame.
            df = yield _futures_to_dask_dataframe(futures)
            raise gen.Return(df)
        else:
            raise gen.Return(futures)