Example #1
0
def test_ensure():
    """Check that ensure_bytes/ensure_string coerce scalars and dict values."""
    # Both helpers accept either text or bytes input.
    for sample in ('', b''):
        assert isinstance(ensure_bytes(sample), bytes)
        assert isinstance(ensure_string(sample), unicode)
    # Dicts are converted value-by-value.
    mixed = {'x': b'', 'y': ''}
    assert ensure_string(mixed) == {'x': '', 'y': ''}
    assert ensure_bytes(mixed) == {'x': b'', 'y': b''}
Example #2
0
def test_ensure():
    """ensure_bytes/ensure_string should normalize strings, bytes, and dicts."""
    empty_text, empty_bytes = '', b''
    # Scalars: result type matches the requested representation.
    assert isinstance(ensure_bytes(empty_text), bytes)
    assert isinstance(ensure_bytes(empty_bytes), bytes)
    assert isinstance(ensure_string(empty_text), unicode)
    assert isinstance(ensure_string(empty_bytes), unicode)
    # Dicts: every value is converted, keys untouched.
    assert ensure_string({'x': b'', 'y': ''}) == {'x': '', 'y': ''}
    assert ensure_bytes({'x': b'', 'y': ''}) == {'x': b'', 'y': b''}
Example #3
0
def _read_csv(path,
              executor=None,
              hdfs=None,
              lazy=True,
              lineterminator='\n',
              header=True,
              names=None,
              collection=True,
              **kwargs):
    """Read CSV data from HDFS into dask dataframes or futures.

    Tornado-style coroutine: results are delivered via ``raise gen.Return``,
    not a plain ``return`` (presumably wrapped by ``@gen.coroutine`` outside
    this view — TODO confirm).

    Parameters
    ----------
    path : str
        HDFS path or glob, expanded via ``walk_glob``.
    executor : optional
        Resolved through ``default_executor``.
    hdfs : HDFileSystem, optional
        A fresh ``HDFileSystem()`` is created when omitted.
    lazy : bool
        If True return lazy dask values; otherwise compute to futures.
    lineterminator : str
        Record separator; also used (as bytes) to split files into blocks.
    header : bool
        When True and ``names`` is None, infer column names from the head
        of the first file.
    names : sequence, optional
        Explicit column names; suppresses header inference.
    collection : bool
        When True combine results into one dask dataframe.
    **kwargs
        Forwarded to ``pandas.read_csv``.
    """
    from hdfs3 import HDFileSystem
    from hdfs3.core import ensure_bytes
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator

    filenames = walk_glob(hdfs, path)
    # One list of byte-blocks per file, split on the line terminator so each
    # block starts at a record boundary.  NOTE(review): ``lazy=True`` is
    # hard-coded here regardless of this function's ``lazy`` flag — confirm
    # that is intentional.
    blockss = [
        read_bytes(fn,
                   executor,
                   hdfs,
                   lazy=True,
                   delimiter=ensure_bytes(lineterminator)) for fn in filenames
    ]
    if names is None and header:
        # Sample the first few rows of the first file to learn column names.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    # The first block of each file contains the header row, so skip one row
    # there; remaining blocks are parsed as header-less data.
    dfs1 = [
        [do(bytes_read_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
        [do(bytes_read_csv)(b, names=names, **kwargs) for b in blocks[1:]]
        for blocks in blockss
    ]
    # Flatten the per-file lists into one list of delayed frames.
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        if collection:
            ensure_default_get(executor)
            # NOTE(review): ``head`` is only bound when the header-sampling
            # branch above ran; with explicit ``names`` (or header=False)
            # this raises NameError — verify callers never hit that path.
            raise gen.Return(from_imperative(dfs2, head))
        else:
            raise gen.Return(dfs2)

    else:
        futures = executor.compute(dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        if collection:
            ensure_default_get(executor)
            df = yield _futures_to_dask_dataframe(futures)
            raise gen.Return(df)
        else:
            raise gen.Return(futures)
Example #4
0
def _read_csv(path, executor=None, hdfs=None, lazy=True, lineterminator='\n',
        header=True, names=None, collection=True, **kwargs):
    """Read CSV data from HDFS into dask dataframes or futures.

    Tornado-style coroutine: results come back via ``raise gen.Return``
    (presumably decorated with ``@gen.coroutine`` outside this view —
    TODO confirm).

    Parameters
    ----------
    path : str
        HDFS path or glob, expanded via ``walk_glob``.
    executor : optional
        Resolved through ``default_executor``.
    hdfs : HDFileSystem, optional
        A fresh ``HDFileSystem()`` is created when omitted.
    lazy : bool
        If True return lazy dask values; otherwise compute to futures.
    lineterminator : str
        Record separator; also used (as bytes) to split files into blocks.
    header : bool
        When True and ``names`` is None, infer column names from the head
        of the first file.
    names : sequence, optional
        Explicit column names; suppresses header inference.
    collection : bool
        When True combine results into one dask dataframe.
    **kwargs
        Forwarded to ``pandas.read_csv``.
    """
    from hdfs3 import HDFileSystem
    from hdfs3.core import ensure_bytes
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator

    filenames = walk_glob(hdfs, path)
    # One list of byte-blocks per file, split at the line terminator so each
    # block begins on a record boundary.  NOTE(review): ``lazy=True`` is
    # hard-coded regardless of this function's ``lazy`` flag — confirm
    # intentional.
    blockss = [read_bytes(fn, executor, hdfs, lazy=True,
                          delimiter=ensure_bytes(lineterminator))
               for fn in filenames]
    if names is None and header:
        # Sample the first few rows of the first file to learn column names.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    # First block of each file holds the header row: skip one row there;
    # all other blocks are parsed as header-less data.
    dfs1 = [[do(bytes_read_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(bytes_read_csv)(b, names=names, **kwargs) for b in blocks[1:]]
            for blocks in blockss]
    # Flatten per-file lists into one list of delayed frames.
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        if collection:
            ensure_default_get(executor)
            # NOTE(review): ``head`` is only bound when the header-sampling
            # branch ran; with explicit ``names`` (or header=False) this
            # raises NameError — verify callers never hit that path.
            raise gen.Return(from_imperative(dfs2, head))
        else:
            raise gen.Return(dfs2)

    else:
        futures = executor.compute(dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        if collection:
            ensure_default_get(executor)
            df = yield _futures_to_dask_dataframe(futures)
            raise gen.Return(df)
        else:
            raise gen.Return(futures)