Example #1
def test_from_imperative():
    import dask.dataframe as dd
    import pandas.util.testing as tm
    from dask import do

    # Four lazy DataFrames of increasing length (1 through 4 rows).
    dfs = [do(tm.makeTimeDataFrame)(i) for i in range(1, 5)]
    df = dd.from_imperative(dfs, columns=['A', 'B', 'C', 'D'])

    # The collection's metadata should agree with the computed result.
    assert (df.compute().columns == df.columns).all()
    assert list(df.map_partitions(len).compute()) == [1, 2, 3, 4]

    # The same construction works for a Series: pick one lazy column.
    ss = [df.A for df in dfs]
    s = dd.from_imperative(ss, columns='A')

    assert s.compute().name == s.name
    assert list(s.map_partitions(len).compute()) == [1, 2, 3, 4]
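For context, from_imperative and dask.do were the early names for what later became dd.from_delayed and dask.delayed. A minimal sketch of the same pattern against the modern API (the make_part helper here is hypothetical):

import pandas as pd
import dask
import dask.dataframe as dd

@dask.delayed
def make_part(n):
    # Hypothetical helper: build one pandas DataFrame per partition.
    return pd.DataFrame({'A': range(n), 'B': range(n)})

parts = [make_part(i) for i in range(1, 5)]
ddf = dd.from_delayed(parts)
assert list(ddf.map_partitions(len).compute()) == [1, 2, 3, 4]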
Example #2
from tornado import gen

@gen.coroutine
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
              header=True, names=None, **kwargs):
    # Tornado coroutine that reads CSV files from HDFS into a dask.dataframe.
    # default_executor, read_binary, and buffer_to_csv are helpers defined
    # elsewhere in this module.  NOTE: the @gen.coroutine decorator is assumed
    # from the yield/gen.Return usage.
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd

    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    # One list of lazy byte blocks per file, split on the line terminator.
    blockss = [read_binary(fn, executor, hdfs, lazy=True, delimiter=lineterminator)
               for fn in filenames]
    # Infer column names from the head of the first file when none are given.
    if names is None and header:
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    # Parse each block lazily; skip the header row in the first block of each file.
    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs) for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])  # flatten into one list of lazy partitions
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
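Because _read_csv is a Tornado coroutine (it uses yield and gen.Return), it has to be driven by the IOLoop rather than called directly. A minimal sketch of invoking it from another coroutine; the path is a placeholder and a running distributed executor is assumed:

from tornado import gen

@gen.coroutine
def load():
    # lazy=True hands back a dask.dataframe built with from_imperative;
    # lazy=False waits on concrete futures before assembling the frame.
    df = yield _read_csv('/tmp/data-*.csv', lazy=True)
    raise gen.Return(df)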
Example #3
from tornado import gen

@gen.coroutine
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    # Tornado coroutine that reads text files from HDFS into a dask.bag of
    # lines.  default_executor, read_bytes, and ensure_default_get are helpers
    # defined elsewhere in this module; unicode.split marks this as Python 2
    # code.  NOTE: the @gen.coroutine decorator is assumed from the
    # yield/gen.Return usage.
    from hdfs3 import HDFileSystem
    from dask import do

    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = sorted(hdfs.glob(fn))
    # One lazy byte block per chunk, split on the encoded line terminator.
    blocks = [block for fn in filenames
                    for block in read_bytes(fn, executor, hdfs, lazy=True,
                                            delimiter=lineterminator.encode())]
    # Decode each block and split it into lines, all lazily.
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)
    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)
Example #4
def read_csv(block_lists, header, head, kwargs, lazy=True, collection=True,
             executor=None):
    """ Convert blocks of bytes to a dask.dataframe or other high-level object

    This accepts a list of lists of futures/values of bytes, where each inner
    list corresponds to one file and its futures/values of bytes concatenate,
    in order, to comprise the entire file.

    Parameters
    ----------
    block_lists: list of lists of futures of bytes
        The lists of bytestrings, with each list corresponding to one logical file
    header: bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head: pd.DataFrame
        An example pandas DataFrame to be used for metadata
    kwargs: dict
        Keyword arguments to pass down to ``pd.read_csv``
    lazy: boolean, optional (defaults to True)
        Whether to return a lazy object rather than trigger computation
    collection: boolean, optional (defaults to True)
        Whether to wrap the partitions in a dask.dataframe

    Returns
    -------
    A dask.dataframe, or a list of futures or values, depending on the values
    of ``lazy`` and ``collection``.
    """
    from dask import do
    from dask.dataframe import from_imperative
    # default_executor, ensure_default_get, and bytes_read_csv are helpers
    # defined elsewhere in this module.
    executor = default_executor(executor)

    # Parse each block lazily; the leading block of each file needs no header
    # prepended, while every later block gets the shared header.
    dfs1 = [[do(bytes_read_csv)(blocks[0], '', kwargs)] +
            [do(bytes_read_csv)(b, header, kwargs) for b in blocks[1:]]
            for blocks in block_lists]
    dfs2 = sum(dfs1, [])  # flatten into one list of lazy partitions

    ensure_default_get(executor)

    if collection:
        result = from_imperative(dfs2, head)
    else:
        result = dfs2

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
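The docstring pins down the shape of block_lists: one inner list per file, whose blocks concatenate to the bytes of that file. A minimal sketch of a call with in-memory bytes instead of futures (the sample data is hypothetical, and a running distributed executor is assumed so that default_executor can find one):

import pandas as pd

header = b'a,b\n'
block_lists = [[b'a,b\n1,2\n', b'3,4\n'],  # file 1: leading block plus one more
               [b'a,b\n5,6\n']]            # file 2: a single block
head = pd.DataFrame({'a': [1], 'b': [2]})  # example frame used for metadata

df = read_csv(block_lists, header, head, kwargs={}, lazy=True)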