Example 1
def read_text(bucket_name,
              prefix='',
              path_delimiter='',
              encoding='utf-8',
              errors='strict',
              lineterminator='\n',
              executor=None,
              anon=None,
              collection=True,
              lazy=True,
              compression=None):
    """
    Read lines of text from S3

    Parameters
    ----------
    bucket_name: string
        Name of S3 bucket like ``'my-bucket'``
    prefix: string
        Prefix of key name to match like ``'/data/2016/'``
    path_delimiter: string (optional)
        Delimiter like ``'/'`` to define implicit S3 directory structure
    compression: {None, 'gzip'}

    Returns
    -------
    Dask bag if collection=True, otherwise futures or dask values
    """
    from dask import do
    import dask.bag as db
    executor = default_executor(executor)

    blocks = read_bytes(bucket_name,
                        prefix,
                        path_delimiter,
                        executor=executor,
                        lazy=True,
                        anon=anon)

    if compression:
        blocks = map(do(decompress[compression]), blocks)

    lists = [b.decode(encoding, errors).split(lineterminator) for b in blocks]

    if collection:
        ensure_default_get(executor)
        b = db.from_imperative(lists).filter(None)
        if lazy:
            return b
        else:
            return executor.persist(b)[0]
    else:
        if lazy:
            ensure_default_get(executor)
            return lists
        else:
            return executor.compute(lists)
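
A minimal usage sketch for this S3 variant, assuming a distributed executor is already registered as the default and that the bucket and prefix below exist; all names here are illustrative, not part of the example above.

# Hypothetical bucket and prefix; with lazy=True (the default) this returns
# an unevaluated dask bag backed by the executor.
b = read_text('my-bucket', prefix='data/2016/', path_delimiter='/')
n_lines = b.count().compute()  # reading and counting happen only here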
Example 2
def read_text(fn,
              encoding='utf-8',
              errors='strict',
              lineterminator='\n',
              executor=None,
              hdfs=None,
              lazy=True,
              collection=True):
    """ Read text lines from HDFS

    Parameters
    ----------
    fn: string
        filename or globstring of files on HDFS
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether to return lazily rather than start reading immediately

    Returns
    -------
    Dask bag (if collection=True), otherwise futures or dask values
    """
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    ensure_default_get(executor)

    filenames = sorted(hdfs.glob(fn))
    blocks = [
        block for fn in filenames for block in read_bytes(
            fn, executor, hdfs, lazy=True, delimiter=lineterminator.encode())
    ]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines).filter(None)
    else:
        result = lines

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
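
A minimal usage sketch for this HDFS variant, assuming a reachable namenode and files matching the globstring; the host, port, and path are illustrative assumptions.

from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='namenode', port=8020)  # assumed connection details
b = read_text('/data/logs/2016-*.log', hdfs=hdfs)  # lazy dask bag of lines
sample = b.take(3)  # pulls a few lines through the default executor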
Example 3
def read_text(bucket_name, prefix='', path_delimiter='', encoding='utf-8',
        errors='strict', lineterminator='\n', executor=None, anon=None,
        collection=True, lazy=True, compression=None):
    """
    Read lines of text from S3

    Parameters
    ----------
    bucket_name: string
        Name of S3 bucket like ``'my-bucket'``
    prefix: string
        Prefix of key name to match like ``'/data/2016/'``
    path_delimiter: string (optional)
        Delimiter like ``'/'`` to define implicit S3 directory structure
    compression: {None, 'gzip'}

    Returns
    -------
    Dask bag if collection=True, otherwise futures or dask values
    """
    from dask import do
    import dask.bag as db
    executor = default_executor(executor)

    blocks = read_bytes(bucket_name, prefix, path_delimiter, executor=executor,
                        lazy=True, anon=anon)

    if compression:
        blocks = map(do(decompress[compression]), blocks)

    lists = [b.decode(encoding, errors).split(lineterminator) for b in blocks]

    if collection:
        ensure_default_get(executor)
        b = db.from_imperative(lists).filter(None)
        if lazy:
            return b
        else:
            return executor.persist(b)[0]
    else:
        if lazy:
            ensure_default_get(executor)
            return lists
        else:
            return executor.compute(lists)
Example 4
def read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """ Read text lines from HDFS

    Parameters
    ----------
    fn: string
        filename or globstring of files on HDFS
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether to return lazily rather than start reading immediately

    Returns
    -------
    Dask bag (if collection=True), otherwise futures or dask values
    """
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    ensure_default_get(executor)

    filenames = sorted(hdfs.glob(fn))
    blocks = [block for fn in filenames
                    for block in read_bytes(fn, executor, hdfs, lazy=True,
                                            delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines).filter(None)
    else:
        result = lines

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
Example 5
def read_text(fn, keyname=None, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, fs=None, lazy=True, collection=True,
               blocksize=2**27, compression=None):
    """ Read text lines from S3

    Parameters
    ----------
    fn: string
        Path of files on S3, including bucket and key, or a globstring
    keyname: string, optional
        If path is only the bucket name, provide key name as second argument
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether to return lazily rather than start reading immediately
    blocksize: int, optional
        Number of bytes per partition.  Use ``None`` for no blocking.
        Silently ignored if data is compressed with a non-splittable format like gzip.
    lineterminator: str, optional
        The endline string used to delineate line breaks
    compression: str, optional
        Compression to use. Options include: gzip.
        The use of compression disables blocking (each file is read as one block).

    Examples
    --------

    Provide bucket and keyname joined by slash.
    >>> b = read_text('bucket/key-directory/')  # doctest: +SKIP

    Alternatively, globstrings are supported
    >>> b = read_text('bucket/key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Or separate bucket and keyname
    >>> b = read_text('bucket', 'key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Optionally provide blocksizes and delimiter to chunk up large files
    >>> b = read_text('bucket', 'key-directory/2015-*.json',
    ...               lineterminator='\\n', blocksize=2**25)  # doctest: +SKIP

    Specify compression; blocking is not allowed in this case
    >>> b = read_text('bucket/my-data.*.json.gz',
    ...               compression='gzip', blocksize=None)  # doctest: +SKIP

    Returns
    -------
    Dask bag if collection=True, otherwise futures or dask values
    """
    if keyname is not None:
        if not keyname.startswith('/'):
            keyname = '/' + keyname
        fn = fn + keyname
    fs = fs or S3FileSystem()
    executor = default_executor(executor)

    if compression:
        blocksize = None
        decompress = decompressors[compression]

    filenames = sorted(fs.glob(fn))
    blocks = [block for fn in filenames
                    for block in read_bytes(fn, executor, fs, lazy=True,
                                            delimiter=lineterminator.encode(),
                                            blocksize=blocksize)]
    if compression:
        blocks = [do(decompress)(b) for b in blocks]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    ensure_default_get(executor)
    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines)
    else:
        result = lines

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
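
This variant looks up a module-level decompressors mapping that is not part of the snippet; a minimal sketch of what such a mapping could look like, assuming gzip (the only option the docstring names) and whole-block decompression:

import zlib

# Hypothetical reconstruction of the decompressors table used above;
# 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
decompressors = {
    'gzip': lambda data: zlib.decompress(data, 16 + zlib.MAX_WBITS),
}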
Example 6
def read_text(fn,
              keyname=None,
              encoding='utf-8',
              errors='strict',
              lineterminator='\n',
              executor=None,
              fs=None,
              lazy=True,
              collection=True,
              blocksize=2**27,
              compression=None):
    """ Read text lines from S3

    Parameters
    ----------
    fn: string
        Path of files on S3, including bucket and key, or a globstring
    keyname: string, optional
        If path is only the bucket name, provide key name as second argument
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether to return lazily rather than start reading immediately
    blocksize: int, optional
        Number of bytes per partition.  Use ``None`` for no blocking.
        Silently ignored if data is compressed with a non-splittable format like gzip.
    lineterminator: str, optional
        The endline string used to delineate line breaks
    compression: str, optional
        Compression to use. Options include: gzip.
        The use of compression disables blocking (each file is read as one block).

    Examples
    --------

    Provide bucket and keyname joined by slash.
    >>> b = read_text('bucket/key-directory/')  # doctest: +SKIP

    Alternatively, globstrings are supported
    >>> b = read_text('bucket/key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Or separate bucket and keyname
    >>> b = read_text('bucket', 'key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Optionally provide blocksizes and delimiter to chunk up large files
    >>> b = read_text('bucket', 'key-directory/2015-*.json',
    ...               lineterminator='\\n', blocksize=2**25)  # doctest: +SKIP

    Specify compression; blocking is not allowed in this case
    >>> b = read_text('bucket/my-data.*.json.gz',
    ...               compression='gzip', blocksize=None)  # doctest: +SKIP

    Returns
    -------
    Dask bag if collection=True, otherwise futures or dask values
    """
    if keyname is not None:
        if not keyname.startswith('/'):
            keyname = '/' + keyname
        fn = fn + keyname
    fs = fs or S3FileSystem()
    executor = default_executor(executor)

    if compression:
        blocksize = None
        decompress = decompressors[compression]

    filenames = sorted(fs.glob(fn))
    blocks = [
        block for fn in filenames
        for block in read_bytes(fn,
                                executor,
                                fs,
                                lazy=True,
                                delimiter=lineterminator.encode(),
                                blocksize=blocksize)
    ]
    if compression:
        blocks = [do(decompress)(b) for b in blocks]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    ensure_default_get(executor)
    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines)
    else:
        result = lines

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result