Example #1
def _byte_block_counts(
    urlpath,
    blocksize,
    lineterminator=None,
    compression="infer",
    storage_options=None,
    **kwargs,
):
    """Return a list of paths and block counts.

    Logic copied from dask.bytes.read_bytes
    """

    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"

    if compression == "infer":
        paths = get_fs_token_paths(urlpath,
                                   mode="rb",
                                   storage_options=storage_options)[2]
        compression = infer_compression(paths[0])

    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        blocksize = None

    b_out = read_bytes(
        urlpath,
        delimiter=lineterminator.encode(),
        blocksize=blocksize,
        sample=False,
        compression=compression,
        include_path=True,
        **(storage_options or {}),
    )
    _, values, paths = b_out

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    return paths, [len(v) for v in values]
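
A minimal usage sketch of the same idea, under the assumption of local CSV files matched by a hypothetical glob: ``dask.bytes.read_bytes`` with ``include_path=True`` returns one list of delayed byte blocks per matched file, so the block counts are just the lengths of those lists.

from dask.bytes import read_bytes

# sample=False skips header sampling; include_path=True also returns the paths.
_, values, paths = read_bytes(
    "data/*.csv",          # hypothetical glob; any local or remote urlpath works
    delimiter=b"\n",
    blocksize="16MiB",
    sample=False,
    include_path=True,
)

# One inner list of delayed blocks per matched file.
block_counts = dict(zip(paths, (len(v) for v in values)))
print(block_counts)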
Example #2
def read_json(url_path,
              orient="records",
              lines=None,
              storage_options=None,
              blocksize=None,
              sample=2**20,
              encoding="utf-8",
              errors="strict",
              compression="infer",
              meta=None,
              engine=pd.read_json,
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, see ``bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : function object, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd

    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError("Line-delimited JSON is only available with"
                         'orient="records".')
    if blocksize and (orient != "records" or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines"
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path,
                                   b"\n",
                                   blocksize=blocksize,
                                   sample=sample,
                                   compression=compression,
                                   **storage_options)
        chunks = list(dask.core.flatten(chunks))
        if meta is None:
            meta = read_json_chunk(first, encoding, errors, engine, kwargs)
        meta = make_meta(meta)
        parts = [
            dask.delayed(read_json_chunk)(chunk,
                                          encoding,
                                          errors,
                                          engine,
                                          kwargs,
                                          meta=meta) for chunk in chunks
        ]
        return dd.from_delayed(parts, meta=meta)
    else:
        files = open_files(url_path,
                           "rt",
                           encoding=encoding,
                           errors=errors,
                           compression=compression,
                           **storage_options)
        parts = [
            dask.delayed(read_json_file)(f, orient, lines, engine, kwargs)
            for f in files
        ]
        return dd.from_delayed(parts, meta=meta)
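
A short usage sketch (hypothetical file paths): chunked reading is only legal for JSON-lines input, so anything other than orient='records' with lines=True must keep blocksize=None and gets one partition per input file.

import dask.dataframe as dd

# JSON-lines data: split into ~64 MiB byte blocks cut at newlines.
ddf = dd.read_json("data/records-*.jsonl", blocksize=2**26)

# Non-record orient: no chunking allowed, one partition per input file.
ddf_split = dd.read_json("data/table-*.json", orient="split", lines=False)

print(ddf.npartitions, ddf_split.npartitions)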
Example #3
def read_text(
    urlpath,
    blocksize=None,
    compression="infer",
    encoding=system_encoding,
    errors="strict",
    linedelimiter=None,
    collection=True,
    storage_options=None,
    files_per_partition=None,
    include_path=False,
):
    """Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files.  Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a string
        like "128MiB"
    compression: string
        Compression format like 'gzip' or 'xz'.  Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string or None
    collection: bool, optional
        Return a dask.bag.Bag if True, or a list of delayed values if False
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.
    include_path: bool
        Whether or not to include the path in the bag.
        If true, elements are tuples of (line, path).
        Default is False.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP

    Get file paths of the bag by setting include_path=True

    >>> b = read_text('myfiles.*.txt', include_path=True) # doctest: +SKIP
    >>> b.take(1) # doctest: +SKIP
    (('first line of the first file', '/home/dask/myfiles.0.txt'),)

    Returns
    -------
    dask.bag.Bag or list
        dask.bag.Bag if collection is True or list of Delayed lists otherwise.

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError(
            "Only one of blocksize or files_per_partition can be set")
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)

    if blocksize is None:
        if linedelimiter in [None, "", "\n", "\r", "\r\n"]:
            newline = linedelimiter
            linedelimiter = None
        else:
            newline = ""
        files = open_files(
            urlpath,
            mode="rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            newline=newline,
            **(storage_options or {}),
        )
        if files_per_partition is None:
            blocks = [
                delayed(list)(delayed(
                    partial(file_to_blocks,
                            include_path,
                            delimiter=linedelimiter))(fil)) for fil in files
            ]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start:(start + files_per_partition)]
                block_lines = delayed(concat)(delayed(map)(
                    partial(file_to_blocks,
                            include_path,
                            delimiter=linedelimiter),
                    block_files,
                ))
                blocks.append(block_lines)
    else:
        # Special case for linedelimiter=None: we need to split on an actual
        # bytestring, and the line reader will then use "universal" newline
        # mode, so both \r\n and \n are handled (bare \r endings from classic
        # Mac OS are no longer a concern).
        o = read_bytes(
            urlpath,
            delimiter=linedelimiter.encode()
            if linedelimiter is not None else b"\n",
            blocksize=blocksize,
            sample=False,
            compression=compression,
            include_path=include_path,
            **(storage_options or {}),
        )
        raw_blocks = o[1]
        blocks = [
            delayed(decode)(b, encoding, errors, linedelimiter)
            for b in concat(raw_blocks)
        ]
        if include_path:
            paths = list(
                concat([[path] * len(raw_blocks[i])
                        for i, path in enumerate(o[2])]))
            blocks = [
                delayed(attach_path)(entry, path)
                for entry, path in zip(blocks, paths)
            ]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if collection:
        blocks = from_delayed(blocks)

    return blocks
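
A usage sketch of the two partitioning modes above (hypothetical paths); blocksize and files_per_partition are mutually exclusive, so pick one strategy.

import dask.bag as db

# Split a single large file into ~10 MB byte ranges, cut at line boundaries.
lines = db.read_text("logs/big.log", blocksize="10MB")

# Or group several small files per partition and keep the originating path.
tagged = db.read_text("logs/part-*.log", files_per_partition=4, include_path=True)

print(lines.count().compute())
print(tagged.take(1))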
Example #4
def read_pandas(
    reader,
    urlpath,
    blocksize="default",
    lineterminator=None,
    compression="infer",
    sample=256000,
    sample_rows=10,
    enforce=False,
    assume_missing=False,
    storage_options=None,
    include_path_column=False,
    **kwargs,
):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"
    if include_path_column and isinstance(include_path_column, bool):
        include_path_column = "path"
    if "index" in kwargs or "index_col" in kwargs:
        raise ValueError(
            "Keywords 'index' and 'index_col' not supported. "
            f"Use dd.{reader_name}(...).set_index('my-index') instead")
    for kw in ["iterator", "chunksize"]:
        if kw in kwargs:
            raise ValueError(f"{kw} not supported for dd.{reader_name}")
    if kwargs.get("nrows", None):
        raise ValueError("The 'nrows' keyword is not supported by "
                         "`dd.{0}`. To achieve the same behavior, it's "
                         "recommended to use `dd.{0}(...)."
                         "head(n=nrows)`".format(reader_name))
    if isinstance(kwargs.get("skiprows"), int):
        skiprows = lastskiprow = firstrow = kwargs.get("skiprows")
    elif kwargs.get("skiprows") is None:
        skiprows = lastskiprow = firstrow = 0
    else:
        # When skiprows is a list, we expect more than max(skiprows) to
        # be included in the sample. This means that [0,2] will work well,
        # but [0, 440] might not work.
        skiprows = set(kwargs.get("skiprows"))
        lastskiprow = max(skiprows)
        # find the firstrow that is not skipped, for use as header
        firstrow = min(set(range(len(skiprows) + 1)) - set(skiprows))
    if isinstance(kwargs.get("header"), list):
        raise TypeError(
            f"List of header rows not supported for dd.{reader_name}")
    if isinstance(kwargs.get("converters"), dict) and include_path_column:
        path_converter = kwargs.get("converters").get(include_path_column,
                                                      None)
    else:
        path_converter = None

    # If compression is "infer", inspect the (first) path suffix and
    # set the proper compression option if the suffix is recognized.
    if compression == "infer":
        # Translate the input urlpath to a simple path list
        paths = get_fs_token_paths(urlpath,
                                   mode="rb",
                                   storage_options=storage_options)[2]

        # Check for at least one valid path
        if len(paths) == 0:
            raise OSError(f"{urlpath} resolved to no files")

        # Infer compression from first path
        compression = infer_compression(paths[0])

    if blocksize == "default":
        blocksize = AUTO_BLOCKSIZE
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        # NONE of the compressions should use chunking
        warn("Warning %s compression does not support breaking apart files\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``blocksize=None to remove this message``\n"
             "Setting ``blocksize=None``" % compression)
        blocksize = None
    if compression not in compr:
        raise NotImplementedError("Compression format %s not installed" %
                                  compression)
    if blocksize and sample and blocksize < sample and lastskiprow != 0:
        warn("Unexpected behavior can result from passing skiprows when\n"
             "blocksize is smaller than sample size.\n"
             "Setting ``sample=blocksize``")
        sample = blocksize
    b_lineterminator = lineterminator.encode()
    b_out = read_bytes(
        urlpath,
        delimiter=b_lineterminator,
        blocksize=blocksize,
        sample=sample,
        compression=compression,
        include_path=include_path_column,
        **(storage_options or {}),
    )

    if include_path_column:
        b_sample, values, paths = b_out
        path = (include_path_column, path_converter)
    else:
        b_sample, values = b_out
        path = None

    if not isinstance(values[0], (tuple, list)):
        values = [values]
    # If we have not sampled, then use the first row of the first values
    # as a representative sample.
    if b_sample is False and len(values[0]):
        b_sample = values[0][0].compute()

    # Get header row, and check that sample is long enough. If the file
    # contains a header row, we need at least 2 nonempty rows + the number of
    # rows to skip.
    names = kwargs.get("names", None)
    header = kwargs.get("header", "infer" if names is None else None)
    need = 1 if header is None else 2

    if kwargs.get("comment"):
        # if comment is provided, step through lines of b_sample and strip out comments
        parts = []
        for part in b_sample.split(b_lineterminator):
            split_comment = part.decode().split(kwargs.get("comment"))
            if len(split_comment) > 1:
                # if line starts with comment, don't include that line in parts.
                if len(split_comment[0]) > 0:
                    parts.append(split_comment[0].strip().encode())
            else:
                parts.append(part)
            if len(parts) > need:
                break
    else:
        parts = b_sample.split(b_lineterminator, lastskiprow + need)

    # If the last partition is empty, don't count it
    nparts = 0 if not parts else len(parts) - int(not parts[-1])

    if sample is not False and nparts < lastskiprow + need and len(
            b_sample) >= sample:
        raise ValueError("Sample is not large enough to include at least one "
                         "row of data. Please increase the number of bytes "
                         "in `sample` in the call to `read_csv`/`read_table`")

    if isinstance(header, int):
        firstrow += header
    header = b"" if header is None else parts[firstrow] + b_lineterminator

    # Use sample to infer dtypes and check for presence of include_path_column
    head_kwargs = kwargs.copy()
    head_kwargs.pop("skipfooter", None)
    try:
        head = reader(BytesIO(b_sample), nrows=sample_rows, **head_kwargs)
    except pd.errors.ParserError as e:
        if "EOF" in str(e):
            raise ValueError(
                "EOF encountered while reading header. \n"
                "Pass argument `sample_rows` and make sure the value of `sample` "
                "is large enough to accommodate that many rows of data") from e
        raise
    if include_path_column and (include_path_column in head.columns):
        raise ValueError("Files already contain the column name: %s, so the "
                         "path column cannot use this name. Please set "
                         "`include_path_column` to a unique name." %
                         include_path_column)

    specified_dtypes = kwargs.get("dtype", {})
    if specified_dtypes is None:
        specified_dtypes = {}
    # If specified_dtypes is a single type, then all columns were specified
    if assume_missing and isinstance(specified_dtypes, dict):
        # Convert all non-specified integer columns to floats
        for c in head.columns:
            if is_integer_dtype(head[c].dtype) and c not in specified_dtypes:
                head[c] = head[c].astype(float)

    values = [[list(dsk.dask.values()) for dsk in block] for block in values]

    return text_blocks_to_pandas(
        reader,
        values,
        header,
        head,
        kwargs,
        enforce=enforce,
        specified_dtypes=specified_dtypes,
        path=path,
        blocksize=blocksize,
        urlpath=urlpath,
    )
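
read_pandas is the shared helper that dd.read_csv and dd.read_table delegate to; a sketch of how it is normally reached through the public entry point (hypothetical paths):

import dask.dataframe as dd

# Plain-text CSVs can be split into ~64 MiB blocks at line boundaries.
# assume_missing=True reads otherwise-integer columns as floats, so blocks
# that contain missing values do not change the inferred dtype.
ddf = dd.read_csv(
    "data/events-*.csv",
    blocksize="64MiB",
    assume_missing=True,
    include_path_column="source_file",
)

# Compressed files cannot be split, so blocksize must be None
# (one partition per file), mirroring the warning branch above.
ddf_gz = dd.read_csv("data/archive-*.csv.gz", blocksize=None, compression="gzip")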
Example #5
def read_json(
    url_path,
    orient="records",
    lines=None,
    storage_options=None,
    blocksize=None,
    sample=2**20,
    encoding="utf-8",
    errors="strict",
    compression="infer",
    meta=None,
    engine=pd.read_json,
    include_path_column=False,
    path_converter=None,
    **kwargs,
):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, see ``bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : function object, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
    include_path_column : bool or str, optional
        Include a column with the file path where each row in the dataframe
        originated. If ``True``, a new column is added to the dataframe called
        ``path``. If ``str``, sets new column name. Default is ``False``.
    path_converter : function or None, optional
        A function that takes one argument and returns a string. Used to convert
        paths in the ``path`` column, for instance, to strip a common prefix from
        all the paths.
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError(
            'Line-delimited JSON is only available with orient="records".'
        )
    if blocksize and (orient != "records" or not lines):
        raise ValueError(
            "JSON file chunking only allowed for JSON-lines"
            "input (orient='records', lines=True)."
        )
    storage_options = storage_options or {}
    if include_path_column is True:
        include_path_column = "path"

    if path_converter is None:
        path_converter = lambda x: x

    if blocksize:
        b_out = read_bytes(
            url_path,
            b"\n",
            blocksize=blocksize,
            sample=sample,
            compression=compression,
            include_path=include_path_column,
            **storage_options,
        )
        if include_path_column:
            first, chunks, paths = b_out
            first_path = path_converter(paths[0])
            path_dtype = pd.CategoricalDtype(path_converter(p) for p in paths)
            flat_paths = flatten(
                [path_converter(p)] * len(chunk) for p, chunk in zip(paths, chunks)
            )
        else:
            first, chunks = b_out
            first_path = None
            flat_paths = (None,)
            path_dtype = None

        flat_chunks = flatten(chunks)
        if meta is None:
            meta = read_json_chunk(
                first,
                encoding,
                errors,
                engine,
                include_path_column,
                first_path,
                path_dtype,
                kwargs,
            )
        meta = make_meta(meta)
        parts = [
            delayed(read_json_chunk)(
                chunk,
                encoding,
                errors,
                engine,
                include_path_column,
                path,
                path_dtype,
                kwargs,
                meta=meta,
            )
            for chunk, path in zip_longest(flat_chunks, flat_paths)
        ]
    else:
        files = open_files(
            url_path,
            "rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            **storage_options,
        )
        path_dtype = pd.CategoricalDtype(path_converter(f.path) for f in files)
        parts = [
            delayed(read_json_file)(
                f,
                orient,
                lines,
                engine,
                include_path_column,
                path_converter(f.path),
                path_dtype,
                kwargs,
            )
            for f in files
        ]

    return from_delayed(parts, meta=meta)
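
A usage sketch of the path-tracking options this version adds (hypothetical bucket and paths):

import os
import dask.dataframe as dd

# Record the originating file for every row and keep only the file name.
ddf = dd.read_json(
    "s3://bucket/events/*.jsonl",     # hypothetical location
    blocksize=2**26,
    include_path_column="source",
    path_converter=os.path.basename,
)
print(ddf.columns)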
Example #6
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.


    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, see ``bytes.decode()``

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines"
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]

    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)
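
A conceptual sketch of the meta trick used in the blocksize branch above: parse the sampled bytes with pandas, then keep only the empty head so just the column structure (names and dtypes) remains.

import io
import pandas as pd

# Stand-in for the `first` sample returned by read_bytes.
sample_bytes = b'{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n'

frame = pd.read_json(io.BytesIO(sample_bytes), orient="records", lines=True)
meta = frame[:0]     # empty frame with the right columns and dtypes
print(meta.dtypes)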