Example #1
 def ls(self,
        path: str,
        detail: bool = False,
        invalidate_cache: bool = True,
        **kwargs):
     """ Create a list of blob names from a blob container
     
     Parameters
     ----------
     path:  Path to an Azure Blob directory
     detail:  If False, return a list of blob names, else a list of dictionaries with blob details
     invalidate_cache:  Whether to invalidate any cached listing before listing blobs
     """
     logging.debug("Running abfs.ls() method")
     path = stringify_path(path)
     blobs = self.blob_fs.list_blobs(container_name=self.container_name,
                                     prefix=path)
     if detail is False:
         pathlist = [blob.name for blob in blobs]
         logging.debug(f"Detail is False.  Returning {pathlist}")
         return pathlist
     else:
         pathlist = []
         for blob in blobs:
             data = {}
             data["name"] = blob.name
             data["size"] = blob.properties.content_length
             data["container_name"] = self.container_name
             if blob.properties.content_settings.content_type is not None:
                 data["type"] = "file"
             else:
                 data["type"] = "directory"
             pathlist.append(data)
         logging.debug(f"Detail is True:  Returning {pathlist}")
         return pathlist
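For illustration, here is a self-contained sketch of the detail=True output shape produced above; SimpleNamespace objects stand in for the Azure SDK blob objects, and all names and sizes are made up.

from types import SimpleNamespace

# Hypothetical stand-ins for Azure SDK blob objects
blobs = [
    SimpleNamespace(
        name="data/part.0.parquet",
        properties=SimpleNamespace(
            content_length=1024,
            content_settings=SimpleNamespace(content_type="application/octet-stream"),
        ),
    ),
    SimpleNamespace(
        name="data/",
        properties=SimpleNamespace(
            content_length=0,
            content_settings=SimpleNamespace(content_type=None),
        ),
    ),
]

pathlist = []
for blob in blobs:
    pathlist.append({
        "name": blob.name,
        "size": blob.properties.content_length,
        "container_name": "my-container",  # hypothetical container name
        "type": "file" if blob.properties.content_settings.content_type else "directory",
    })
print(pathlist)  # first entry is typed "file", second "directory"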
Example #2
    def _strip_protocol(cls, path: Union[str, List[str]]):
        """Turn path from fully-qualified to file-system-specifi
        Parameters
        ----------
        path : string
            Input path, like
            `http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject`
            `oss://mybucket/myobject`
        Examples
        --------
        >>> _strip_protocol(
        ...     "http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject"
        ... )
        '/mybucket/myobject'
        >>> _strip_protocol(
        ...     "oss://mybucket/myobject"
        ... )
        '/mybucket/myobject'
        """
        if isinstance(path, list):
            return [cls._strip_protocol(p) for p in path]
        path = stringify_path(path)
        if path.startswith("oss://"):
            path = path[5:]  # strip "oss:/" but keep the leading "/"

        parser_re = r"https?://(?P<endpoint>oss.+aliyuncs\.com)(?P<path>/.+)"
        matcher = re.compile(parser_re).match(path)
        if matcher:
            path = matcher["path"]
        return path or cls.root_marker
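The endpoint/path split performed by the regular expression above can be checked in isolation; the bucket and object names below are made up.

import re

parser_re = r"https?://(?P<endpoint>oss.+aliyuncs\.com)(?P<path>/.+)"
matcher = re.compile(parser_re).match(
    "http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject")
print(matcher["endpoint"])  # oss-cn-hangzhou.aliyuncs.com
print(matcher["path"])      # /mybucket/myobject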
Example #3
def test_stringify_path():
    test_filepath = os.path.join("path", "to", "file.txt")

    # Pathlib.path
    path = pathlib.Path(test_filepath)
    assert stringify_path(path) == test_filepath

    # fspath protocol
    class CustomFSPath:
        """For testing fspath on unknown objects"""
        def __init__(self, path):
            self.path = path

        def __fspath__(self):
            return self.path

    path = CustomFSPath(test_filepath)
    assert stringify_path(path) == test_filepath

    # Non path-like input is unaffected
    path = (1, 2, 3)
    assert stringify_path(path) is path
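For reference, a minimal stringify_path consistent with the test above could look like the sketch below; it illustrates the contract being tested, not necessarily the library's exact implementation.

import pathlib

def stringify_path(filepath):
    """Best-effort conversion of path-like objects to str (sketch)."""
    if isinstance(filepath, str):
        return filepath
    if hasattr(filepath, "__fspath__"):
        return filepath.__fspath__()
    if isinstance(filepath, pathlib.PurePath):
        return str(filepath)
    # Anything else (tuples, buffers, ...) passes through unchanged
    return filepath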
Example #4
 def _strip_protocol(cls, path):
     if isinstance(path, list):
         return [cls._strip_protocol(p) for p in path]
     path = stringify_path(path)
     protos = (cls.protocol, ) if isinstance(cls.protocol,
                                             str) else cls.protocol
     for protocol in protos:
         if path.startswith(protocol + "://"):
             path = path[len(protocol) + 3:]
         elif path.startswith(protocol + "::"):
             path = path[len(protocol) + 2:]
     # use of root_marker to make minimum required path, e.g., "/"
     return path or cls.root_marker
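A quick check of the prefix arithmetic used above; the protocol names are arbitrary examples.

# len(protocol) + 3 skips "proto://"; len(protocol) + 2 skips the chained "proto::" form
path = "s3://bucket/key.csv"
print(path[len("s3") + 3:])    # bucket/key.csv

path = "dask::s3://bucket/key.csv"
print(path[len("dask") + 2:])  # s3://bucket/key.csv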
Example #5
    def __init__(
        self,
        paths,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        **kwargs,
    ):
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn("Using very large partitions sizes for Dask. "
                              "Memory-related errors are likely.")
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]

        storage_options = storage_options or {}
        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths,
                    part_size,
                    storage_options=storage_options,
                    **kwargs)
            elif engine == "csv":
                self.engine = CSVDatasetEngine(paths,
                                               part_size,
                                               storage_options=storage_options,
                                               **kwargs)
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths,
                                 part_size,
                                 storage_options=storage_options)
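The partition-size resolution above can be exercised on its own. device_mem_size is an NVTabular helper, so the sketch below hard-codes a hypothetical 16 GiB total instead; only dask's parse_bytes is assumed.

from dask.utils import parse_bytes

def resolve_part_size(part_size=None, part_mem_fraction=None,
                      total_device_mem=16 * 1024 ** 3):  # assumed 16 GiB device
    if part_size:
        # A specific partition size wins
        return parse_bytes(part_size)
    # Otherwise fall back to a fraction of the (assumed) device memory
    part_mem_fraction = part_mem_fraction or 0.125
    assert 0.0 < part_mem_fraction < 1.0
    return int(total_device_mem * part_mem_fraction)

print(resolve_part_size("256 MiB"))                # 268435456
print(resolve_part_size(part_mem_fraction=0.125))  # 2147483648 (2 GiB)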
Example #6
 def _strip_protocol(cls, path):
     """ Turn path from fully-qualified to file-system-specific
     May require FS-specific handling, e.g., for relative paths or links.
     """
     path = stringify_path(path)
     protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol # NOQA
     for protocol in protos:
         path = path.rstrip("/")
         if path.startswith(protocol + "://"):
             path = path[len(protocol) + 3:]
         elif path.startswith(protocol + ":"):
             path = path[len(protocol) + 1:]
     # use of root_marker to make minimum required path, e.g., "/"
     if not path.startswith("/"):
         path = f"/{path}"
     return path or cls.root_marker
Example #7
    def __init__(
        self,
        path,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        **kwargs,
    ):
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn("Using very large partitions sizes for Dask. "
                              "Memory-related errors are likely.")
            part_size = int(cuda.current_context().get_memory_info()[1] *
                            part_mem_fraction)

        # Engine-agnostic path handling
        if hasattr(path, "name"):
            path = stringify_path(path)
        storage_options = storage_options or {}
        fs, fs_token, paths = get_fs_token_paths(
            path, mode="rb", storage_options=storage_options)
        paths = sorted(paths, key=natural_sort_key)

        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(paths, part_size, fs,
                                                   fs_token, **kwargs)
            elif engine == "csv":
                self.engine = CSVDatasetEngine(paths, part_size, fs, fs_token,
                                               **kwargs)
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths, part_size, fs, fs_token, **kwargs)
Example #8
 def _strip_protocol(cls, path):
     path = stringify_path(path)
     if path.startswith("file://"):
         path = path[7:]
     path = os.path.expanduser(path)
     return make_path_posix(path)
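The same steps can be traced with a concrete local path. This sketch assumes fsspec is installed (make_path_posix lives in fsspec.implementations.local); the file path is made up and the expanded home directory depends on the current user.

import os
from fsspec.implementations.local import make_path_posix
from fsspec.utils import stringify_path

p = stringify_path("file://~/datasets/train.csv")  # hypothetical local file URL
if p.startswith("file://"):
    p = p[7:]                   # "~/datasets/train.csv"
p = os.path.expanduser(p)       # e.g. "/home/<user>/datasets/train.csv" on Linux
print(make_path_posix(p))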
Example #9
def read_parquet(
    path,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    storage_options=None,
    engine="auto",
    gather_statistics=None,
    split_row_groups=None,
    read_from_paths=None,
    chunksize=None,
    aggregate_files=None,
    **kwargs,
):
    """
    Read a Parquet file into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : str or list
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or a
        list of paths, with the caveat that they must all have the same
        protocol.
    columns : str or list, default None
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list to
        read in the data as a Series.
    filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None
        List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``.
        Using this argument will NOT result in row-wise filtering of the final
        partitions unless ``engine="pyarrow-dataset"`` is also specified.  For
        other engines, filtering is only performed at the partition level, i.e.,
        to prevent the loading of some row-groups and/or files.

        For the "pyarrow" engines, predicates can be expressed in disjunctive
        normal form (DNF). This means that the innermost tuple describes a single
        column predicate. These inner predicates are combined with an AND
        conjunction into a larger predicate. The outer-most list then combines all
        of the combined filters with an OR disjunction.

        Predicates can also be expressed as a List[Tuple]. These are evaluated
        as an AND conjunction. To express OR in predicates, one must use the
        (preferred for "pyarrow") List[List[Tuple]] notation.

        Note that the "fastparquet" engine does not currently support DNF for
        the filtering of partitioned columns (List[Tuple] is required).
    index : str, list or False, default None
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list or dict, default None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : str, default 'auto'
        Parquet reader library to use. Options include: 'auto', 'fastparquet',
        'pyarrow', 'pyarrow-dataset', and 'pyarrow-legacy'. Defaults to 'auto',
        which selects the FastParquetEngine if fastparquet is installed (and
        ArrowDatasetEngine otherwise).  If 'pyarrow' or 'pyarrow-dataset' is
        specified, the ArrowDatasetEngine (which leverages the pyarrow.dataset
        API) will be used. If 'pyarrow-legacy' is specified, ArrowLegacyEngine
        will be used (which leverages the pyarrow.parquet.ParquetDataset API).
        NOTE: The 'pyarrow-legacy' option (ArrowLegacyEngine) is deprecated
        for pyarrow>=5.
    gather_statistics : bool, default None
        Gather the statistics for each dataset partition. By default,
        this will only be done if the _metadata file is available. Otherwise,
        statistics will only be gathered if True, because the footer of
        every file will be parsed (which is very slow on some systems).
    split_row_groups : bool or int, default None
        Default is True if a _metadata file is available or if
        the dataset is composed of a single file (otherwise default is False).
        If True, then each output dataframe partition will correspond to a single
        parquet-file row-group. If False, each partition will correspond to a
        complete file.  If a positive integer value is given, each dataframe
        partition will correspond to that number of parquet row-groups (or fewer).
        Only the "pyarrow" engine supports this argument.
    read_from_paths : bool, default None
        Only used by ``ArrowDatasetEngine`` when ``filters`` are specified.
        Determines whether the engine should avoid inserting large pyarrow
        (``ParquetFileFragment``) objects in the task graph.  If this option
        is True, ``read_partition`` will need to regenerate the appropriate
        fragment object from the path and row-group IDs.  This will reduce the
        size of the task graph, but will add minor overhead to ``read_partition``.
        By default (None), ``ArrowDatasetEngine`` will set this option to
        ``False`` when there are filters.
    chunksize : int or str, default None
        The desired size of each output ``DataFrame`` partition in terms of total
        (uncompressed) parquet storage space. If specified, adjacent row-groups
        and/or files will be aggregated into the same output partition until the
        cumulative ``total_byte_size`` parquet-metadata statistic reaches this
        value. Use `aggregate_files` to enable/disable inter-file aggregation.
    aggregate_files : bool or str, default None
        Whether distinct file paths may be aggregated into the same output
        partition. This parameter requires `gather_statistics=True`, and is
        only used when `chunksize` is specified or when `split_row_groups` is
        an integer >1. A setting of True means that any two file paths may be
        aggregated into the same output partition, while False means that
        inter-file aggregation is prohibited.

        For "hive-partitioned" datasets, a "partition"-column name can also be
        specified. In this case, we allow the aggregation of any two files
        sharing a file path up to, and including, the corresponding directory name.
        For example, if ``aggregate_files`` is set to ``"section"`` for the
        directory structure below, ``03.parquet`` and ``04.parquet`` may be
        aggregated together, but ``01.parquet`` and ``02.parquet`` cannot be.
        If, however, ``aggregate_files`` is set to ``"region"``, ``01.parquet``
        may be aggregated with ``02.parquet``, and ``03.parquet`` may be aggregated
        with ``04.parquet``::

            dataset-path/
            ├── region=1/
            │   ├── section=a/
            │   │   └── 01.parquet
            │   └── section=b/
            │       └── 02.parquet
            └── region=2/
                └── section=a/
                    ├── 03.parquet
                    └── 04.parquet

        Note that the default behavior of ``aggregate_files`` is False.
    **kwargs: dict (of dicts)
        Passthrough key-word arguments for read backend.
        The top-level keys correspond to the appropriate operation type, and
        the second level corresponds to the kwargs that will be passed on to
        the underlying ``pyarrow`` or ``fastparquet`` function.
        Supported top-level keys: 'dataset' (for opening a ``pyarrow`` dataset),
        'file' (for opening a ``fastparquet`` ``ParquetFile``), 'read' (for the
        backend read function), 'arrow_to_pandas' (for controlling the arguments
        passed to convert from a ``pyarrow.Table.to_pandas()``)

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    pyarrow.parquet.ParquetDataset
    """

    if isinstance(columns, str):
        df = read_parquet(
            path,
            columns=[columns],
            filters=filters,
            categories=categories,
            index=index,
            storage_options=storage_options,
            engine=engine,
            gather_statistics=gather_statistics,
            split_row_groups=split_row_groups,
            read_from_paths=read_from_paths,
            chunksize=chunksize,
            aggregate_files=aggregate_files,
        )
        return df[columns]

    if columns is not None:
        columns = list(columns)

    label = "read-parquet-"
    output_name = label + tokenize(
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
        split_row_groups,
        read_from_paths,
        chunksize,
        aggregate_files,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)

    paths = sorted(paths,
                   key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    if chunksize or (split_row_groups and int(split_row_groups) > 1
                     and aggregate_files):
        # Require `gather_statistics=True` if `chunksize` is used,
        # or if `split_row_groups>1` and we are aggregating files.
        if gather_statistics is False:
            raise ValueError(
                "read_parquet options require gather_statistics=True")
        gather_statistics = True

    read_metadata_result = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=gather_statistics,
        filters=filters,
        split_row_groups=split_row_groups,
        read_from_paths=read_from_paths,
        chunksize=chunksize,
        aggregate_files=aggregate_files,
        **kwargs,
    )

    # In the future, we may want to give the engine the
    # option to return a dedicated element for `common_kwargs`.
    # However, to avoid breaking the API, we just embed this
    # data in the first element of `parts` for now.
    # The logic below is intended to handle backward and forward
    # compatibility with a user-defined engine.
    meta, statistics, parts, index = read_metadata_result[:4]
    common_kwargs = {}
    aggregation_depth = False
    if len(parts):
        # For now, `common_kwargs` and `aggregation_depth`
        # may be stored in the first element of `parts`
        common_kwargs = parts[0].pop("common_kwargs", {})
        aggregation_depth = parts[0].pop("aggregation_depth",
                                         aggregation_depth)

    # Parse dataset statistics from metadata (if available)
    parts, divisions, index, index_in_columns = process_statistics(
        parts,
        statistics,
        filters,
        index,
        chunksize,
        split_row_groups,
        fs,
        aggregation_depth,
    )

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    meta, index, columns = set_index_columns(meta, index, columns,
                                             index_in_columns,
                                             auto_index_allowed)
    if meta.index.name == NONE_LABEL:
        meta.index.name = None

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)
        if meta.index.name == NONE_LABEL:
            meta.index.name = None

    if len(divisions) < 2:
        # empty dataframe - just use meta
        graph = {(output_name, 0): meta}
        divisions = (None, None)
    else:
        # Create Blockwise layer
        layer = DataFrameIOLayer(
            output_name,
            columns,
            parts,
            ParquetFunctionWrapper(
                engine,
                fs,
                meta,
                columns,
                index,
                kwargs,
                common_kwargs,
            ),
            label=label,
        )
        graph = HighLevelGraph({output_name: layer}, {output_name: set()})

    return new_dd_object(graph, output_name, meta, divisions)
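A small local round trip exercising a few of the options documented above; it assumes pandas, pyarrow, and dask are installed and writes to a temporary directory.

import tempfile
import pandas as pd
import dask.dataframe as dd

tmpdir = tempfile.mkdtemp()
pdf = pd.DataFrame({"id": range(10), "value": [i * 0.5 for i in range(10)]})
dd.from_pandas(pdf, npartitions=2).to_parquet(tmpdir, engine="pyarrow")

ddf = dd.read_parquet(
    tmpdir,
    columns=["value"],           # read a subset of the columns
    filters=[[("id", ">", 3)]],  # DNF filter; prunes row-groups/files where possible
    engine="pyarrow",
)
print(ddf.compute())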
Example #10
def to_parquet(
    df,
    path,
    engine="auto",
    compression="default",
    write_index=True,
    append=False,
    overwrite=False,
    ignore_divisions=False,
    partition_on=None,
    storage_options=None,
    custom_metadata=None,
    write_metadata_file=True,
    compute=True,
    compute_kwargs=None,
    schema=None,
    **kwargs,
):
    """Store Dask.dataframe to Parquet files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet library to use. If only one library is installed, it will use
        that one; if both, it will use 'fastparquet'.
    compression : string or dict, default 'default'
        Either a string like ``"snappy"`` or a dictionary mapping column names
        to compressors like ``{"name": "gzip", "values": "snappy"}``. The
        default is ``"default"``, which uses the default compression for
        whichever engine is selected.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    append : bool, default False
        If False (default), construct data-set from scratch. If True, add new
        row-group(s) to an existing data-set. In the latter case, the data-set
        must exist, and the schema must match the input data.
    overwrite : bool, default False
        Whether or not to remove the contents of `path` before writing the dataset.
        The default is False.  If True, the specified path must correspond to
        a directory (but not the current working directory).  This option cannot
        be set to True if `append=True`.
        NOTE: `overwrite=True` will remove the original data even if the current
        write operation fails.  Use at your own risk.
    ignore_divisions : bool, default False
        If False (default) raises error when previous divisions overlap with
        the new appended divisions. Ignored if append=False.
    partition_on : list, default None
        Construct directory-based partitioning by splitting on these fields'
        values. Each dask partition will result in one or more datafiles,
        there will be no global groupby.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    custom_metadata : dict, default None
        Custom key/value metadata to include in all footer metadata (and
        in the global "_metadata" file, if applicable).  Note that the custom
        metadata may not contain the reserved b"pandas" key.
    write_metadata_file : bool, default True
        Whether to write the special "_metadata" file.
    compute : bool, default True
        If :obj:`True` (default) then the result is computed immediately. If  :obj:`False`
        then a ``dask.dataframe.Scalar`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method
    schema : Schema object, dict, or {"infer", None}, default None
        Global schema to use for the output dataset. Alternatively, a `dict`
        of pyarrow types can be specified (e.g. `schema={"id": pa.string()}`).
        For this case, fields excluded from the dictionary will be inferred
        from `_meta_nonempty`.  If "infer", the first non-empty and non-null
        partition will be used to infer the type for "object" columns. If
        None (default), we let the backend infer the schema for each distinct
        output partition. If the partitions produce inconsistent schemas,
        pyarrow will throw an error when writing the shared _metadata file.
        Note that this argument is ignored by the "fastparquet" engine.
    **kwargs :
        Extra options to be passed on to the specific backend.

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> df.to_parquet('/path/to/output/', ...)  # doctest: +SKIP

    See Also
    --------
    read_parquet: Read parquet data to dask.dataframe
    """
    compute_kwargs = compute_kwargs or {}

    if compression == "default":
        if snappy is not None:
            compression = "snappy"
        else:
            compression = None

    partition_on = partition_on or []
    if isinstance(partition_on, str):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError("Partitioning on non-existent column. "
                         "partition_on=%s ."
                         "columns=%s" %
                         (str(partition_on), str(list(df.columns))))

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if overwrite:
        if isinstance(fs, LocalFileSystem):
            working_dir = fs.expand_path(".")[0]
            if path.rstrip("/") == working_dir.rstrip("/"):
                raise ValueError(
                    "Cannot clear the contents of the current working directory!"
                )
        if append:
            raise ValueError(
                "Cannot use both `overwrite=True` and `append=True`!")
        if fs.isdir(path):
            # Only remove path contents if
            # (1) The path exists
            # (2) The path is a directory
            # (3) The path is not the current working directory
            fs.rm(path, recursive=True)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called.  The default name is "index",
        # but dask will always change the name to the NONE_LABEL constant
        if NONE_LABEL not in df.columns:
            division_info["name"] = NONE_LABEL
        elif write_index:
            raise ValueError(
                "Index must have a name if __null_dask_index__ is a column.")
        else:
            warnings.warn(
                "If read back by Dask, column named __null_dask_index__ "
                "will be set to the index (and renamed to None).")

    # There are some "reserved" names that may be used as the default column
    # name after resetting the index. However, we don't want to treat it as
    # a "special" name if the string is already used as a "real" column name.
    reserved_names = []
    for name in ["index", "level_0"]:
        if name not in df.columns:
            reserved_names.append(name)

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (we will set the name
    # to the NONE_LABEL constant if it is originally `None`).
    # `fastparquet` will use `index_cols` to specify the index column(s)
    # in the metadata.  `pyarrow` will revert the `reset_index` call
    # below if `index_cols` is populated (because pyarrow will want to handle
    # index preservation itself).  For both engines, the column index
    # will be written to "pandas metadata" if write_index=True
    index_cols = []
    if write_index:
        real_cols = set(df.columns)
        none_index = list(df._meta.index.names) == [None]
        df = df.reset_index()
        if none_index:
            df.columns = [
                c if c not in reserved_names else NONE_LABEL
                for c in df.columns
            ]
        index_cols = [c for c in set(df.columns) - real_cols]
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    _to_parquet_kwargs = {
        "engine",
        "compression",
        "write_index",
        "append",
        "ignore_divisions",
        "partition_on",
        "storage_options",
        "write_metadata_file",
        "compute",
    }
    kwargs_pass = {
        k: v
        for k, v in kwargs.items() if k not in _to_parquet_kwargs
    }

    # Engine-specific initialization steps to write the dataset.
    # Possibly create parquet metadata, and load existing stuff if appending
    meta, schema, i_offset = engine.initialize_write(
        df,
        fs,
        path,
        append=append,
        ignore_divisions=ignore_divisions,
        partition_on=partition_on,
        division_info=division_info,
        index_cols=index_cols,
        schema=schema,
        **kwargs_pass,
    )

    # Use i_offset and df.npartitions to define file-name list
    filenames = [
        "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions)
    ]

    # Construct IO graph
    dsk = {}
    name = "to-parquet-" + tokenize(
        df,
        fs,
        path,
        append,
        ignore_divisions,
        partition_on,
        division_info,
        index_cols,
        schema,
    )
    part_tasks = []
    kwargs_pass["fmd"] = meta
    kwargs_pass["compression"] = compression
    kwargs_pass["index_cols"] = index_cols
    kwargs_pass["schema"] = schema
    if custom_metadata:
        if b"pandas" in custom_metadata.keys():
            raise ValueError(
                "User-defined key/value metadata (custom_metadata) can not "
                "contain a b'pandas' key.  This key is reserved by Pandas, "
                "and overwriting the corresponding value can render the "
                "entire dataset unreadable.")
        kwargs_pass["custom_metadata"] = custom_metadata
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
                partition_on,
                write_metadata_file,
            ],
            toolz.merge(kwargs_pass, {"head": True})
            if d == 0 else kwargs_pass,
        )
        part_tasks.append((name, d))

    final_name = "metadata-" + name
    # Collect metadata and write _metadata

    if write_metadata_file:
        dsk[(final_name, 0)] = (
            apply,
            engine.write_metadata,
            [
                part_tasks,
                meta,
                fs,
                path,
            ],
            {
                "append": append,
                "compression": compression
            },
        )
    else:
        dsk[(final_name, 0)] = (lambda x: None, part_tasks)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])

    if compute:
        if write_metadata_file:
            return compute_as_if_collection(DataFrame, graph, (final_name, 0),
                                            **compute_kwargs)
        else:
            return compute_as_if_collection(DataFrame, graph, part_tasks,
                                            **compute_kwargs)
    else:
        return Scalar(graph, final_name, "")
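A hedged usage sketch for the writer above, using a local temporary directory and hive-style partitioning; pandas, pyarrow, and dask are assumed to be installed.

import os
import tempfile
import pandas as pd
import dask.dataframe as dd

out = tempfile.mkdtemp()
ddf = dd.from_pandas(
    pd.DataFrame({"region": ["a", "a", "b", "b"], "value": [1, 2, 3, 4]}),
    npartitions=2,
)
ddf.to_parquet(
    out,
    engine="pyarrow",
    partition_on=["region"],   # produces region=a/ and region=b/ subdirectories
    write_metadata_file=True,  # also writes the shared _metadata file
)
print(sorted(os.listdir(out)))  # e.g. ['_common_metadata', '_metadata', 'region=a', 'region=b']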
Example #11
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name.  Please
    see the Pandas docstring for more detailed information about shared keyword
    arguments.

    This function differs from the Pandas version by saving the many partitions
    of a Dask DataFrame in parallel, either to many files, or to many datasets
    within the same file.  You may specify this parallelism with an asterisk
    ``*`` within the filename or datapath, and an optional ``name_function``.
    The asterisk will be replaced with an increasing sequence of integers
    starting from ``0`` or with the result of calling ``name_function`` on each
    of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*`` to
        denote many filenames.
    key : string
        Datapath within the files.  May contain a ``*`` to denote many locations
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return a
        string. (see examples below)
    compute : bool
        Whether or not to execute immediately.  If False then this returns a
        ``dask.Delayed`` value.
    lock : Lock, optional
        Lock to use to prevent concurrency issues.  By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save Data to a single file

    >>> df.to_hdf('output.hdf', '/data')            # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')          # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')          # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes') # doctest: +SKIP

    Specify custom naming scheme.  This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc..

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return str(base + timedelta(days=i))

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function) # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each partition
        is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute ``to_hdf``
        when computed.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only support 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it's saved and read
    # so we require that name_function preserves the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If the user did not specify a scheduler and the write is sequential, default
    # to the single-threaded scheduler. Otherwise let the _get method choose the scheduler.
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not multiprocessing.get:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
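A minimal local sketch of the one-file-per-partition pattern described in the docstring above; pandas, dask, and PyTables are assumed to be installed.

import os
import tempfile
import pandas as pd
import dask.dataframe as dd

tmpdir = tempfile.mkdtemp()
ddf = dd.from_pandas(pd.DataFrame({"x": range(6)}), npartitions=3)

# The "*" in the filename is replaced per partition (0, 1, 2 by default)
filenames = ddf.to_hdf(os.path.join(tmpdir, "out-*.hdf"), "/data")
print(filenames)  # ['.../out-0.hdf', '.../out-1.hdf', '.../out-2.hdf']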
Example #12
    def __init__(
        self,
        path_or_source,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        dtypes=None,
        client=None,
        cpu=None,
        base_dataset=None,
        **kwargs,
    ):
        self.dtypes = dtypes
        self.client = client

        # Check if we are keeping data in cpu memory
        self.cpu = cpu or False

        # Keep track of base dataset (optional)
        self.base_dataset = base_dataset or self

        # For now, lets warn the user that "cpu mode" is experimental
        if self.cpu:
            warnings.warn(
                "Initializing an NVTabular Dataset in CPU mode."
                "This is an experimental feature with extremely limited support!"
            )

        if isinstance(path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
            # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
            # Use DataFrameDatasetEngine
            moved_collection = (
                False  # Whether a pd-backed collection was moved to cudf (or vice versa)
            )
            if self.cpu:
                if isinstance(path_or_source, pd.DataFrame):
                    # Convert pandas DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = dask.dataframe.from_pandas(path_or_source, npartitions=1)
                elif isinstance(path_or_source, cudf.DataFrame):
                    # Convert cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = dask.dataframe.from_pandas(
                        path_or_source.to_pandas(), npartitions=1
                    )
                elif isinstance(path_or_source, dask_cudf.DataFrame):
                    # Convert dask_cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = path_or_source.to_dask_dataframe()
                    moved_collection = True
            else:
                if isinstance(path_or_source, cudf.DataFrame):
                    # Convert cudf DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
                elif isinstance(path_or_source, pd.DataFrame):
                    # Convert pandas DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_cudf(
                        cudf.from_pandas(path_or_source), npartitions=1
                    )
                elif not isinstance(path_or_source, dask_cudf.DataFrame):
                    # Convert dask.dataframe.DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
                    moved_collection = True
            if part_size:
                warnings.warn("part_size is ignored for DataFrame input.")
            if part_mem_fraction:
                warnings.warn("part_mem_fraction is ignored for DataFrame input.")
            self.engine = DataFrameDatasetEngine(
                path_or_source, cpu=self.cpu, moved_collection=moved_collection
            )
        else:
            if part_size:
                # If a specific partition size is given, use it directly
                part_size = parse_bytes(part_size)
            else:
                # If a fractional partition size is given, calculate part_size
                part_mem_fraction = part_mem_fraction or 0.125
                assert 0.0 < part_mem_fraction < 1.0
                if part_mem_fraction > 0.25:
                    warnings.warn(
                        "Using very large partitions sizes for Dask. "
                        "Memory-related errors are likely."
                    )
                part_size = int(device_mem_size(kind="total") * part_mem_fraction)

            # Engine-agnostic path handling
            paths = path_or_source
            if hasattr(paths, "name"):
                paths = stringify_path(paths)
            if isinstance(paths, str):
                paths = [paths]
            paths = sorted(paths, key=natural_sort_key)

            storage_options = storage_options or {}
            # If engine is not provided, try to infer from end of paths[0]
            if engine is None:
                engine = paths[0].split(".")[-1]
            if isinstance(engine, str):
                if engine == "parquet":
                    self.engine = ParquetDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "csv":
                    self.engine = CSVDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "avro":
                    try:
                        from .avro import AvroDatasetEngine
                    except ImportError as e:
                        raise RuntimeError(
                            "Failed to import AvroDatasetEngine. Make sure uavro is installed."
                        ) from e

                    self.engine = AvroDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                else:
                    raise ValueError("Only parquet, csv, and avro supported (for now).")
            else:
                self.engine = engine(
                    paths, part_size, cpu=self.cpu, storage_options=storage_options
                )
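A hedged usage sketch for the constructor above: building a Dataset from an in-memory pandas DataFrame in the experimental CPU mode, which exercises the dask.dataframe.from_pandas branch shown earlier. An NVTabular installation is assumed.

import pandas as pd
import nvtabular as nvt  # assumes NVTabular is installed

pdf = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})

# CPU mode emits the experimental-feature warning and wraps the frame in a
# pandas-backed dask collection via the DataFrameDatasetEngine branch above
ds = nvt.Dataset(pdf, cpu=True)
print(type(ds.engine).__name__)  # DataFrameDatasetEngine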
Example #13
def to_parquet(
    df,
    path,
    engine="auto",
    compression="default",
    write_index=True,
    append=False,
    ignore_divisions=False,
    partition_on=None,
    storage_options=None,
    write_metadata_file=True,
    compute=True,
    compute_kwargs=None,
    schema=None,
    **kwargs,
):
    """Store Dask.dataframe to Parquet files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet library to use. If only one library is installed, it will use
        that one; if both, it will use 'fastparquet'.
    compression : string or dict, optional
        Either a string like ``"snappy"`` or a dictionary mapping column names
        to compressors like ``{"name": "gzip", "values": "snappy"}``. The
        default is ``"default"``, which uses the default compression for
        whichever engine is selected.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    append : bool, optional
        If False (default), construct data-set from scratch. If True, add new
        row-group(s) to an existing data-set. In the latter case, the data-set
        must exist, and the schema must match the input data.
    ignore_divisions : bool, optional
        If False (default) raises error when previous divisions overlap with
        the new appended divisions. Ignored if append=False.
    partition_on : list, optional
        Construct directory-based partitioning by splitting on these fields'
        values. Each dask partition will result in one or more datafiles,
        there will be no global groupby.
    storage_options : dict, optional
        Key/value pairs to be passed on to the file-system backend, if any.
    write_metadata_file : bool, optional
        Whether to write the special "_metadata" file.
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    schema : Schema object, dict, or {"infer", None}, optional
        Global schema to use for the output dataset. Alternatively, a `dict`
        of pyarrow types can be specified (e.g. `schema={"id": pa.string()}`).
        For this case, fields excluded from the dictionary will be inferred
        from `_meta_nonempty`.  If "infer", the first non-empty and non-null
        partition will be used to infer the type for "object" columns. If
        None (default), we let the backend infer the schema for each distinct
        output partition. If the partitions produce inconsistent schemas,
        pyarrow will throw an error when writing the shared _metadata file.
        Note that this argument is ignored by the "fastparquet" engine.
    **kwargs :
        Extra options to be passed on to the specific backend.

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> dd.to_parquet(df, '/path/to/output/',...)  # doctest: +SKIP

    See Also
    --------
    read_parquet: Read parquet data to dask.dataframe
    """
    from dask import delayed

    if compression == "default":
        if snappy is not None:
            compression = "snappy"
        else:
            compression = None

    partition_on = partition_on or []
    if isinstance(partition_on, str):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError("Partitioning on non-existent column. "
                         "partition_on=%s ."
                         "columns=%s" %
                         (str(partition_on), str(list(df.columns))))

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called.  The default name is "index",
        # but dask will always change the name to the NONE_LABEL constant
        if NONE_LABEL not in df.columns:
            division_info["name"] = NONE_LABEL
        elif write_index:
            raise ValueError(
                "Index must have a name if __null_dask_index__ is a column.")
        else:
            warnings.warn(
                "If read back by Dask, column named __null_dask_index__ "
                "will be set to the index (and renamed to None).")

    # There are some "reserved" names that may be used as the default column
    # name after resetting the index. However, we don't want to treat it as
    # a "special" name if the string is already used as a "real" column name.
    reserved_names = []
    for name in ["index", "level_0"]:
        if name not in df.columns:
            reserved_names.append(name)

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (we will set the name
    # to the NONE_LABEL constant if it is originally `None`).
    # `fastparquet` will use `index_cols` to specify the index column(s)
    # in the metadata.  `pyarrow` will revert the `reset_index` call
    # below if `index_cols` is populated (because pyarrow will want to handle
    # index preservation itself).  For both engines, the column index
    # will be written to "pandas metadata" if write_index=True
    index_cols = []
    if write_index:
        real_cols = set(df.columns)
        none_index = list(df._meta.index.names) == [None]
        df = df.reset_index()
        if none_index:
            df.columns = [
                c if c not in reserved_names else NONE_LABEL
                for c in df.columns
            ]
        index_cols = [c for c in set(df.columns).difference(real_cols)]
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    _to_parquet_kwargs = {
        "engine",
        "compression",
        "write_index",
        "append",
        "ignore_divisions",
        "partition_on",
        "storage_options",
        "write_metadata_file",
        "compute",
    }
    kwargs_pass = {
        k: v
        for k, v in kwargs.items() if k not in _to_parquet_kwargs
    }

    # Engine-specific initialization steps to write the dataset.
    # Possibly create parquet metadata, and load existing stuff if appending
    meta, schema, i_offset = engine.initialize_write(
        df,
        fs,
        path,
        append=append,
        ignore_divisions=ignore_divisions,
        partition_on=partition_on,
        division_info=division_info,
        index_cols=index_cols,
        schema=schema,
        **kwargs_pass,
    )

    # Use i_offset and df.npartitions to define file-name list
    filenames = [
        "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions)
    ]

    # write parts
    dwrite = delayed(engine.write_partition)
    parts = [
        dwrite(
            d,
            path,
            fs,
            filename,
            partition_on,
            write_metadata_file,
            fmd=meta,
            compression=compression,
            index_cols=index_cols,
            schema=schema,
            **kwargs_pass,
        ) for d, filename in zip(df.to_delayed(), filenames)
    ]

    # single task to complete
    out = delayed(lambda x: None)(parts)
    if write_metadata_file:
        out = delayed(engine.write_metadata)(parts,
                                             meta,
                                             fs,
                                             path,
                                             append=append,
                                             compression=compression)

    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        out = out.compute(**compute_kwargs)
    return out
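Because this variant builds the write out of delayed tasks, passing compute=False defers execution; a small local sketch (assumes pandas, pyarrow, and dask):

import tempfile
import pandas as pd
import dask.dataframe as dd

out = tempfile.mkdtemp()
ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)

# compute=False returns a lazy object instead of writing immediately
writes = ddf.to_parquet(out, engine="pyarrow", compute=False)
writes.compute()  # the files are written only here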
Example #14
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Build the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = _ensure_filesystem(passed_filesystem=None, path=source)
            source = stringify_path(source)
            source = fs.sep.join([source, "*.parquet"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        if isinstance(tmp_source, list):
            filepath_or_buffer.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.split_by_row_group(filters):
                for rg_info in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
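
The loop above normalizes row_groups into one list per source. A minimal sketch of the accepted shapes (assumes cudf is installed; the file name is illustrative):

import cudf

# Write a small file so the calls below have something to read.
cudf.DataFrame({"x": range(6)}).to_parquet("example.parquet")

one = cudf.read_parquet("example.parquet", row_groups=0)     # becomes [[0]]
flat = cudf.read_parquet("example.parquet", row_groups=[0])  # becomes [[0]]
multi = cudf.read_parquet(["example.parquet", "example.parquet"],
                          row_groups=[[0], [0]])             # one list per source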
Example #15
0
def to_orc(
    df,
    path,
    engine="pyarrow",
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to ORC files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : 'pyarrow' or ORCEngine
        ORC library to use. Defaults to 'pyarrow'.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> df.to_orc('/path/to/output/', ...)  # doctest: +SKIP

    See Also
    --------
    read_orc: Read ORC data to dask.dataframe
    """

    # Get engine
    engine = _get_engine(engine, write=True)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.orc" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-orc-" + tokenize(
        df,
        fs,
        path,
        engine,
        write_index,
        storage_options,
    )
    final_name = name + "-final"
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
    part_tasks = list(dsk.keys())
    dsk[(final_name, 0)] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections((final_name, 0), dsk, dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return compute_as_if_collection(DataFrame, graph, part_tasks, **compute_kwargs)
    return Scalar(graph, final_name, "")
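
A minimal usage sketch of the compute=False path above (assumes the to_orc shown here is in scope and pyarrow is installed; the destination path is illustrative):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
lazy = to_orc(ddf, "/tmp/example-orc", compute=False)  # returns a dask Scalar
lazy.compute()                                         # writes part.0.orc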
Example #16
0
    def __init__(
        self,
        path_or_source,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        dtypes=None,
        **kwargs,
    ):
        self.dtypes = dtypes
        if isinstance(
                path_or_source,
            (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
            # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
            # Use DataFrameDatasetEngine
            if isinstance(path_or_source, cudf.DataFrame):
                path_or_source = dask_cudf.from_cudf(path_or_source,
                                                     npartitions=1)
            elif isinstance(path_or_source, pd.DataFrame):
                path_or_source = dask_cudf.from_cudf(
                    cudf.from_pandas(path_or_source), npartitions=1)
            elif not isinstance(path_or_source, dask_cudf.DataFrame):
                path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
            if part_size:
                warnings.warn("part_size is ignored for DataFrame input.")
            if part_mem_fraction:
                warnings.warn(
                    "part_mem_fraction is ignored for DataFrame input.")
            self.engine = DataFrameDatasetEngine(path_or_source)
        else:
            if part_size:
                # If a specific partition size is given, use it directly
                part_size = parse_bytes(part_size)
            else:
                # If a fractional partition size is given, calculate part_size
                part_mem_fraction = part_mem_fraction or 0.125
                assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
                if part_mem_fraction > 0.25:
                    warnings.warn(
                        "Using very large partitions sizes for Dask. "
                        "Memory-related errors are likely.")
                part_size = int(
                    device_mem_size(kind="total") * part_mem_fraction)

            # Engine-agnostic path handling
            paths = path_or_source
            if hasattr(paths, "name"):
                paths = stringify_path(paths)
            if isinstance(paths, str):
                paths = [paths]

            storage_options = storage_options or {}
            # If engine is not provided, try to infer from end of paths[0]
            if engine is None:
                engine = paths[0].split(".")[-1]
            if isinstance(engine, str):
                if engine == "parquet":
                    self.engine = ParquetDatasetEngine(
                        paths,
                        part_size,
                        storage_options=storage_options,
                        **kwargs)
                elif engine == "csv":
                    self.engine = CSVDatasetEngine(
                        paths,
                        part_size,
                        storage_options=storage_options,
                        **kwargs)
                else:
                    raise ValueError(
                        "Only parquet and csv supported (for now).")
            else:
                self.engine = engine(paths,
                                     part_size,
                                     storage_options=storage_options)
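
A minimal sketch of the two partition-sizing paths handled above, assuming this constructor is exposed as nvtabular.Dataset (the glob paths are illustrative):

import nvtabular as nvt

# An explicit part_size is used directly:
fixed = nvt.Dataset("data/*.parquet", engine="parquet", part_size="256MB")
# Otherwise a fraction of total device memory is used (default 0.125):
frac = nvt.Dataset("data/*.csv", engine="csv", part_mem_fraction=0.10)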
Example #17
0
    def _strip_protocol(cls, path):
        path = stringify_path(path)
        if "://" in path:
            _, _, path = path.partition("://")

        return path
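
A standalone sketch of the same protocol-stripping idea (for illustration only, not the class method above):

def strip_protocol(path: str) -> str:
    # Drop everything up to and including the first "://", if present.
    if "://" in path:
        _, _, path = path.partition("://")
    return path

assert strip_protocol("s3://bucket/key") == "bucket/key"
assert strip_protocol("relative/path") == "relative/path"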
Example #18
0
    def ls(
        self,
        path: str,
        detail: bool = False,
        invalidate_cache: bool = True,
        delimiter: str = "/",
        **kwargs,
    ):
        """ Create a list of blob names from a blob container

        Parameters
        ----------
        path:  Path to an Azure Blob directory
        detail:  If False, return a list of blob names, else a list of dictionaries with blob details
        invalidate_cache:  Boolean
        """
        logging.debug("Running abfs.ls() method")
        path = stringify_path(path)
        if path.strip() == "":
            homedir_ = self.blob_fs.list_blobs(
                container_name=self.container_name, prefix=path, delimiter=delimiter
            )
            homedir = list(homedir_)[0]
            return self.ls(path=homedir.name, detail=detail, delimiter=delimiter)

        else:
            blobs = self.blob_fs.list_blobs(
                container_name=self.container_name, prefix=path, delimiter=delimiter
            )
            blobs_ = list(blobs)
            if len(blobs_) == 1 and isinstance(blobs_[0], BlobPrefix):
                path = blobs_[0].name
                return self.ls(path, detail=detail, delimiter=delimiter)

            if detail is False:
                pathlist = [blob.name for blob in blobs]
                logging.debug(f"Detail is False.  Returning {pathlist}")
                return pathlist
            else:
                pathlist = []
                for blob in blobs:
                    logging.debug(f"Parsing {blob}")
                    data = {}
                    data["name"] = blob.name
                    data["container_name"] = self.container_name
                    try:
                        data["size"] = blob.properties.content_length
                        if blob.properties.content_settings.content_type is not None:
                            data["type"] = "file"
                        else:
                            logging.debug("Assigning {blob} is a directory")
                            data["type"] = "directory"
                    except AttributeError:
                        logging.debug(f"Handling AttributeError for {blob.name}")
                        logging.debug(f"")
                        if path == blob.name.rstrip("/"):
                            return self.ls(blob.name, detail=detail, delimiter=None)
                        elif isinstance(blob, BlobPrefix):
                            data["type"] = "directory"
                            data["size"] = 0
                        else:
                            raise AttributeError(
                                f"AzureBlobFileSystem.ls() method unable to assign attributes for {blob}!!"
                            )
                    pathlist.append(data)
                logging.debug(f"Detail is True:  Returning {pathlist}")
                return pathlist
Example #19
0
 def chmod(self, path, mode):
     path = stringify_path(path)
     return os.chmod(path, mode)
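
A runnable sketch of the same pattern, with os.fspath standing in for stringify_path (file name illustrative):

import os
import pathlib
import tempfile

tmp = pathlib.Path(tempfile.gettempdir()) / "chmod_example.txt"
tmp.write_text("hello")
os.chmod(os.fspath(tmp), 0o600)  # path-like object converted to str first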
Example #20
0
File: orc.py Project: vyasr/cudf
def to_orc(
    df,
    path,
    write_index=True,
    storage_options=None,
    compression=None,
    compute=True,
    **kwargs,
):
    """Write a dask_cudf dataframe to ORC file(s) (one file per partition).

    Parameters
    ----------
    df : dask_cudf.DataFrame
    path: string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.
    compression : string or dict, optional
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    """

    from dask import compute as dask_compute, delayed

    # TODO: Use upstream dask implementation once available
    #       (see: Dask Issue#5596)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if write_index:
        df = df.reset_index()
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    fs.mkdirs(path, exist_ok=True)

    # Use df.npartitions to define file-name list
    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]

    # write parts
    dwrite = delayed(write_orc_partition)
    parts = [
        dwrite(d, path, fs, filename, compression=compression)
        for d, filename in zip(df.to_delayed(), filenames)
    ]

    if compute:
        return dask_compute(*parts)

    return delayed(list)(parts)
Example #21
0
def read_hdf(
    pattern,
    key,
    start=0,
    stop=None,
    columns=None,
    chunksize=1000000,
    sorted_index=False,
    lock=True,
    mode="a",
):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, pathlib.Path, list
        File pattern (string), pathlib.Path, buffer to read from, or list of
        file paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row), row number to
        stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return
        columns (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'a'. Mode to use when opening file(s).
        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            It is similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
    """
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith("/") else "/" + key
    # Convert path-like objects to a string
    pattern = stringify_path(pattern)

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern
    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError("When assuming pre-partitioned data, data must be "
                         "read in its entirety using the same chunksizes")
    from ..multi import concat

    return concat([
        _read_single_hdf(
            path,
            key,
            start=start,
            stop=stop,
            columns=columns,
            chunksize=chunksize,
            sorted_index=sorted_index,
            lock=lock,
            mode=mode,
        ) for path in paths
    ])
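
A minimal round-trip sketch for the reader above (assumes pandas with PyTables and dask; the file name and key are illustrative, and table format keeps the store compatible with chunked reads):

import pandas as pd
import dask.dataframe as dd

pd.DataFrame({"x": range(10)}).to_hdf("example.h5", key="/data", format="table")
ddf = dd.read_hdf("example.h5", "/data", chunksize=4)  # up to 4 rows per partition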
Example #22
0
def parquet_reader(
    path,
    columns=None,
    row_groups_per_part=None,
    index=None,
    storage_options=None,
    **kwargs,
):

    name = "opt-read-parquet-" + tokenize(path, columns, index,
                                          storage_options, row_groups_per_part)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)
    if len(paths) > 1 or not fs.isdir(paths[0]):
        raise ValueError(
            "Must pass in a directory path to use `row_groups_per_part`.")

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    dd_meta, parts = _read_metadata(fs, path, row_groups_per_part, index=index)
    strings_to_cats = kwargs.get("strings_to_categorical", False)
    meta = cudf.DataFrame(index=dd_meta.index)
    for col in dd_meta.columns:
        if dd_meta[col].dtype == "O":
            meta[col] = as_column(
                dd_meta[col], dtype="int32" if strings_to_cats else "object")
        else:
            meta[col] = as_column(dd_meta[col])

    if meta.index.name is not None:
        index = meta.index.name

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    index_in_columns = False
    meta, index, columns = set_index_columns(meta, index, columns,
                                             index_in_columns,
                                             auto_index_allowed)

    dsk = {}
    for p, part in enumerate(parts):
        read_key = (name, p)
        dsk[read_key] = (
            _read_partition,
            part,
            index,
            columns,
            strings_to_cats,
        )

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)

    divisions = [None] * (len(parts) + 1)
    return DataFrame(dsk, name, meta, divisions)
Example #23
0
def read_parquet(
    path,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    storage_options=None,
    engine="auto",
    gather_statistics=None,
    split_row_groups=None,
    read_from_paths=None,
    chunksize=None,
    **kwargs,
):
    """
    Read a Parquet file into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : string or list
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or a
        list of paths, with the caveat that they must all have the same
        protocol.
    columns : string, list or None (default)
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list to
        read in the data as a Series.
    filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]]
        List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. Using this
        argument will NOT result in row-wise filtering of the final partitions
        unless ``engine="pyarrow-dataset"`` is also specified.  For other engines,
        filtering is only performed at the partition level, i.e., to prevent the
        loading of some row-groups and/or files.

        Predicates can be expressed in disjunctive normal form (DNF). This means
        that the innermost tuple describes a single column predicate. These
        inner predicates are combined with an AND conjunction into a larger
        predicate. The outer-most list then combines all of the combined
        filters with an OR disjunction.

        Predicates can also be expressed as a List[Tuple]. These are evaluated
        as an AND conjunction. To express OR in predicates, one must use the
        (preferred) List[List[Tuple]] notation.
    index : string, list, False or None (default)
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list, dict or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : str, default 'auto'
        Parquet reader library to use. Options include: 'auto', 'fastparquet',
        'pyarrow', 'pyarrow-dataset', and 'pyarrow-legacy'. Defaults to 'auto',
        which selects the FastParquetEngine if fastparquet is installed (and
        ArrowLegacyEngine otherwise).  If 'pyarrow-dataset' is specified, the
        ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used
        for newer PyArrow versions (>=1.0.0). If 'pyarrow' or 'pyarrow-legacy' are
        specified, the ArrowLegacyEngine will be used (which leverages the
        pyarrow.parquet.ParquetDataset API).
        NOTE: 'pyarrow-dataset' enables row-wise filtering, but requires
        pyarrow>=1.0. The behavior of 'pyarrow' will most likely change to
        ArrowDatasetEngine in a future release, and the 'pyarrow-legacy'
        option will be deprecated once the ParquetDataset API is deprecated.
    gather_statistics : bool or None (default).
        Gather the statistics for each dataset partition. By default,
        this will only be done if the _metadata file is available. Otherwise,
        statistics will only be gathered if True, because the footer of
        every file will be parsed (which is very slow on some systems).
    split_row_groups : bool or int
        Default is True if a _metadata file is available or if
        the dataset is composed of a single file (otherwise default is False).
        If True, then each output dataframe partition will correspond to a single
        parquet-file row-group. If False, each partition will correspond to a
        complete file.  If a positive integer value is given, each dataframe
        partition will correspond to that number of parquet row-groups (or fewer).
        Only the "pyarrow" engine supports this argument.
    read_from_paths : bool or None (default)
        Only used by ``ArrowDatasetEngine`` when ``filters`` are specified.
        Determines whether the engine should avoid inserting large pyarrow
        (``ParquetFileFragment``) objects in the task graph.  If this option
        is True, ``read_partition`` will need to regenerate the appropriate
        fragment object from the path and row-group IDs.  This will reduce the
        size of the task graph, but will add minor overhead to ``read_partition``.
        By default (None), ``ArrowDatasetEngine`` will set this option to
        ``False`` when there are filters.
    chunksize : int, str
        The target task partition size.  If set, consecutive row-groups
        from the same file will be aggregated into the same output
        partition until the aggregate size reaches this value.
    **kwargs: dict (of dicts)
        Passthrough key-word arguments for read backend.
        The top-level keys correspond to the appropriate operation type, and
        the second level corresponds to the kwargs that will be passed on to
        the underlying ``pyarrow`` or ``fastparquet`` function.
        Supported top-level keys: 'dataset' (for opening a ``pyarrow`` dataset),
        'file' (for opening a ``fastparquet`` ``ParquetFile``), 'read' (for the
        backend read function), 'arrow_to_pandas' (for controlling the arguments
        passed to convert from a ``pyarrow.Table.to_pandas()``)

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """

    if isinstance(columns, str):
        df = read_parquet(
            path,
            columns=[columns],
            filters=filters,
            categories=categories,
            index=index,
            storage_options=storage_options,
            engine=engine,
            gather_statistics=gather_statistics,
            split_row_groups=split_row_groups,
            read_from_paths=read_from_paths,
            chunksize=chunksize,
        )
        return df[columns]

    if columns is not None:
        columns = list(columns)

    name = "read-parquet-" + tokenize(
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
        split_row_groups,
        read_from_paths,
        chunksize,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)

    paths = sorted(paths,
                   key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    read_metadata_result = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=True if chunksize else gather_statistics,
        filters=filters,
        split_row_groups=split_row_groups,
        read_from_paths=read_from_paths,
        **kwargs,
    )

    # In the future, we may want to give the engine the
    # option to return a dedicated element for `common_kwargs`.
    # However, to avoid breaking the API, we just embed this
    # data in the first element of `parts` for now.
    # The logic below is intended to handle backward and forward
    # compatibility with a user-defined engine.
    meta, statistics, parts, index = read_metadata_result[:4]
    common_kwargs = {}
    if len(read_metadata_result) > 4:
        # Engine may return common_kwargs as a separate element
        common_kwargs = read_metadata_result[4]
    elif len(parts):
        # If the engine does not return a dedicated
        # common_kwargs argument, it may be stored in
        # the first element of `parts`
        common_kwargs = parts[0].pop("common_kwargs", {})

    # Parse dataset statistics from metadata (if available)
    parts, divisions, index, index_in_columns = process_statistics(
        parts, statistics, filters, index, chunksize)

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    meta, index, columns = set_index_columns(meta, index, columns,
                                             index_in_columns,
                                             auto_index_allowed)
    if meta.index.name == NONE_LABEL:
        meta.index.name = None

    subgraph = ParquetSubgraph(
        name,
        engine,
        fs,
        meta,
        columns,
        index,
        parts,
        kwargs,
        common_kwargs=common_kwargs,
    )

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)
        if meta.index.name == NONE_LABEL:
            meta.index.name = None

    if len(divisions) < 2:
        # empty dataframe - just use meta
        subgraph = {(name, 0): meta}
        divisions = (None, None)

    return new_dd_object(subgraph, name, meta, divisions)
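
A sketch of the DNF filter notation described in the docstring, expressing (x == 0 AND y > 5) OR (x == 1); the S3 path is the docstring's placeholder and is not expected to exist:

import dask.dataframe as dd

filters = [[("x", "==", 0), ("y", ">", 5)], [("x", "==", 1)]]
ddf = dd.read_parquet("s3://bucket/my-parquet-data",
                      engine="pyarrow-dataset",  # row-wise filtering needs this engine
                      filters=filters)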
Example #24
0
def read_parquet(
    path,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    storage_options=None,
    engine="auto",
    gather_statistics=None,
    split_row_groups=None,
    chunksize=None,
    **kwargs,
):
    """
    Read a Parquet file into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : string or list
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or a
        list of paths, with the caveat that they must all have the same
        protocol.
    columns : string, list or None (default)
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list to
        read in the data as a Series.
    filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]]
        List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This
        implements partition-level (hive) filtering only, i.e., to prevent the
        loading of some row-groups and/or files.

        Predicates can be expressed in disjunctive normal form (DNF). This means
        that the innermost tuple describes a single column predicate. These
        inner predicates are combined with an AND conjunction into a larger
        predicate. The outer-most list then combines all of the combined
        filters with an OR disjunction.

        Predicates can also be expressed as a List[Tuple]. These are evaluated
        as an AND conjunction. To express OR in predicates, one must use the
        (preferred) List[List[Tuple]] notation.
    index : string, list, False or None (default)
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list, dict or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet reader library to use. If only one library is installed, it
        will use that one; if both, it will use 'fastparquet'
    gather_statistics : bool or None (default).
        Gather the statistics for each dataset partition. By default,
        this will only be done if the _metadata file is available. Otherwise,
        statistics will only be gathered if True, because the footer of
        every file will be parsed (which is very slow on some systems).
    split_row_groups : bool or int
        Default is True if a _metadata file is available or if
        the dataset is composed of a single file (otherwise default is False).
        If True, then each output dataframe partition will correspond to a single
        parquet-file row-group. If False, each partition will correspond to a
        complete file.  If a positive integer value is given, each dataframe
        partition will correspond to that number of parquet row-groups (or fewer).
        Only the "pyarrow" engine supports this argument.
    chunksize : int, str
        The target task partition size.  If set, consecutive row-groups
        from the same file will be aggregated into the same output
        partition until the aggregate size reaches this value.
    **kwargs: dict (of dicts)
        Passthrough key-word arguments for read backend.
        The top-level keys correspond to the appropriate operation type, and
        the second level corresponds to the kwargs that will be passed on to
        the underlying `pyarrow` or `fastparquet` function.
        Supported top-level keys: 'dataset' (for opening a `pyarrow` dataset),
        'file' (for opening a `fastparquet` `ParquetFile`), 'read' (for the
        backend read function), 'arrow_to_pandas' (for controlling the arguments
        passed to convert from a `pyarrow.Table.to_pandas()`)

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """

    if isinstance(columns, str):
        df = read_parquet(
            path,
            [columns],
            filters,
            categories,
            index,
            storage_options,
            engine,
            gather_statistics,
        )
        return df[columns]

    if columns is not None:
        columns = list(columns)

    name = "read-parquet-" + tokenize(
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)

    paths = sorted(paths,
                   key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    meta, statistics, parts, index = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=gather_statistics,
        filters=filters,
        split_row_groups=split_row_groups,
        **kwargs,
    )

    # Parse dataset statistics from metadata (if available)
    parts, divisions, index, index_in_columns = process_statistics(
        parts, statistics, filters, index, chunksize)

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    meta, index, columns = set_index_columns(meta, index, columns,
                                             index_in_columns,
                                             auto_index_allowed)
    if meta.index.name == NONE_LABEL:
        meta.index.name = None

    subgraph = ParquetSubgraph(name, engine, fs, meta, columns, index, parts,
                               kwargs)

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)
        if meta.index.name == NONE_LABEL:
            meta.index.name = None

    if len(divisions) < 2:
        # empty dataframe - just use meta
        subgraph = {(name, 0): meta}
        divisions = (None, None)

    return new_dd_object(subgraph, name, meta, divisions)
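
As the docstring notes, passing a single column name (rather than a list) returns a Series; a sketch using the docstring's placeholder path:

import dask.dataframe as dd

s = dd.read_parquet("s3://bucket/my-parquet-data", columns="x")     # dask Series
df = dd.read_parquet("s3://bucket/my-parquet-data", columns=["x"])  # dask DataFrame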
Example #25
0
def read_parquet(path,
                 columns=None,
                 filters=None,
                 categories=None,
                 index=None,
                 storage_options=None,
                 engine="auto",
                 gather_statistics=None,
                 **kwargs):
    """
    Read a Parquet file into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : string or list
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or a
        list of paths, with the caveat that they must all have the same
        protocol.
    columns : string, list or None (default)
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list to
        read in the data as a Series.
    filters : list
        List of filters to apply, like ``[('x', '>', 0), ...]``. This
        implements row-group (partition)-level filtering only, i.e., to
        prevent the loading of some chunks of the data, and only if relevant
        statistics have been included in the metadata.
    index : string, list, False or None (default)
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list, dict or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet reader library to use. If only one library is installed, it
        will use that one; if both, it will use 'fastparquet'
    gather_statistics : bool or None (default).
        Gather the statistics for each dataset partition. By default,
        this will only be done if the _metadata file is available. Otherwise,
        statistics will only be gathered if True, because the footer of
        every file will be parsed (which is very slow on some systems).
    **kwargs: dict (of dicts)
        Passthrough key-word arguments for read backend.
        The top-level keys correspond to the appropriate operation type, and
        the second level corresponds to the kwargs that will be passed on to
        the underlying `pyarrow` or `fastparquet` function.
        Supported top-level keys: 'dataset' (for opening a `pyarrow` dataset),
        'file' (for opening a `fastparquet` `ParquetFile`), and 'read' (for the
        backend read function)

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """

    if isinstance(columns, str):
        df = read_parquet(
            path,
            [columns],
            filters,
            categories,
            index,
            storage_options,
            engine,
            gather_statistics,
        )
        return df[columns]

    if columns is not None:
        columns = list(columns)

    name = "read-parquet-" + tokenize(
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)

    paths = sorted(paths,
                   key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    meta, statistics, parts = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=gather_statistics,
        filters=filters,
        **kwargs)
    if meta.index.name is not None:
        index = meta.index.name

    ignore_index_column_intersection = False
    if columns is None:
        # User didn't specify columns, so ignore any intersection
        # of auto-detected values with the index (if necessary)
        ignore_index_column_intersection = True
        columns = meta.columns

    if not set(columns).issubset(set(meta.columns)):
        raise ValueError(
            "The following columns were not found in the dataset %s\n"
            "The following columns were found %s" %
            (set(columns) - set(meta.columns), meta.columns))

    # Parse dataset statistics from metadata (if available)
    index_in_columns = False
    if statistics:
        result = list(
            zip(*[(part, stats) for part, stats in zip(parts, statistics)
                  if stats["num-rows"] > 0]))
        parts, statistics = result or [[], []]
        if filters:
            parts, statistics = apply_filters(parts, statistics, filters)

        out = sorted_columns(statistics)

        if index and isinstance(index, str):
            index = [index]
        if index and out:
            # Only one valid column
            out = [o for o in out if o["name"] in index]
        if index is not False and len(out) == 1:
            # Use only sorted column with statistics as the index
            divisions = out[0]["divisions"]
            if index is None:
                index_in_columns = True
                index = [out[0]["name"]]
            elif index != [out[0]["name"]]:
                raise ValueError("Specified index is invalid.\n"
                                 "index: {}".format(index))
        elif index is not False and len(out) > 1:
            if any(o["name"] == "index" for o in out):
                # Use sorted column named "index" as the index
                [o] = [o for o in out if o["name"] == "index"]
                divisions = o["divisions"]
                if index is None:
                    index = [o["name"]]
                    index_in_columns = True
                elif index != [o["name"]]:
                    raise ValueError("Specified index is invalid.\n"
                                     "index: {}".format(index))
            else:
                # Multiple sorted columns found, cannot autodetect the index
                warnings.warn(
                    "Multiple sorted columns found, cannot autodetect index",
                    RuntimeWarning,
                )
                index = False
                divisions = [None] * (len(parts) + 1)
        else:
            divisions = [None] * (len(parts) + 1)
    else:
        divisions = [None] * (len(parts) + 1)

    if index:
        if isinstance(index, str):
            index = [index]
        if isinstance(columns, str):
            columns = [columns]

        if ignore_index_column_intersection:
            columns = [col for col in columns if col not in index]
        if set(index).intersection(columns):
            if auto_index_allowed:
                raise ValueError(
                    "Specified index and column arguments must not intersect"
                    " (set index=False or remove the detected index from columns).\n"
                    "index: {} | column: {}".format(index, columns))
            else:
                raise ValueError(
                    "Specified index and column arguments must not intersect.\n"
                    "index: {} | column: {}".format(index, columns))

        # Leaving index as a column in `meta`, because the index
        # will be reset below (in case the index was detected after
        # meta was created)
        if index_in_columns:
            meta = meta[columns + index]
        else:
            meta = meta[columns]

    else:
        meta = meta[list(columns)]

    def _merge_kwargs(x, y):
        z = x.copy()
        z.update(y)
        return z

    subgraph = {(name, i): (
        read_parquet_part,
        engine.read_partition,
        fs,
        meta,
        part["piece"],
        columns,
        index,
        _merge_kwargs(part["kwargs"], kwargs or {}),
    )
                for i, part in enumerate(parts)}

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)

    if len(divisions) < 2:
        # empty dataframe - just use meta
        subgraph = {(name, 0): meta}
        divisions = (None, None)

    return new_dd_object(subgraph, name, meta, divisions)
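
A sketch of the simple filter notation and the index=False option described in the docstring, again using the placeholder path:

import dask.dataframe as dd

ddf = dd.read_parquet("s3://bucket/my-parquet-data",
                      filters=[("x", ">", 0)],  # row-group level pruning only
                      index=False)              # read all fields as columns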
Example #26
0
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a correlating stripe list. If a single stripe list
    # is provided rather than a list of list of stripes then extrapolate that
    # stripe list across all input sources
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe for each source is specified, unless None
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single a "
                "single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
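
A minimal sketch of the per-source stripes nesting the code above expects (assumes cudf is installed; the file is created locally and its name is illustrative):

import cudf

cudf.DataFrame({"x": range(6)}).to_orc("example.orc")

one = cudf.read_orc("example.orc", stripes=[0])  # normalized to [[0]]
two = cudf.read_orc(["example.orc", "example.orc"],
                    stripes=[[0], [0]])          # one stripe list per source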
Example #27
0
def to_parquet(df,
               path,
               engine="auto",
               compression="default",
               write_index=True,
               append=False,
               ignore_divisions=False,
               partition_on=None,
               storage_options=None,
               write_metadata_file=True,
               compute=True,
               **kwargs):
    """Store Dask.dataframe to Parquet files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet library to use. If only one library is installed, it will use
        that one; if both, it will use 'fastparquet'.
    compression : string or dict, optional
        Either a string like ``"snappy"`` or a dictionary mapping column names
        to compressors like ``{"name": "gzip", "values": "snappy"}``. The
        default is ``"default"``, which uses the default compression for
        whichever engine is selected.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    append : bool, optional
        If False (default), construct data-set from scratch. If True, add new
        row-group(s) to an existing data-set. In the latter case, the data-set
        must exist, and the schema must match the input data.
    ignore_divisions : bool, optional
        If False (default) raises error when previous divisions overlap with
        the new appended divisions. Ignored if append=False.
    partition_on : list, optional
        Construct directory-based partitioning by splitting on these fields'
        values. Each dask partition will result in one or more datafiles,
        there will be no global groupby.
    storage_options : dict, optional
        Key/value pairs to be passed on to the file-system backend, if any.
    write_metadata_file : bool, optional
        Whether to write the special "_metadata" file.
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    **kwargs :
        Extra options to be passed on to the specific backend.

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> dd.to_parquet(df, '/path/to/output/',...)  # doctest: +SKIP

    See Also
    --------
    read_parquet: Read parquet data to dask.dataframe
    """
    from dask import delayed

    partition_on = partition_on or []
    if isinstance(partition_on, string_types):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError("Partitioning on non-existent column. "
                         "partition_on=%s ."
                         "columns=%s" %
                         (str(partition_on), str(list(df.columns))))

    if compression != "default":
        kwargs["compression"] = compression
    elif snappy is not None:
        kwargs["compression"] = "snappy"

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called.  The default name is "index",
        # (or "level_0" if "index" is already a column name)
        division_info[
            "name"] = "index" if "index" not in df.columns else "level_0"

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (will be `index` if None,
    # or `level_0` if `index` is already a column name).
    # `fastparquet` will use `index_cols` to specify the index column(s)
    # in the metadata.  `pyarrow` will revert the `reset_index` call
    # below if `index_cols` is populated (because pyarrow will want to handle
    # index preservation itself).  For both engines, the column index
    # will be written to "pandas metadata" if write_index=True
    index_cols = []
    if write_index:
        real_cols = set(df.columns)
        df = df.reset_index()
        index_cols = [c for c in set(df.columns).difference(real_cols)]
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    _to_parquet_kwargs = {
        "engine",
        "compression",
        "write_index",
        "append",
        "ignore_divisions",
        "partition_on",
        "storage_options",
        "write_metadata_file",
        "compute",
    }
    kwargs_pass = {
        k: v
        for k, v in kwargs.items() if k not in _to_parquet_kwargs
    }

    # Engine-specific initialization steps to write the dataset.
    # Possibly create parquet metadata, and load existing stuff if appending
    meta, i_offset = engine.initialize_write(df,
                                             fs,
                                             path,
                                             append=append,
                                             ignore_divisions=ignore_divisions,
                                             partition_on=partition_on,
                                             division_info=division_info,
                                             index_cols=index_cols,
                                             **kwargs_pass)

    # Use i_offset and df.npartitions to define file-name list
    filenames = [
        "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions)
    ]

    # write parts
    dwrite = delayed(engine.write_partition)
    parts = [
        dwrite(d,
               path,
               fs,
               filename,
               partition_on,
               write_metadata_file,
               fmd=meta,
               index_cols=index_cols,
               **kwargs_pass)
        for d, filename in zip(df.to_delayed(), filenames)
    ]

    # single task to complete
    out = delayed(lambda x: None)(parts)
    if write_metadata_file:
        out = delayed(engine.write_metadata)(parts,
                                             meta,
                                             fs,
                                             path,
                                             append=append,
                                             compression=compression)

    if compute:
        out = out.compute()
    return out
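
A minimal sketch of directory partitioning with partition_on (assumes dask with a parquet engine; path and column names are illustrative):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]}), npartitions=1
)
ddf.to_parquet("/tmp/example-partitioned", partition_on=["key"])  # one dir per key value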
Example #28
0
 def _strip_protocol(cls, path):
     path = stringify_path(path)
     if path.startswith("file://"):
         path = path[7:]
     return make_path_posix(path).rstrip("/")
Example #29
0
def read_hdf(
    pattern,
    key,
    start=0,
    stop=None,
    columns=None,
    chunksize=1000000,
    sorted_index=False,
    lock=True,
    mode="r",
):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, pathlib.Path, list
        File pattern (string), pathlib.Path, buffer to read from, or list of
        file paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row), row number to
        stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return
        columns (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'r'. Mode to use when opening file(s).
        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            It is similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
    """
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith("/") else "/" + key
    # Convert path-like objects to a string
    pattern = stringify_path(pattern)

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern

    if not isinstance(pattern, str) and len(paths) == 0:
        raise ValueError("No files provided")
    if not paths or len(paths) == 0:
        raise IOError("File(s) not found: {0}".format(pattern))
    for path in paths:
        try:
            exists = os.path.exists(path)
        except (ValueError, TypeError):
            exists = False
        if not exists:
            raise IOError(
                "File not found or insufficient permissions: {0}".format(path)
            )
    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError(
            "When assuming pre-partitioned data, data must be "
            "read in its entirety using the same chunksizes"
        )

    # Build metadata
    with pd.HDFStore(paths[0], mode=mode) as hdf:
        meta_key = _expand_key(key, hdf)[0]
    meta = pd.read_hdf(paths[0], meta_key, mode=mode, stop=0)
    if columns is not None:
        meta = meta[columns]

    # Common kwargs
    if meta.ndim == 1:
        common_kwargs = {"name": meta.name, "mode": mode}
    else:
        common_kwargs = {"mode": mode}

    # Build parts
    parts, divisions = _build_parts(
        paths, key, start, stop, chunksize, sorted_index, mode
    )

    # Construct Layer and Collection
    label = "read-hdf-"
    name = label + tokenize(paths, key, start, stop, sorted_index, chunksize, mode)
    layer = DataFrameIOLayer(
        name,
        columns,
        parts,
        HDFFunctionWrapper(columns, meta.ndim, lock, common_kwargs),
        label=label,
    )
    graph = HighLevelGraph({name: layer}, {name: set()})
    return new_dd_object(graph, name, meta, divisions)