def ls(self, path: str, detail: bool = False, invalidate_cache: bool = True, **kwargs):
    """
    Create a list of blob names from a blob container

    Parameters
    ----------
    path: Path to an Azure Blob directory
    detail: If False, return a list of blob names, else a list of
        dictionaries with blob details
    invalidate_cache: Boolean
    """
    logging.debug("Running abfs.ls() method")
    path = stringify_path(path)
    blobs = self.blob_fs.list_blobs(container_name=self.container_name, prefix=path)
    if detail is False:
        pathlist = [blob.name for blob in blobs]
        logging.debug(f"Detail is False. Returning {pathlist}")
        return pathlist
    else:
        pathlist = []
        for blob in blobs:
            data = {}
            data["name"] = blob.name
            data["size"] = blob.properties.content_length
            data["container_name"] = self.container_name
            if blob.properties.content_settings.content_type is not None:
                data["type"] = "file"
            else:
                data["type"] = "directory"
            pathlist.append(data)
        logging.debug(f"Detail is True: Returning {pathlist}")
        return pathlist
def _strip_protocol(cls, path: Union[str, List[str]]):
    """Turn path from fully-qualified to file-system-specific

    Parameters
    ----------
    path : string
        Input path, like
        `http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject`
        `oss://mybucket/myobject`

    Examples
    --------
    >>> _strip_protocol("http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject")
    '/mybucket/myobject'
    >>> _strip_protocol("oss://mybucket/myobject")
    '/mybucket/myobject'
    """
    if isinstance(path, list):
        return [cls._strip_protocol(p) for p in path]
    path = stringify_path(path)
    if path.startswith("oss://"):
        # Strip only "oss:/", keeping the leading "/" of the bucket path
        path = path[5:]
    parser_re = r"https?://(?P<endpoint>oss.+aliyuncs\.com)(?P<path>/.+)"
    matcher = re.compile(parser_re).match(path)
    if matcher:
        path = matcher["path"]
    return path or cls.root_marker
def test_stringify_path():
    test_filepath = os.path.join("path", "to", "file.txt")

    # Pathlib.path
    path = pathlib.Path(test_filepath)
    assert stringify_path(path) == test_filepath

    # fspath protocol
    class CustomFSPath:
        """For testing fspath on unknown objects"""

        def __init__(self, path):
            self.path = path

        def __fspath__(self):
            return self.path

    path = CustomFSPath(test_filepath)
    assert stringify_path(path) == test_filepath

    # Non path-like input is unaffected
    path = (1, 2, 3)
    assert stringify_path(path) is path
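# For reference, a minimal sketch of the `stringify_path` helper the test
# above exercises, following the fsspec-style conversion order (plain strings
# pass through, then the `__fspath__` protocol, then a `.path` attribute);
# anything else is returned unchanged, which is why the tuple above passes
# through untouched. Illustrative only, not the canonical implementation.
def stringify_path(filepath):
    """Attempt to convert a path-like object to a string."""
    if isinstance(filepath, str):
        return filepath
    elif hasattr(filepath, "__fspath__"):
        return filepath.__fspath__()
    elif hasattr(filepath, "path"):
        return filepath.path
    return filepath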
def _strip_protocol(cls, path):
    if isinstance(path, list):
        return [cls._strip_protocol(p) for p in path]
    path = stringify_path(path)
    protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
    for protocol in protos:
        if path.startswith(protocol + "://"):
            path = path[len(protocol) + 3:]
        elif path.startswith(protocol + "::"):
            path = path[len(protocol) + 2:]
    # use of root_marker to make minimum required path, e.g., "/"
    return path or cls.root_marker
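# A hypothetical subclass showing how the generic `_strip_protocol` above
# normalizes user input before any I/O. `DemoFileSystem` and its protocol
# name are illustrative assumptions, not part of any real library.
class DemoFileSystem:
    protocol = "demo"
    root_marker = ""

    # Reuse the module-level function above as a classmethod
    _strip_protocol = classmethod(_strip_protocol)

assert DemoFileSystem._strip_protocol("demo://bucket/key") == "bucket/key"
assert DemoFileSystem._strip_protocol(["demo::a", "b"]) == ["a", "b"]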
def __init__(
    self,
    paths,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    **kwargs,
):
    if part_size:
        # If a specific partition size is given, use it directly
        part_size = parse_bytes(part_size)
    else:
        # If a fractional partition size is given, calculate part_size
        part_mem_fraction = part_mem_fraction or 0.125
        assert 0.0 < part_mem_fraction < 1.0
        if part_mem_fraction > 0.25:
            warnings.warn(
                "Using very large partition sizes for Dask. "
                "Memory-related errors are likely."
            )
        part_size = int(device_mem_size(kind="total") * part_mem_fraction)

    # Engine-agnostic path handling
    if hasattr(paths, "name"):
        paths = stringify_path(paths)
    if isinstance(paths, str):
        paths = [paths]

    storage_options = storage_options or {}

    # If engine is not provided, try to infer from end of paths[0]
    if engine is None:
        engine = paths[0].split(".")[-1]
    if isinstance(engine, str):
        if engine == "parquet":
            self.engine = ParquetDatasetEngine(
                paths, part_size, storage_options=storage_options, **kwargs
            )
        elif engine == "csv":
            self.engine = CSVDatasetEngine(
                paths, part_size, storage_options=storage_options, **kwargs
            )
        else:
            raise ValueError("Only parquet and csv supported (for now).")
    else:
        self.engine = engine(paths, part_size, storage_options=storage_options)
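# Hypothetical construction exercising the engine-inference path above,
# assuming this __init__ belongs to a Dataset class (as in NVTabular): with
# engine=None, the extension of paths[0] ("parquet") selects
# ParquetDatasetEngine, and part_size accepts a parse_bytes-style string.
dataset = Dataset(
    ["data/day1.parquet", "data/day2.parquet"],  # illustrative paths
    part_size="128MB",
)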
def _strip_protocol(cls, path): """ Turn path from fully-qualified to file-system-specific May require FS-specific handling, e.g., for relative paths or links. """ path = stringify_path(path) protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol # NOQA for protocol in protos: path = path.rstrip("/") if path.startswith(protocol + "://"): path = path[len(protocol) + 3:] elif path.startswith(protocol + ":"): path = path[len(protocol) + 1:] # use of root_marker to make minimum required path, e.g., "/" if not path.startswith("/"): path = f"/{path}" return path or cls.root_marker
def __init__(
    self,
    path,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    **kwargs,
):
    if part_size:
        # If a specific partition size is given, use it directly
        part_size = parse_bytes(part_size)
    else:
        # If a fractional partition size is given, calculate part_size
        part_mem_fraction = part_mem_fraction or 0.125
        assert 0.0 < part_mem_fraction < 1.0
        if part_mem_fraction > 0.25:
            warnings.warn(
                "Using very large partition sizes for Dask. "
                "Memory-related errors are likely."
            )
        part_size = int(cuda.current_context().get_memory_info()[1] * part_mem_fraction)

    # Engine-agnostic path handling
    if hasattr(path, "name"):
        path = stringify_path(path)
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)
    paths = sorted(paths, key=natural_sort_key)

    # If engine is not provided, try to infer from end of paths[0]
    if engine is None:
        engine = paths[0].split(".")[-1]
    if isinstance(engine, str):
        if engine == "parquet":
            self.engine = ParquetDatasetEngine(paths, part_size, fs, fs_token, **kwargs)
        elif engine == "csv":
            self.engine = CSVDatasetEngine(paths, part_size, fs, fs_token, **kwargs)
        else:
            raise ValueError("Only parquet and csv supported (for now).")
    else:
        self.engine = engine(paths, part_size, fs, fs_token, **kwargs)
def _strip_protocol(cls, path):
    path = stringify_path(path)
    if path.startswith("file://"):
        path = path[7:]
    path = os.path.expanduser(path)
    return make_path_posix(path)
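# Quick sanity checks for the local-filesystem variant above, assuming it is
# the classmethod on an fsspec-style LocalFileSystem (POSIX output shown;
# make_path_posix normalizes separators on Windows):
from fsspec.implementations.local import LocalFileSystem

assert LocalFileSystem._strip_protocol("file:///tmp/data.csv") == "/tmp/data.csv"
assert LocalFileSystem._strip_protocol("~/data.csv").startswith("/")  # expanduser applied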
def read_parquet(
    path,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    storage_options=None,
    engine="auto",
    gather_statistics=None,
    split_row_groups=None,
    read_from_paths=None,
    chunksize=None,
    aggregate_files=None,
    **kwargs,
):
    """
    Read a Parquet file into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition. It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : str or list
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or
        a list of paths, with the caveat that they must all have the same
        protocol.
    columns : str or list, default None
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list
        to read in the data as a Series.
    filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None
        List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``.
        Using this argument will NOT result in row-wise filtering of the final
        partitions unless ``engine="pyarrow-dataset"`` is also specified. For
        other engines, filtering is only performed at the partition level,
        i.e., to prevent the loading of some row-groups and/or files.

        For the "pyarrow" engines, predicates can be expressed in disjunctive
        normal form (DNF). This means that the innermost tuple describes a
        single column predicate. These inner predicates are combined with an
        AND conjunction into a larger predicate. The outer-most list then
        combines all of the combined filters with an OR disjunction.

        Predicates can also be expressed as a List[Tuple]. These are evaluated
        as an AND conjunction. To express OR in predicates, one must use the
        (preferred for "pyarrow") List[List[Tuple]] notation.

        Note that the "fastparquet" engine does not currently support DNF for
        the filtering of partitioned columns (List[Tuple] is required).
    index : str, list or False, default None
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list or dict, default None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : str, default 'auto'
        Parquet reader library to use. Options include: 'auto', 'fastparquet',
        'pyarrow', 'pyarrow-dataset', and 'pyarrow-legacy'. Defaults to
        'auto', which selects the FastParquetEngine if fastparquet is
        installed (and ArrowDatasetEngine otherwise). If 'pyarrow' or
        'pyarrow-dataset' is specified, the ArrowDatasetEngine (which
        leverages the pyarrow.dataset API) will be used. If 'pyarrow-legacy'
        is specified, ArrowLegacyEngine will be used (which leverages the
        pyarrow.parquet.ParquetDataset API).
        NOTE: The 'pyarrow-legacy' option (ArrowLegacyEngine) is deprecated
        for pyarrow>=5.
    gather_statistics : bool, default None
        Gather the statistics for each dataset partition. By default, this
        will only be done if the _metadata file is available. Otherwise,
        statistics will only be gathered if True, because the footer of every
        file will be parsed (which is very slow on some systems).
    split_row_groups : bool or int, default None
        Default is True if a _metadata file is available or if the dataset is
        composed of a single file (otherwise default is False). If True, then
        each output dataframe partition will correspond to a single
        parquet-file row-group. If False, each partition will correspond to a
        complete file. If a positive integer value is given, each dataframe
        partition will correspond to that number of parquet row-groups (or
        fewer). Only the "pyarrow" engine supports this argument.
    read_from_paths : bool, default None
        Only used by ``ArrowDatasetEngine`` when ``filters`` are specified.
        Determines whether the engine should avoid inserting large pyarrow
        (``ParquetFileFragment``) objects in the task graph. If this option
        is True, ``read_partition`` will need to regenerate the appropriate
        fragment object from the path and row-group IDs. This will reduce the
        size of the task graph, but will add minor overhead to
        ``read_partition``. By default (None), ``ArrowDatasetEngine`` will
        set this option to ``False`` when there are filters.
    chunksize : int or str, default None
        The desired size of each output ``DataFrame`` partition in terms of
        total (uncompressed) parquet storage space. If specified, adjacent
        row-groups and/or files will be aggregated into the same output
        partition until the cumulative ``total_byte_size`` parquet-metadata
        statistic reaches this value. Use `aggregate_files` to
        enable/disable inter-file aggregation.
    aggregate_files : bool or str, default None
        Whether distinct file paths may be aggregated into the same output
        partition. This parameter requires `gather_statistics=True`, and is
        only used when `chunksize` is specified or when `split_row_groups` is
        an integer >1. A setting of True means that any two file paths may be
        aggregated into the same output partition, while False means that
        inter-file aggregation is prohibited.

        For "hive-partitioned" datasets, a "partition"-column name can also
        be specified. In this case, we allow the aggregation of any two files
        sharing a file path up to, and including, the corresponding directory
        name. For example, if ``aggregate_files`` is set to ``"section"`` for
        the directory structure below, ``03.parquet`` and ``04.parquet`` may
        be aggregated together, but ``01.parquet`` and ``02.parquet`` cannot
        be. If, however, ``aggregate_files`` is set to ``"region"``,
        ``01.parquet`` may be aggregated with ``02.parquet``, and
        ``03.parquet`` may be aggregated with ``04.parquet``::

            dataset-path/
            ├── region=1/
            │   ├── section=a/
            │   │   └── 01.parquet
            │   ├── section=b/
            │   └── └── 02.parquet
            └── region=2/
                ├── section=a/
                │   ├── 03.parquet
                └── └── 04.parquet

        Note that the default behavior of ``aggregate_files`` is False.
    **kwargs: dict (of dicts)
        Passthrough key-word arguments for read backend.
        The top-level keys correspond to the appropriate operation type, and
        the second level corresponds to the kwargs that will be passed on to
        the underlying ``pyarrow`` or ``fastparquet`` function.
        Supported top-level keys: 'dataset' (for opening a ``pyarrow``
        dataset), 'file' (for opening a ``fastparquet`` ``ParquetFile``),
        'read' (for the backend read function), 'arrow_to_pandas' (for
        controlling the arguments passed to convert from a
        ``pyarrow.Table.to_pandas()``)

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    pyarrow.parquet.ParquetDataset
    """
    if isinstance(columns, str):
        df = read_parquet(
            path,
            columns=[columns],
            filters=filters,
            categories=categories,
            index=index,
            storage_options=storage_options,
            engine=engine,
            gather_statistics=gather_statistics,
            split_row_groups=split_row_groups,
            read_from_paths=read_from_paths,
            chunksize=chunksize,
            aggregate_files=aggregate_files,
        )
        return df[columns]

    if columns is not None:
        columns = list(columns)

    label = "read-parquet-"
    output_name = label + tokenize(
        path,
        columns,
        filters,
        categories,
        index,
        storage_options,
        engine,
        gather_statistics,
        split_row_groups,
        read_from_paths,
        chunksize,
        aggregate_files,
    )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)
    paths = sorted(paths, key=natural_sort_key)  # numeric rather than glob ordering

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    if chunksize or (split_row_groups and int(split_row_groups) > 1 and aggregate_files):
        # Require `gather_statistics=True` if `chunksize` is used,
        # or if `split_row_groups>1` and we are aggregating files.
        if gather_statistics is False:
            raise ValueError("read_parquet options require gather_statistics=True")
        gather_statistics = True

    read_metadata_result = engine.read_metadata(
        fs,
        paths,
        categories=categories,
        index=index,
        gather_statistics=gather_statistics,
        filters=filters,
        split_row_groups=split_row_groups,
        read_from_paths=read_from_paths,
        chunksize=chunksize,
        aggregate_files=aggregate_files,
        **kwargs,
    )

    # In the future, we may want to give the engine the
    # option to return a dedicated element for `common_kwargs`.
    # However, to avoid breaking the API, we just embed this
    # data in the first element of `parts` for now.
    # The logic below is intended to handle backward and forward
    # compatibility with a user-defined engine.
    meta, statistics, parts, index = read_metadata_result[:4]
    common_kwargs = {}
    aggregation_depth = False
    if len(parts):
        # For now, `common_kwargs` and `aggregation_depth`
        # may be stored in the first element of `parts`
        common_kwargs = parts[0].pop("common_kwargs", {})
        aggregation_depth = parts[0].pop("aggregation_depth", aggregation_depth)

    # Parse dataset statistics from metadata (if available)
    parts, divisions, index, index_in_columns = process_statistics(
        parts,
        statistics,
        filters,
        index,
        chunksize,
        split_row_groups,
        fs,
        aggregation_depth,
    )

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    meta, index, columns = set_index_columns(
        meta, index, columns, index_in_columns, auto_index_allowed
    )
    if meta.index.name == NONE_LABEL:
        meta.index.name = None

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)
        if meta.index.name == NONE_LABEL:
            meta.index.name = None

    if len(divisions) < 2:
        # empty dataframe - just use meta
        graph = {(output_name, 0): meta}
        divisions = (None, None)
    else:
        # Create Blockwise layer
        layer = DataFrameIOLayer(
            output_name,
            columns,
            parts,
            ParquetFunctionWrapper(
                engine,
                fs,
                meta,
                columns,
                index,
                kwargs,
                common_kwargs,
            ),
            label=label,
        )
        graph = HighLevelGraph({output_name: layer}, {output_name: set()})

    return new_dd_object(graph, output_name, meta, divisions)
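# A hedged usage sketch of the DNF filter syntax documented above:
# (year == 2020 AND value > 0) OR (year == 2021). Row-wise filtering of the
# final partitions additionally requires engine="pyarrow-dataset"; the
# dataset path and column names are hypothetical.
import dask.dataframe as dd

df = dd.read_parquet(
    "s3://bucket/my-parquet-data",
    engine="pyarrow-dataset",
    filters=[
        [("year", "==", 2020), ("value", ">", 0)],
        [("year", "==", 2021)],
    ],
)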
def to_parquet( df, path, engine="auto", compression="default", write_index=True, append=False, overwrite=False, ignore_divisions=False, partition_on=None, storage_options=None, custom_metadata=None, write_metadata_file=True, compute=True, compute_kwargs=None, schema=None, **kwargs, ): """Store Dask.dataframe to Parquet files Notes ----- Each partition will be written to a separate file. Parameters ---------- df : dask.dataframe.DataFrame path : string or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet library to use. If only one library is installed, it will use that one; if both, it will use 'fastparquet'. compression : string or dict, default 'default' Either a string like ``"snappy"`` or a dictionary mapping column names to compressors like ``{"name": "gzip", "values": "snappy"}``. The default is ``"default"``, which uses the default compression for whichever engine is selected. write_index : boolean, default True Whether or not to write the index. Defaults to True. append : bool, default False If False (default), construct data-set from scratch. If True, add new row-group(s) to an existing data-set. In the latter case, the data-set must exist, and the schema must match the input data. overwrite : bool, default False Whether or not to remove the contents of `path` before writing the dataset. The default is False. If True, the specified path must correspond to a directory (but not the current working directory). This option cannot be set to True if `append=True`. NOTE: `overwrite=True` will remove the original data even if the current write operation fails. Use at your own risk. ignore_divisions : bool, default False If False (default) raises error when previous divisions overlap with the new appended divisions. Ignored if append=False. partition_on : list, default None Construct directory-based partitioning by splitting on these fields' values. Each dask partition will result in one or more datafiles, there will be no global groupby. storage_options : dict, default None Key/value pairs to be passed on to the file-system backend, if any. custom_metadata : dict, default None Custom key/value metadata to include in all footer metadata (and in the global "_metadata" file, if applicable). Note that the custom metadata may not contain the reserved b"pandas" key. write_metadata_file : bool, default True Whether to write the special "_metadata" file. compute : bool, default True If :obj:`True` (default) then the result is computed immediately. If :obj:`False` then a ``dask.dataframe.Scalar`` object is returned for future computation. compute_kwargs : dict, default True Options to be passed in to the compute method schema : Schema object, dict, or {"infer", None}, default None Global schema to use for the output dataset. Alternatively, a `dict` of pyarrow types can be specified (e.g. `schema={"id": pa.string()}`). For this case, fields excluded from the dictionary will be inferred from `_meta_nonempty`. If "infer", the first non-empty and non-null partition will be used to infer the type for "object" columns. If None (default), we let the backend infer the schema for each distinct output partition. If the partitions produce inconsistent schemas, pyarrow will throw an error when writing the shared _metadata file. Note that this argument is ignored by the "fastparquet" engine. **kwargs : Extra options to be passed on to the specific backend. 
Examples -------- >>> df = dd.read_csv(...) # doctest: +SKIP >>> df.to_parquet('/path/to/output/', ...) # doctest: +SKIP See Also -------- read_parquet: Read parquet data to dask.dataframe """ compute_kwargs = compute_kwargs or {} if compression == "default": if snappy is not None: compression = "snappy" else: compression = None partition_on = partition_on or [] if isinstance(partition_on, str): partition_on = [partition_on] if set(partition_on) - set(df.columns): raise ValueError("Partitioning on non-existent column. " "partition_on=%s ." "columns=%s" % (str(partition_on), str(list(df.columns)))) if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) # Trim any protocol information from the path before forwarding path = fs._strip_protocol(path) if overwrite: if isinstance(fs, LocalFileSystem): working_dir = fs.expand_path(".")[0] if path.rstrip("/") == working_dir.rstrip("/"): raise ValueError( "Cannot clear the contents of the current working directory!" ) if append: raise ValueError( "Cannot use both `overwrite=True` and `append=True`!") if fs.isdir(path): # Only remove path contents if # (1) The path exists # (2) The path is a directory # (3) The path is not the current working directory fs.rm(path, recursive=True) # Save divisions and corresponding index name. This is necessary, # because we may be resetting the index to write the file division_info = {"divisions": df.divisions, "name": df.index.name} if division_info["name"] is None: # As of 0.24.2, pandas will rename an index with name=None # when df.reset_index() is called. The default name is "index", # but dask will always change the name to the NONE_LABEL constant if NONE_LABEL not in df.columns: division_info["name"] = NONE_LABEL elif write_index: raise ValueError( "Index must have a name if __null_dask_index__ is a column.") else: warnings.warn( "If read back by Dask, column named __null_dask_index__ " "will be set to the index (and renamed to None).") # There are some "resrved" names that may be used as the default column # name after resetting the index. However, we don't want to treat it as # a "special" name if the string is already used as a "real" column name. reserved_names = [] for name in ["index", "level_0"]: if name not in df.columns: reserved_names.append(name) # If write_index==True (default), reset the index and record the # name of the original index in `index_cols` (we will set the name # to the NONE_LABEL constant if it is originally `None`). # `fastparquet` will use `index_cols` to specify the index column(s) # in the metadata. `pyarrow` will revert the `reset_index` call # below if `index_cols` is populated (because pyarrow will want to handle # index preservation itself). 
For both engines, the column index # will be written to "pandas metadata" if write_index=True index_cols = [] if write_index: real_cols = set(df.columns) none_index = list(df._meta.index.names) == [None] df = df.reset_index() if none_index: df.columns = [ c if c not in reserved_names else NONE_LABEL for c in df.columns ] index_cols = [c for c in set(df.columns) - real_cols] else: # Not writing index - might as well drop it df = df.reset_index(drop=True) _to_parquet_kwargs = { "engine", "compression", "write_index", "append", "ignore_divisions", "partition_on", "storage_options", "write_metadata_file", "compute", } kwargs_pass = { k: v for k, v in kwargs.items() if k not in _to_parquet_kwargs } # Engine-specific initialization steps to write the dataset. # Possibly create parquet metadata, and load existing stuff if appending meta, schema, i_offset = engine.initialize_write( df, fs, path, append=append, ignore_divisions=ignore_divisions, partition_on=partition_on, division_info=division_info, index_cols=index_cols, schema=schema, **kwargs_pass, ) # Use i_offset and df.npartitions to define file-name list filenames = [ "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions) ] # Construct IO graph dsk = {} name = "to-parquet-" + tokenize( df, fs, path, append, ignore_divisions, partition_on, division_info, index_cols, schema, ) part_tasks = [] kwargs_pass["fmd"] = meta kwargs_pass["compression"] = compression kwargs_pass["index_cols"] = index_cols kwargs_pass["schema"] = schema if custom_metadata: if b"pandas" in custom_metadata.keys(): raise ValueError( "User-defined key/value metadata (custom_metadata) can not " "contain a b'pandas' key. This key is reserved by Pandas, " "and overwriting the corresponding value can render the " "entire dataset unreadable.") kwargs_pass["custom_metadata"] = custom_metadata for d, filename in enumerate(filenames): dsk[(name, d)] = ( apply, engine.write_partition, [ (df._name, d), path, fs, filename, partition_on, write_metadata_file, ], toolz.merge(kwargs_pass, {"head": True}) if d == 0 else kwargs_pass, ) part_tasks.append((name, d)) final_name = "metadata-" + name # Collect metadata and write _metadata if write_metadata_file: dsk[(final_name, 0)] = ( apply, engine.write_metadata, [ part_tasks, meta, fs, path, ], { "append": append, "compression": compression }, ) else: dsk[(final_name, 0)] = (lambda x: None, part_tasks) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df]) if compute: if write_metadata_file: return compute_as_if_collection(DataFrame, graph, (final_name, 0), **compute_kwargs) else: return compute_as_if_collection(DataFrame, graph, part_tasks, **compute_kwargs) else: return Scalar(graph, final_name, "")
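# A hedged usage sketch of the write options documented above (the path,
# column name, and metadata are hypothetical; `df` is assumed to be a dask
# DataFrame). Note that custom_metadata must not use the reserved b"pandas"
# key.
import dask.dataframe as dd

dd.to_parquet(
    df,
    "s3://bucket/output",
    engine="pyarrow",
    compression="snappy",
    partition_on=["region"],             # hive-style directory partitioning
    custom_metadata={b"owner": b"etl"},  # arbitrary footer metadata
    write_metadata_file=True,
)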
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs,
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name.
    Please see the Pandas docstring for more detailed information about
    shared keyword arguments.

    This function differs from the Pandas version by saving the many
    partitions of a Dask DataFrame in parallel, either to many files, or to
    many datasets within the same file. You may specify this parallelism with
    an asterisk ``*`` within the filename or datapath, and an optional
    ``name_function``. The asterisk will be replaced with an increasing
    sequence of integers starting from ``0`` or with the result of calling
    ``name_function`` on each of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*``
        to denote many filenames.
    key : string
        Datapath within the files. May contain a ``*`` to denote many
        locations
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return
        a string. (see examples below)
    compute : bool
        Whether or not to execute immediately. If False then this returns a
        ``dask.Delayed`` value.
    lock : Lock, optional
        Lock to use to prevent concurrency issues. By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save Data to a single file

    >>> df.to_hdf('output.hdf', '/data')  # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')  # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')  # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes')  # doctest: +SKIP

    Specify custom naming scheme. This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc..

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return base + timedelta(days=i)

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function)  # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each
        partition is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute
        ``to_hdf`` when computed.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only supports the 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it's saved and read
    # so we enforce name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If user did not specify scheduler and write is sequential default to the
    # sequential scheduler. otherwise let the _get method choose the scheduler
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not multiprocessing.get:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
def __init__(
    self,
    path_or_source,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    dtypes=None,
    client=None,
    cpu=None,
    base_dataset=None,
    **kwargs,
):
    self.dtypes = dtypes
    self.client = client

    # Check if we are keeping data in cpu memory
    self.cpu = cpu or False

    # Keep track of base dataset (optional)
    self.base_dataset = base_dataset or self

    # For now, let's warn the user that "cpu mode" is experimental
    if self.cpu:
        warnings.warn(
            "Initializing an NVTabular Dataset in CPU mode. "
            "This is an experimental feature with extremely limited support!"
        )

    if isinstance(path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
        # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
        # Use DataFrameDatasetEngine
        moved_collection = (
            False  # Whether a pd-backed collection was moved to cudf (or vice versa)
        )
        if self.cpu:
            if isinstance(path_or_source, pd.DataFrame):
                # Convert pandas DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = dask.dataframe.from_pandas(path_or_source, npartitions=1)
            elif isinstance(path_or_source, cudf.DataFrame):
                # Convert cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = dask.dataframe.from_pandas(
                    path_or_source.to_pandas(), npartitions=1
                )
            elif isinstance(path_or_source, dask_cudf.DataFrame):
                # Convert dask_cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                path_or_source = path_or_source.to_dask_dataframe()
                moved_collection = True
        else:
            if isinstance(path_or_source, cudf.DataFrame):
                # Convert cudf DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
            elif isinstance(path_or_source, pd.DataFrame):
                # Convert pandas DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_cudf(
                    cudf.from_pandas(path_or_source), npartitions=1
                )
            elif not isinstance(path_or_source, dask_cudf.DataFrame):
                # Convert dask.dataframe.DataFrame DataFrame to dask_cudf.DataFrame
                path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
                moved_collection = True
        if part_size:
            warnings.warn("part_size is ignored for DataFrame input.")
        if part_mem_fraction:
            warnings.warn("part_mem_fraction is ignored for DataFrame input.")
        self.engine = DataFrameDatasetEngine(
            path_or_source, cpu=self.cpu, moved_collection=moved_collection
        )
    else:
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert 0.0 < part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn(
                    "Using very large partition sizes for Dask. "
                    "Memory-related errors are likely."
                )
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        paths = path_or_source
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]
        paths = sorted(paths, key=natural_sort_key)

        storage_options = storage_options or {}

        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            elif engine == "csv":
                self.engine = CSVDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            elif engine == "avro":
                try:
                    from .avro import AvroDatasetEngine
                except ImportError as e:
                    raise RuntimeError(
                        "Failed to import AvroDatasetEngine. Make sure uavro is installed."
                    ) from e

                self.engine = AvroDatasetEngine(
                    paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                )
            else:
                raise ValueError("Only parquet, csv, and avro supported (for now).")
        else:
            self.engine = engine(
                paths, part_size, cpu=self.cpu, storage_options=storage_options
            )
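# Hypothetical construction exercising the experimental CPU mode above
# (assuming this __init__ belongs to NVTabular's Dataset class; the path is
# illustrative). part_mem_fraction sizes partitions as a fraction of total
# device memory.
dataset = Dataset(
    "s3://bucket/data/*.parquet",
    engine="parquet",
    part_mem_fraction=0.1,
    cpu=True,  # triggers the experimental-support warning
)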
def to_parquet( df, path, engine="auto", compression="default", write_index=True, append=False, ignore_divisions=False, partition_on=None, storage_options=None, write_metadata_file=True, compute=True, compute_kwargs=None, schema=None, **kwargs, ): """Store Dask.dataframe to Parquet files Notes ----- Each partition will be written to a separate file. Parameters ---------- df : dask.dataframe.DataFrame path : string or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet library to use. If only one library is installed, it will use that one; if both, it will use 'fastparquet'. compression : string or dict, optional Either a string like ``"snappy"`` or a dictionary mapping column names to compressors like ``{"name": "gzip", "values": "snappy"}``. The default is ``"default"``, which uses the default compression for whichever engine is selected. write_index : boolean, optional Whether or not to write the index. Defaults to True. append : bool, optional If False (default), construct data-set from scratch. If True, add new row-group(s) to an existing data-set. In the latter case, the data-set must exist, and the schema must match the input data. ignore_divisions : bool, optional If False (default) raises error when previous divisions overlap with the new appended divisions. Ignored if append=False. partition_on : list, optional Construct directory-based partitioning by splitting on these fields' values. Each dask partition will result in one or more datafiles, there will be no global groupby. storage_options : dict, optional Key/value pairs to be passed on to the file-system backend, if any. write_metadata_file : bool, optional Whether to write the special "_metadata" file. compute : bool, optional If True (default) then the result is computed immediately. If False then a ``dask.delayed`` object is returned for future computation. compute_kwargs : dict, optional Options to be passed in to the compute method schema : Schema object, dict, or {"infer", None}, optional Global schema to use for the output dataset. Alternatively, a `dict` of pyarrow types can be specified (e.g. `schema={"id": pa.string()}`). For this case, fields excluded from the dictionary will be inferred from `_meta_nonempty`. If "infer", the first non-empty and non-null partition will be used to infer the type for "object" columns. If None (default), we let the backend infer the schema for each distinct output partition. If the partitions produce inconsistent schemas, pyarrow will throw an error when writing the shared _metadata file. Note that this argument is ignored by the "fastparquet" engine. **kwargs : Extra options to be passed on to the specific backend. Examples -------- >>> df = dd.read_csv(...) # doctest: +SKIP >>> dd.to_parquet(df, '/path/to/output/',...) # doctest: +SKIP See Also -------- read_parquet: Read parquet data to dask.dataframe """ from dask import delayed if compression == "default": if snappy is not None: compression = "snappy" else: compression = None partition_on = partition_on or [] if isinstance(partition_on, str): partition_on = [partition_on] if set(partition_on) - set(df.columns): raise ValueError("Partitioning on non-existent column. " "partition_on=%s ." 
"columns=%s" % (str(partition_on), str(list(df.columns)))) if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) # Trim any protocol information from the path before forwarding path = fs._strip_protocol(path) # Save divisions and corresponding index name. This is necessary, # because we may be resetting the index to write the file division_info = {"divisions": df.divisions, "name": df.index.name} if division_info["name"] is None: # As of 0.24.2, pandas will rename an index with name=None # when df.reset_index() is called. The default name is "index", # but dask will always change the name to the NONE_LABEL constant if NONE_LABEL not in df.columns: division_info["name"] = NONE_LABEL elif write_index: raise ValueError( "Index must have a name if __null_dask_index__ is a column.") else: warnings.warn( "If read back by Dask, column named __null_dask_index__ " "will be set to the index (and renamed to None).") # There are some "resrved" names that may be used as the default column # name after resetting the index. However, we don't want to treat it as # a "special" name if the string is already used as a "real" column name. reserved_names = [] for name in ["index", "level_0"]: if name not in df.columns: reserved_names.append(name) # If write_index==True (default), reset the index and record the # name of the original index in `index_cols` (we will set the name # to the NONE_LABEL constant if it is originally `None`). # `fastparquet` will use `index_cols` to specify the index column(s) # in the metadata. `pyarrow` will revert the `reset_index` call # below if `index_cols` is populated (because pyarrow will want to handle # index preservation itself). For both engines, the column index # will be written to "pandas metadata" if write_index=True index_cols = [] if write_index: real_cols = set(df.columns) none_index = list(df._meta.index.names) == [None] df = df.reset_index() if none_index: df.columns = [ c if c not in reserved_names else NONE_LABEL for c in df.columns ] index_cols = [c for c in set(df.columns).difference(real_cols)] else: # Not writing index - might as well drop it df = df.reset_index(drop=True) _to_parquet_kwargs = { "engine", "compression", "write_index", "append", "ignore_divisions", "partition_on", "storage_options", "write_metadata_file", "compute", } kwargs_pass = { k: v for k, v in kwargs.items() if k not in _to_parquet_kwargs } # Engine-specific initialization steps to write the dataset. 
# Possibly create parquet metadata, and load existing stuff if appending meta, schema, i_offset = engine.initialize_write( df, fs, path, append=append, ignore_divisions=ignore_divisions, partition_on=partition_on, division_info=division_info, index_cols=index_cols, schema=schema, **kwargs_pass, ) # Use i_offset and df.npartitions to define file-name list filenames = [ "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions) ] # write parts dwrite = delayed(engine.write_partition) parts = [ dwrite( d, path, fs, filename, partition_on, write_metadata_file, fmd=meta, compression=compression, index_cols=index_cols, schema=schema, **kwargs_pass, ) for d, filename in zip(df.to_delayed(), filenames) ] # single task to complete out = delayed(lambda x: None)(parts) if write_metadata_file: out = delayed(engine.write_metadata)(parts, meta, fs, path, append=append, compression=compression) if compute: if compute_kwargs is None: compute_kwargs = dict() out = out.compute(**compute_kwargs) return out
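# With compute=False, the delayed-based variant above returns a lazy object
# that can be combined with other work in a single compute() call. A usage
# sketch; `df` is a hypothetical dask DataFrame with a column `x`:
import dask

out = to_parquet(df, "/path/to/output", compute=False)
total = df.x.sum()
dask.compute(out, total)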
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # a list of row groups per source should be passed. make the list of
    # lists that is expected for multiple sources
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = _ensure_filesystem(passed_filesystem=None, path=source)
            source = stringify_path(source)
            source = fs.sep.join([source, "*.parquet"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError("URL content-encoding decompression is not supported")
        if isinstance(tmp_source, list):
            # Extend the output list (not the list being iterated over)
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers, format="parquet", partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.split_by_row_group(filters):
                for rg_info in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Evaluate eagerly; a lazy `filter(...)` here would close
                # over `row_groups[i]` after it has been reassigned
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file] if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
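# Hypothetical call exercising the filter path above: a directory source is
# expanded to "<dir>/*.parquet", and the pyarrow dataset API maps the filter
# to the row-group IDs that survive it (path and column names illustrative):
import cudf

gdf = cudf.read_parquet(
    "dataset_dir",             # assumed to be a directory of parquet files
    columns=["x", "y"],
    filters=[("x", ">", 10)],  # converted to a pyarrow dataset expression
)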
def to_orc(
    df,
    path,
    engine="pyarrow",
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to ORC files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data. Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : 'pyarrow' or ORCEngine
        ORC library to use. Defaults to 'pyarrow'.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> df.to_orc('/path/to/output/', ...)  # doctest: +SKIP

    See Also
    --------
    read_orc: Read ORC data to dask.dataframe
    """

    # Get engine
    engine = _get_engine(engine, write=True)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.orc" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-orc-" + tokenize(
        df,
        fs,
        path,
        engine,
        write_index,
        storage_options,
    )
    final_name = name + "-final"
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
    part_tasks = list(dsk.keys())
    dsk[(final_name, 0)] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections((final_name, 0), dsk, dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return compute_as_if_collection(DataFrame, graph, part_tasks, **compute_kwargs)
    return Scalar(graph, final_name, "")
def __init__(
    self,
    path_or_source,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    dtypes=None,
    **kwargs,
):
    self.dtypes = dtypes
    if isinstance(path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
        # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
        # Use DataFrameDatasetEngine
        if isinstance(path_or_source, cudf.DataFrame):
            path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
        elif isinstance(path_or_source, pd.DataFrame):
            path_or_source = dask_cudf.from_cudf(
                cudf.from_pandas(path_or_source), npartitions=1
            )
        elif not isinstance(path_or_source, dask_cudf.DataFrame):
            path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
        if part_size:
            warnings.warn("part_size is ignored for DataFrame input.")
        if part_mem_fraction:
            warnings.warn("part_mem_fraction is ignored for DataFrame input.")
        self.engine = DataFrameDatasetEngine(path_or_source)
    else:
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert 0.0 < part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn(
                    "Using very large partition sizes for Dask. "
                    "Memory-related errors are likely."
                )
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        paths = path_or_source
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]

        storage_options = storage_options or {}

        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths, part_size, storage_options=storage_options, **kwargs
                )
            elif engine == "csv":
                self.engine = CSVDatasetEngine(
                    paths, part_size, storage_options=storage_options, **kwargs
                )
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths, part_size, storage_options=storage_options)
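# Hypothetical DataFrame input for the constructor above (assuming it
# belongs to a Dataset class, as in NVTabular): part_size and
# part_mem_fraction are ignored, and the frame is wrapped in a
# DataFrameDatasetEngine via dask_cudf.from_cudf.
import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3]})
dataset = Dataset(gdf)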
def _strip_protocol(cls, path):
    path = stringify_path(path)
    if "://" in path:
        _, _, path = path.partition("://")
    return path
def ls(
    self,
    path: str,
    detail: bool = False,
    invalidate_cache: bool = True,
    delimiter: str = "/",
    **kwargs,
):
    """
    Create a list of blob names from a blob container

    Parameters
    ----------
    path: Path to an Azure Blob directory
    detail: If False, return a list of blob names, else a list of
        dictionaries with blob details
    invalidate_cache: Boolean
    """
    logging.debug("Running abfs.ls() method")
    path = stringify_path(path)
    if path.strip() == "":
        homedir_ = self.blob_fs.list_blobs(
            container_name=self.container_name, prefix=path, delimiter=delimiter
        )
        homedir = list(homedir_)[0]
        return self.ls(path=homedir.name, detail=detail, delimiter=delimiter)
    else:
        blobs = self.blob_fs.list_blobs(
            container_name=self.container_name, prefix=path, delimiter=delimiter
        )
        blobs_ = list(blobs)
        if len(blobs_) == 1 and isinstance(blobs_[0], BlobPrefix):
            path = blobs_[0].name
            return self.ls(path, detail=detail, delimiter=delimiter)
        if detail is False:
            pathlist = [blob.name for blob in blobs]
            logging.debug(f"Detail is False. Returning {pathlist}")
            return pathlist
        else:
            pathlist = []
            for blob in blobs:
                logging.debug(f"Parsing {blob}")
                data = {}
                data["name"] = blob.name
                data["container_name"] = self.container_name
                try:
                    data["size"] = blob.properties.content_length
                    if blob.properties.content_settings.content_type is not None:
                        data["type"] = "file"
                    else:
                        logging.debug(f"Assigning {blob} as a directory")
                        data["type"] = "directory"
                except AttributeError:
                    logging.debug(f"Handling AttributeError for {blob.name}")
                    if path == blob.name.rstrip("/"):
                        return self.ls(blob.name, detail=detail, delimiter=None)
                    elif isinstance(blob, BlobPrefix):
                        data["type"] = "directory"
                        data["size"] = 0
                    else:
                        raise AttributeError(
                            f"AzureBlobFileSystem.ls() method unable to assign attributes for {blob}!!"
                        )
                pathlist.append(data)
            logging.debug(f"Detail is True: Returning {pathlist}")
            return pathlist
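# Hypothetical listing calls against this filesystem (the constructor
# arguments are illustrative assumptions); detail=True returns dictionaries
# shaped as built above:
fs = AzureBlobFileSystem(account_name="myaccount", container_name="mycontainer")
names = fs.ls("raw/2021/")                 # e.g. ["raw/2021/a.csv", ...]
details = fs.ls("raw/2021/", detail=True)  # e.g. [{"name": ..., "size": ..., "type": ...}]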
def chmod(self, path, mode):
    path = stringify_path(path)
    return os.chmod(path, mode)
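# Usage sketch (assuming `fs` is an instance of the filesystem class this
# method belongs to): stringify_path lets chmod accept pathlib.Path, or any
# object implementing __fspath__, as well as plain strings.
import pathlib

fs.chmod(pathlib.Path("data") / "file.txt", 0o644)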
def to_orc(
    df,
    path,
    write_index=True,
    storage_options=None,
    compression=None,
    compute=True,
    **kwargs,
):
    """Write a dask_cudf dataframe to ORC file(s) (one file per partition).

    Parameters
    ----------
    df : dask_cudf.DataFrame
    path : string or pathlib.Path
        Destination directory for data. Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    storage_options : None or dict
        Further parameters to pass to the bytes backend.
    compression : string or dict, optional
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    """
    from dask import compute as dask_compute, delayed

    # TODO: Use upstream dask implementation once available
    # (see: Dask Issue#5596)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if write_index:
        df = df.reset_index()
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    fs.mkdirs(path, exist_ok=True)

    # Use i_offset and df.npartitions to define file-name list
    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]

    # write parts
    dwrite = delayed(write_orc_partition)
    parts = [
        dwrite(d, path, fs, filename, compression=compression)
        for d, filename in zip(df.to_delayed(), filenames)
    ]

    if compute:
        return dask_compute(*parts)

    return delayed(list)(parts)
def read_hdf(
    pattern,
    key,
    start=0,
    stop=None,
    columns=None,
    chunksize=1000000,
    sorted_index=False,
    lock=True,
    mode="a",
):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, pathlib.Path, list
        File pattern (string), pathlib.Path, buffer to read from, or list of
        file paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row),
        row number to stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return columns
        (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'a'. Mode to use when opening file(s).
        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            It is similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
    """
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith("/") else "/" + key
    # Convert path-like objects to a string
    pattern = stringify_path(pattern)

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern

    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError(
            "When assuming pre-partitioned data, data must be "
            "read in its entirety using the same chunksizes"
        )

    from ..multi import concat

    return concat(
        [
            _read_single_hdf(
                path,
                key,
                start=start,
                stop=stop,
                columns=columns,
                chunksize=chunksize,
                sorted_index=sorted_index,
                lock=lock,
                mode=mode,
            )
            for path in paths
        ]
    )
def parquet_reader(
    path,
    columns=None,
    row_groups_per_part=None,
    index=None,
    storage_options=None,
    **kwargs,
):
    name = "opt-read-parquet-" + tokenize(
        path, columns, index, storage_options, row_groups_per_part
    )

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)

    if len(paths) > 1 or not fs.isdir(paths[0]):
        raise ValueError("Must pass in a directory path to use `row_groups_per_part`.")

    auto_index_allowed = False
    if index is None:
        # User is allowing auto-detected index
        auto_index_allowed = True
    if index and isinstance(index, str):
        index = [index]

    dd_meta, parts = _read_metadata(fs, path, row_groups_per_part, index=index)
    strings_to_cats = kwargs.get("strings_to_categorical", False)
    meta = cudf.DataFrame(index=dd_meta.index)
    for col in dd_meta.columns:
        if dd_meta[col].dtype == "O":
            meta[col] = as_column(
                dd_meta[col], dtype="int32" if strings_to_cats else "object"
            )
        else:
            meta[col] = as_column(dd_meta[col])

    if meta.index.name is not None:
        index = meta.index.name

    # Account for index and columns arguments.
    # Modify `meta` dataframe accordingly
    index_in_columns = False
    meta, index, columns = set_index_columns(
        meta, index, columns, index_in_columns, auto_index_allowed
    )

    dsk = {}
    for p, part in enumerate(parts):
        read_key = (name, p)
        dsk[read_key] = (
            _read_partition,
            part,
            index,
            columns,
            strings_to_cats,
        )

    # Set the index that was previously treated as a column
    if index_in_columns:
        meta = meta.set_index(index)

    divisions = [None] * (len(parts) + 1)
    return DataFrame(dsk, name, meta, divisions)
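# Hypothetical use of the row-group based reader above: the path must be a
# single directory, and each output partition maps to `row_groups_per_part`
# parquet row-groups (path and column names are illustrative):
ddf = parquet_reader(
    "/path/to/dataset",
    columns=["x", "y"],
    row_groups_per_part=8,
)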
def read_parquet( path, columns=None, filters=None, categories=None, index=None, storage_options=None, engine="auto", gather_statistics=None, split_row_groups=None, read_from_paths=None, chunksize=None, **kwargs, ): """ Read a Parquet file into a Dask DataFrame This reads a directory of Parquet data into a Dask.dataframe, one file per partition. It selects the index among the sorted columns if any exist. Parameters ---------- path : string or list Source directory for data, or path(s) to individual parquet files. Prefix with a protocol like ``s3://`` to read from alternative filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. columns : string, list or None (default) Field name(s) to read in as columns in the output. By default all non-index fields will be read (as determined by the pandas parquet metadata, if present). Provide a single field name instead of a list to read in the data as a Series. filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]] List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. Using this argument will NOT result in row-wise filtering of the final partitions unless ``engine="pyarrow-dataset"`` is also specified. For other engines, filtering is only performed at the partition level, i.e., to prevent the loading of some row-groups and/or files. Predicates can be expressed in disjunctive normal form (DNF). This means that the innermost tuple describes a single column predicate. These inner predicates are combined with an AND conjunction into a larger predicate. The outer-most list then combines all of the combined filters with an OR disjunction. Predicates can also be expressed as a List[Tuple]. These are evaluated as an AND conjunction. To express OR in predicates, one must use the (preferred) List[List[Tuple]] notation. index : string, list, False or None (default) Field name(s) to use as the output frame index. By default will be inferred from the pandas parquet file metadata (if present). Use False to read all fields as columns. categories : list, dict or None For any fields listed here, if the parquet encoding is Dictionary, the column will be created with dtype category. Use only if it is guaranteed that the column is encoded as dictionary in all row-groups. If a list, assumes up to 2**16-1 labels; if a dict, specify the number of labels expected; if None, will load categories automatically for data written by dask/fastparquet, not otherwise. storage_options : dict Key/value pairs to be passed on to the file-system backend, if any. engine : str, default 'auto' Parquet reader library to use. Options include: 'auto', 'fastparquet', 'pyarrow', 'pyarrow-dataset', and 'pyarrow-legacy'. Defaults to 'auto', which selects the FastParquetEngine if fastparquet is installed (and ArrowLegacyEngine otherwise). If 'pyarrow-dataset' is specified, the ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used for newer PyArrow versions (>=1.0.0). If 'pyarrow' or 'pyarrow-legacy' are specified, the ArrowLegacyEngine will be used (which leverages the pyarrow.parquet.ParquetDataset API). NOTE: 'pyarrow-dataset' enables row-wise filtering, but requires pyarrow>=1.0. The behavior of 'pyarrow' will most likely change to ArrowDatasetEngine in a future release, and the 'pyarrow-legacy' option will be deprecated once the ParquetDataset API is deprecated. gather_statistics : bool or None (default).
Gather the statistics for each dataset partition. By default, this will only be done if the _metadata file is available. Otherwise, statistics will only be gathered if True, because the footer of every file will be parsed (which is very slow on some systems). split_row_groups : bool or int Default is True if a _metadata file is available or if the dataset is composed of a single file (otherwise default is False). If True, then each output dataframe partition will correspond to a single parquet-file row-group. If False, each partition will correspond to a complete file. If a positive integer value is given, each dataframe partition will correspond to that number of parquet row-groups (or fewer). Only the "pyarrow" engine supports this argument. read_from_paths : bool or None (default) Only used by ``ArrowDatasetEngine`` when ``filters`` are specified. Determines whether the engine should avoid inserting large pyarrow (``ParquetFileFragment``) objects in the task graph. If this option is True, ``read_partition`` will need to regenerate the appropriate fragment object from the path and row-group IDs. This will reduce the size of the task graph, but will add minor overhead to ``read_partition``. By default (None), ``ArrowDatasetEngine`` will set this option to ``False`` when there are filters. chunksize : int, str The target task partition size. If set, consecutive row-groups from the same file will be aggregated into the same output partition until the aggregate size reaches this value. **kwargs: dict (of dicts) Passthrough key-word arguments for read backend. The top-level keys correspond to the appropriate operation type, and the second level corresponds to the kwargs that will be passed on to the underlying ``pyarrow`` or ``fastparquet`` function. Supported top-level keys: 'dataset' (for opening a ``pyarrow`` dataset), 'file' (for opening a ``fastparquet`` ``ParquetFile``), 'read' (for the backend read function), 'arrow_to_pandas' (for controlling the arguments passed to convert from a ``pyarrow.Table.to_pandas()``) Examples -------- >>> df = dd.read_parquet('s3://bucket/my-parquet-data') # doctest: +SKIP See Also -------- to_parquet """ if isinstance(columns, str): df = read_parquet( path, columns=[columns], filters=filters, categories=categories, index=index, storage_options=storage_options, engine=engine, gather_statistics=gather_statistics, split_row_groups=split_row_groups, read_from_paths=read_from_paths, chunksize=chunksize, ) return df[columns] if columns is not None: columns = list(columns) name = "read-parquet-" + tokenize( path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, read_from_paths, chunksize, ) if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options) paths = sorted(paths, key=natural_sort_key) # numeric rather than glob ordering auto_index_allowed = False if index is None: # User is allowing auto-detected index auto_index_allowed = True if index and isinstance(index, str): index = [index] read_metadata_result = engine.read_metadata( fs, paths, categories=categories, index=index, gather_statistics=True if chunksize else gather_statistics, filters=filters, split_row_groups=split_row_groups, read_from_paths=read_from_paths, **kwargs, ) # In the future, we may want to give the engine the # option to return a dedicated element for `common_kwargs`.
# However, to avoid breaking the API, we just embed this # data in the first element of `parts` for now. # The logic below is intended to handle backward and forward # compatibility with a user-defined engine. meta, statistics, parts, index = read_metadata_result[:4] common_kwargs = {} if len(read_metadata_result) > 4: # Engine may return common_kwargs as a separate element common_kwargs = read_metadata_result[4] elif len(parts): # If the engine does not return a dedicated # common_kwargs argument, it may be stored in # the first element of `parts` common_kwargs = parts[0].pop("common_kwargs", {}) # Parse dataset statistics from metadata (if available) parts, divisions, index, index_in_columns = process_statistics( parts, statistics, filters, index, chunksize) # Account for index and columns arguments. # Modify `meta` dataframe accordingly meta, index, columns = set_index_columns(meta, index, columns, index_in_columns, auto_index_allowed) if meta.index.name == NONE_LABEL: meta.index.name = None subgraph = ParquetSubgraph( name, engine, fs, meta, columns, index, parts, kwargs, common_kwargs=common_kwargs, ) # Set the index that was previously treated as a column if index_in_columns: meta = meta.set_index(index) if meta.index.name == NONE_LABEL: meta.index.name = None if len(divisions) < 2: # empty dataframe - just use meta subgraph = {(name, 0): meta} divisions = (None, None) return new_dd_object(subgraph, name, meta, divisions)
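# Usage sketch (not part of the original sources): a round trip showing that
# read_parquet accepts a pathlib.Path, which stringify_path converts before
# get_fs_token_paths resolves the filesystem. Assumes dask plus pyarrow or
# fastparquet are installed; the directory name is illustrative.
import pathlib

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(100), "y": range(100)})
dd.from_pandas(pdf, npartitions=4).to_parquet("example_parquet")

ddf = dd.read_parquet(pathlib.Path("example_parquet"), columns=["x"])
print(ddf.head())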
def read_parquet( path, columns=None, filters=None, categories=None, index=None, storage_options=None, engine="auto", gather_statistics=None, split_row_groups=None, chunksize=None, **kwargs, ): """ Read a Parquet file into a Dask DataFrame This reads a directory of Parquet data into a Dask.dataframe, one file per partition. It selects the index among the sorted columns if any exist. Parameters ---------- path : string or list Source directory for data, or path(s) to individual parquet files. Prefix with a protocol like ``s3://`` to read from alternative filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. columns : string, list or None (default) Field name(s) to read in as columns in the output. By default all non-index fields will be read (as determined by the pandas parquet metadata, if present). Provide a single field name instead of a list to read in the data as a Series. filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]] List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This implements partition-level (hive) filtering only, i.e., to prevent the loading of some row-groups and/or files. Predicates can be expressed in disjunctive normal form (DNF). This means that the innermost tuple describes a single column predicate. These inner predicates are combined with an AND conjunction into a larger predicate. The outer-most list then combines all of the combined filters with an OR disjunction. Predicates can also be expressed as a List[Tuple]. These are evaluated as an AND conjunction. To express OR in predicates, one must use the (preferred) List[List[Tuple]] notation. index : string, list, False or None (default) Field name(s) to use as the output frame index. By default will be inferred from the pandas parquet file metadata (if present). Use False to read all fields as columns. categories : list, dict or None For any fields listed here, if the parquet encoding is Dictionary, the column will be created with dtype category. Use only if it is guaranteed that the column is encoded as dictionary in all row-groups. If a list, assumes up to 2**16-1 labels; if a dict, specify the number of labels expected; if None, will load categories automatically for data written by dask/fastparquet, not otherwise. storage_options : dict Key/value pairs to be passed on to the file-system backend, if any. engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet reader library to use. If only one library is installed, it will use that one; if both, it will use 'fastparquet' gather_statistics : bool or None (default). Gather the statistics for each dataset partition. By default, this will only be done if the _metadata file is available. Otherwise, statistics will only be gathered if True, because the footer of every file will be parsed (which is very slow on some systems). split_row_groups : bool or int Default is True if a _metadata file is available or if the dataset is composed of a single file (otherwise default is False). If True, then each output dataframe partition will correspond to a single parquet-file row-group. If False, each partition will correspond to a complete file. If a positive integer value is given, each dataframe partition will correspond to that number of parquet row-groups (or fewer). Only the "pyarrow" engine supports this argument. chunksize : int, str The target task partition size.
If set, consecutive row-groups from the same file will be aggregated into the same output partition until the aggregate size reaches this value. **kwargs: dict (of dicts) Passthrough key-word arguments for read backend. The top-level keys correspond to the appropriate operation type, and the second level corresponds to the kwargs that will be passed on to the underlying `pyarrow` or `fastparquet` function. Supported top-level keys: 'dataset' (for opening a `pyarrow` dataset), 'file' (for opening a `fastparquet` `ParquetFile`), 'read' (for the backend read function), 'arrow_to_pandas' (for controlling the arguments passed to convert from a `pyarrow.Table.to_pandas()`) Examples -------- >>> df = dd.read_parquet('s3://bucket/my-parquet-data') # doctest: +SKIP See Also -------- to_parquet """ if isinstance(columns, str): df = read_parquet( path, [columns], filters, categories, index, storage_options, engine, gather_statistics, ) return df[columns] if columns is not None: columns = list(columns) name = "read-parquet-" + tokenize( path, columns, filters, categories, index, storage_options, engine, gather_statistics, ) if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options) paths = sorted(paths, key=natural_sort_key) # numeric rather than glob ordering auto_index_allowed = False if index is None: # User is allowing auto-detected index auto_index_allowed = True if index and isinstance(index, str): index = [index] meta, statistics, parts, index = engine.read_metadata( fs, paths, categories=categories, index=index, gather_statistics=gather_statistics, filters=filters, split_row_groups=split_row_groups, **kwargs, ) # Parse dataset statistics from metadata (if available) parts, divisions, index, index_in_columns = process_statistics( parts, statistics, filters, index, chunksize) # Account for index and columns arguments. # Modify `meta` dataframe accordingly meta, index, columns = set_index_columns(meta, index, columns, index_in_columns, auto_index_allowed) if meta.index.name == NONE_LABEL: meta.index.name = None subgraph = ParquetSubgraph(name, engine, fs, meta, columns, index, parts, kwargs) # Set the index that was previously treated as a column if index_in_columns: meta = meta.set_index(index) if meta.index.name == NONE_LABEL: meta.index.name = None if len(divisions) < 2: # empty dataframe - just use meta subgraph = {(name, 0): meta} divisions = (None, None) return new_dd_object(subgraph, name, meta, divisions)
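# Usage sketch (not part of the original sources): the filters argument in
# disjunctive normal form. The outer list is OR-ed and each inner list is
# AND-ed, so this keeps partitions where (x > 5 AND y == "a") OR (x < 2).
# With this engine the filtering is partition-level only, so non-matching
# rows inside a surviving row-group are still loaded. Path is illustrative.
import dask.dataframe as dd

ddf = dd.read_parquet(
    "example_parquet",
    filters=[
        [("x", ">", 5), ("y", "==", "a")],
        [("x", "<", 2)],
    ],
)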
def read_parquet(path, columns=None, filters=None, categories=None, index=None, storage_options=None, engine="auto", gather_statistics=None, **kwargs): """ Read a Parquet file into a Dask DataFrame This reads a directory of Parquet data into a Dask.dataframe, one file per partition. It selects the index among the sorted columns if any exist. Parameters ---------- path : string or list Source directory for data, or path(s) to individual parquet files. Prefix with a protocol like ``s3://`` to read from alternative filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. columns : string, list or None (default) Field name(s) to read in as columns in the output. By default all non-index fields will be read (as determined by the pandas parquet metadata, if present). Provide a single field name instead of a list to read in the data as a Series. filters : list List of filters to apply, like ``[('x', '>', 0), ...]``. This implements row-group (partition)-level filtering only, i.e., to prevent the loading of some chunks of the data, and only if relevant statistics have been included in the metadata. index : string, list, False or None (default) Field name(s) to use as the output frame index. By default will be inferred from the pandas parquet file metadata (if present). Use False to read all fields as columns. categories : list, dict or None For any fields listed here, if the parquet encoding is Dictionary, the column will be created with dtype category. Use only if it is guaranteed that the column is encoded as dictionary in all row-groups. If a list, assumes up to 2**16-1 labels; if a dict, specify the number of labels expected; if None, will load categories automatically for data written by dask/fastparquet, not otherwise. storage_options : dict Key/value pairs to be passed on to the file-system backend, if any. engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet reader library to use. If only one library is installed, it will use that one; if both, it will use 'fastparquet' gather_statistics : bool or None (default). Gather the statistics for each dataset partition. By default, this will only be done if the _metadata file is available. Otherwise, statistics will only be gathered if True, because the footer of every file will be parsed (which is very slow on some systems). **kwargs: dict (of dicts) Passthrough key-word arguments for read backend. The top-level keys correspond to the appropriate operation type, and the second level corresponds to the kwargs that will be passed on to the underlying `pyarrow` or `fastparquet` function.
Supported top-level keys: 'dataset' (for opening a `pyarrow` dataset), 'file' (for opening a `fastparquet` `ParquetFile`), and 'read' (for the backend read function) Examples -------- >>> df = dd.read_parquet('s3://bucket/my-parquet-data') # doctest: +SKIP See Also -------- to_parquet """ if isinstance(columns, str): df = read_parquet( path, [columns], filters, categories, index, storage_options, engine, gather_statistics, ) return df[columns] if columns is not None: columns = list(columns) name = "read-parquet-" + tokenize( path, columns, filters, categories, index, storage_options, engine, gather_statistics, ) if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options) paths = sorted(paths, key=natural_sort_key) # numeric rather than glob ordering auto_index_allowed = False if index is None: # User is allowing auto-detected index auto_index_allowed = True if index and isinstance(index, str): index = [index] meta, statistics, parts = engine.read_metadata( fs, paths, categories=categories, index=index, gather_statistics=gather_statistics, filters=filters, **kwargs) if meta.index.name is not None: index = meta.index.name ignore_index_column_intersection = False if columns is None: # User didn't specify columns, so ignore any intersection # of auto-detected values with the index (if necessary) ignore_index_column_intersection = True columns = meta.columns if not set(columns).issubset(set(meta.columns)): raise ValueError( "The following columns were not found in the dataset %s\n" "The following columns were found %s" % (set(columns) - set(meta.columns), meta.columns)) # Parse dataset statistics from metadata (if available) index_in_columns = False if statistics: result = list( zip(*[(part, stats) for part, stats in zip(parts, statistics) if stats["num-rows"] > 0])) parts, statistics = result or [[], []] if filters: parts, statistics = apply_filters(parts, statistics, filters) out = sorted_columns(statistics) if index and isinstance(index, str): index = [index] if index and out: # Only one valid column out = [o for o in out if o["name"] in index] if index is not False and len(out) == 1: # Use only sorted column with statistics as the index divisions = out[0]["divisions"] if index is None: index_in_columns = True index = [out[0]["name"]] elif index != [out[0]["name"]]: raise ValueError("Specified index is invalid.\n" "index: {}".format(index)) elif index is not False and len(out) > 1: if any(o["name"] == "index" for o in out): # Use sorted column named "index" as the index [o] = [o for o in out if o["name"] == "index"] divisions = o["divisions"] if index is None: index = [o["name"]] index_in_columns = True elif index != [o["name"]]: raise ValueError("Specified index is invalid.\n" "index: {}".format(index)) else: # Multiple sorted columns found, cannot autodetect the index warnings.warn( "Multiple sorted columns found, cannot autodetect index", RuntimeWarning, ) index = False divisions = [None] * (len(parts) + 1) else: divisions = [None] * (len(parts) + 1) else: divisions = [None] * (len(parts) + 1) if index: if isinstance(index, str): index = [index] if isinstance(columns, str): columns = [columns] if ignore_index_column_intersection: columns = [col for col in columns if col not in index] if set(index).intersection(columns): if auto_index_allowed: raise ValueError( "Specified index and column arguments must not intersect" " (set index=False or remove the 
detected index from columns).\n" "index: {} | column: {}".format(index, columns)) else: raise ValueError( "Specified index and column arguments must not intersect.\n" "index: {} | column: {}".format(index, columns)) # Leaving index as a column in `meta`, because the index # will be reset below (in case the index was detected after # meta was created) if index_in_columns: meta = meta[columns + index] else: meta = meta[columns] else: meta = meta[list(columns)] def _merge_kwargs(x, y): z = x.copy() z.update(y) return z subgraph = {(name, i): ( read_parquet_part, engine.read_partition, fs, meta, part["piece"], columns, index, _merge_kwargs(part["kwargs"], kwargs or {}), ) for i, part in enumerate(parts)} # Set the index that was previously treated as a column if index_in_columns: meta = meta.set_index(index) if len(divisions) < 2: # empty dataframe - just use meta subgraph = {(name, 0): meta} divisions = (None, None) return new_dd_object(subgraph, name, meta, divisions)
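# Usage sketch (not part of the original sources): passing a single field
# name (not a list) routes through the isinstance(columns, str) branch above
# and returns a dask Series; a one-element list returns a one-column
# DataFrame. Path is illustrative.
import dask.dataframe as dd

s = dd.read_parquet("example_parquet", columns="x")      # dask Series
ddf = dd.read_parquet("example_parquet", columns=["x"])  # one-column DataFrame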
def read_orc( filepath_or_buffer, engine="cudf", columns=None, filters=None, stripes=None, skiprows=None, num_rows=None, use_index=True, decimal_cols_as_float=None, timestamp_type=None, use_python_file_object=True, **kwargs, ): """{docstring}""" if decimal_cols_as_float is not None: warnings.warn( "`decimal_cols_as_float` is deprecated and will be removed in " "the future", FutureWarning, ) from cudf import DataFrame # Multiple sources are passed as a list. If a single source is passed, # wrap it in a list for unified processing downstream. if not is_list_like(filepath_or_buffer): filepath_or_buffer = [filepath_or_buffer] # Each source must have a correlating stripe list. If a single stripe list # is provided rather than a list of lists of stripes then extrapolate that # stripe list across all input sources if stripes is not None: if any(not isinstance(stripe, list) for stripe in stripes): stripes = [stripes] # Must ensure a stripe for each source is specified, unless None if not len(stripes) == len(filepath_or_buffer): raise ValueError( "A list of stripes must be provided for each input source" ) filepaths_or_buffers = [] for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( passed_filesystem=None, path=source, **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, use_python_file_object=use_python_file_object, **kwargs, ) if compression is not None: raise ValueError( "URL content-encoding decompression is not supported" ) if isinstance(tmp_source, list): filepaths_or_buffers.extend(tmp_source) else: filepaths_or_buffers.append(tmp_source) if filters is not None: selected_stripes = _filter_stripes( filters, filepaths_or_buffers, stripes, skiprows, num_rows ) # Return empty if everything was filtered if len(selected_stripes) == 0: return _make_empty_df(filepaths_or_buffers[0], columns) else: stripes = selected_stripes if engine == "cudf": return DataFrame._from_data( *liborc.read_orc( filepaths_or_buffers, columns, stripes, skiprows, num_rows, use_index, decimal_cols_as_float, timestamp_type, ) ) else: def read_orc_stripe(orc_file, stripe, columns): pa_table = orc_file.read_stripe(stripe, columns) if isinstance(pa_table, pa.RecordBatch): pa_table = pa.Table.from_batches([pa_table]) return pa_table warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( "Using CPU via PyArrow only supports a single " "input source" ) orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: pa_tables = [ read_orc_stripe(orc_file, i, columns) for i in stripe_source_file ] pa_table = pa.concat_tables(pa_tables) else: pa_table = orc_file.read(columns=columns) df = cudf.DataFrame.from_arrow(pa_table) return df
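# Usage sketch (not part of the original sources): reading ORC with cudf.
# `stripes` selects individual stripes per input file; a plain stripe list is
# broadcast across all sources by the wrapping logic above. Assumes a CUDA
# environment with cudf installed; the file name is illustrative.
import cudf

gdf = cudf.read_orc("example.orc", columns=["x", "y"], stripes=[[0, 1]])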
def to_parquet(df, path, engine="auto", compression="default", write_index=True, append=False, ignore_divisions=False, partition_on=None, storage_options=None, write_metadata_file=True, compute=True, **kwargs): """Store Dask.dataframe to Parquet files Notes ----- Each partition will be written to a separate file. Parameters ---------- df : dask.dataframe.DataFrame path : string or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet library to use. If only one library is installed, it will use that one; if both, it will use 'fastparquet'. compression : string or dict, optional Either a string like ``"snappy"`` or a dictionary mapping column names to compressors like ``{"name": "gzip", "values": "snappy"}``. The default is ``"default"``, which uses the default compression for whichever engine is selected. write_index : boolean, optional Whether or not to write the index. Defaults to True. append : bool, optional If False (default), construct data-set from scratch. If True, add new row-group(s) to an existing data-set. In the latter case, the data-set must exist, and the schema must match the input data. ignore_divisions : bool, optional If False (default) raises error when previous divisions overlap with the new appended divisions. Ignored if append=False. partition_on : list, optional Construct directory-based partitioning by splitting on these fields' values. Each dask partition will result in one or more datafiles; there will be no global groupby. storage_options : dict, optional Key/value pairs to be passed on to the file-system backend, if any. write_metadata_file : bool, optional Whether to write the special "_metadata" file. compute : bool, optional If True (default) then the result is computed immediately. If False then a ``dask.delayed`` object is returned for future computation. **kwargs : Extra options to be passed on to the specific backend. Examples -------- >>> df = dd.read_csv(...) # doctest: +SKIP >>> dd.to_parquet(df, '/path/to/output/',...) # doctest: +SKIP See Also -------- read_parquet: Read parquet data to dask.dataframe """ from dask import delayed partition_on = partition_on or [] if isinstance(partition_on, string_types): partition_on = [partition_on] if set(partition_on) - set(df.columns): raise ValueError("Partitioning on non-existent column. " "partition_on=%s. " "columns=%s" % (str(partition_on), str(list(df.columns)))) if compression != "default": kwargs["compression"] = compression elif snappy is not None: kwargs["compression"] = "snappy" if isinstance(engine, str): engine = get_engine(engine) if hasattr(path, "name"): path = stringify_path(path) fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) # Trim any protocol information from the path before forwarding path = fs._strip_protocol(path) # Save divisions and corresponding index name. This is necessary, # because we may be resetting the index to write the file division_info = {"divisions": df.divisions, "name": df.index.name} if division_info["name"] is None: # As of 0.24.2, pandas will rename an index with name=None # when df.reset_index() is called.
The default name is "index", # (or "level_0" if "index" is already a column name) division_info[ "name"] = "index" if "index" not in df.columns else "level_0" # If write_index==True (default), reset the index and record the # name of the original index in `index_cols` (will be `index` if None, # or `level_0` if `index` is already a column name). # `fastparquet` will use `index_cols` to specify the index column(s) # in the metadata. `pyarrow` will revert the `reset_index` call # below if `index_cols` is populated (because pyarrow will want to handle # index preservation itself). For both engines, the column index # will be written to "pandas metadata" if write_index=True index_cols = [] if write_index: real_cols = set(df.columns) df = df.reset_index() index_cols = [c for c in set(df.columns).difference(real_cols)] else: # Not writing index - might as well drop it df = df.reset_index(drop=True) _to_parquet_kwargs = { "engine", "compression", "write_index", "append", "ignore_divisions", "partition_on", "storage_options", "write_metadata_file", "compute", } kwargs_pass = { k: v for k, v in kwargs.items() if k not in _to_parquet_kwargs } # Engine-specific initialization steps to write the dataset. # Possibly create parquet metadata, and load existing stuff if appending meta, i_offset = engine.initialize_write(df, fs, path, append=append, ignore_divisions=ignore_divisions, partition_on=partition_on, division_info=division_info, index_cols=index_cols, **kwargs_pass) # Use i_offset and df.npartitions to define file-name list filenames = [ "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions) ] # write parts dwrite = delayed(engine.write_partition) parts = [ dwrite(d, path, fs, filename, partition_on, write_metadata_file, fmd=meta, index_cols=index_cols, **kwargs_pass) for d, filename in zip(df.to_delayed(), filenames) ] # single task to complete out = delayed(lambda x: None)(parts) if write_metadata_file: out = delayed(engine.write_metadata)(parts, meta, fs, path, append=append, compression=compression) if compute: out = out.compute() return out
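# Usage sketch (not part of the original sources): directory-based
# partitioning. partition_on splits the output by the values of "year"
# without a global shuffle, so each dask partition may emit one file per
# distinct value it contains. Assumes dask plus a parquet engine are
# installed; the output path is illustrative.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"year": [2019, 2019, 2020, 2020], "v": [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
dd.to_parquet(ddf, "partitioned_output", partition_on=["year"])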
def _strip_protocol(cls, path): path = stringify_path(path) if path.startswith("file://"): path = path[7:] return make_path_posix(path).rstrip("/")
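# Sketch (not part of the original sources): the local-filesystem variant
# above matches fsspec's LocalFileSystem._strip_protocol, which drops a
# leading "file://", normalizes to a POSIX-style path, and strips a trailing
# slash. Assumes fsspec is installed; outputs shown for a POSIX system.
from fsspec.implementations.local import LocalFileSystem

print(LocalFileSystem._strip_protocol("file:///tmp/data/example.csv"))
# -> /tmp/data/example.csv
print(LocalFileSystem._strip_protocol("/tmp/data/"))
# -> /tmp/data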
def read_hdf( pattern, key, start=0, stop=None, columns=None, chunksize=1000000, sorted_index=False, lock=True, mode="r", ): """ Read HDF files into a Dask DataFrame Read hdf files into a dask dataframe. This function is like ``pandas.read_hdf``, except it can read from a single large file, or from multiple files, or from multiple keys from the same file. Parameters ---------- pattern : string, pathlib.Path, list File pattern (string), pathlib.Path, buffer to read from, or list of file paths. Can contain wildcards. key : group identifier in the store. Can contain wildcards start : optional, integer (defaults to 0), row number to start at stop : optional, integer (defaults to None, the last row), row number to stop at columns : list of columns, optional A list of columns that if not None, will limit the return columns (default is None) chunksize : positive integer, optional Maximal number of rows per partition (default is 1000000). sorted_index : boolean, optional Option to specify whether or not the input hdf files have a sorted index (default is False). lock : boolean, optional Option to use a lock to prevent concurrency issues (default is True). mode : {'a', 'r', 'r+'}, default 'r'. Mode to use when opening file(s). 'r' Read-only; no data can be modified. 'a' Append; an existing file is opened for reading and writing, and if the file does not exist it is created. 'r+' It is similar to 'a', but the file must already exist. Returns ------- dask.DataFrame Examples -------- Load single file >>> dd.read_hdf('myfile.1.hdf5', '/x') # doctest: +SKIP Load multiple files >>> dd.read_hdf('myfile.*.hdf5', '/x') # doctest: +SKIP >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x') # doctest: +SKIP Load multiple datasets >>> dd.read_hdf('myfile.1.hdf5', '/*') # doctest: +SKIP """ if lock is True: lock = get_scheduler_lock() key = key if key.startswith("/") else "/" + key # Convert path-like objects to a string pattern = stringify_path(pattern) if isinstance(pattern, str): paths = sorted(glob(pattern)) else: paths = pattern if not isinstance(pattern, str) and len(paths) == 0: raise ValueError("No files provided") if not paths or len(paths) == 0: raise IOError("File(s) not found: {0}".format(pattern)) for path in paths: try: exists = os.path.exists(path) except (ValueError, TypeError): exists = False if not exists: raise IOError( "File not found or insufficient permissions: {0}".format(path) ) if (start != 0 or stop is not None) and len(paths) > 1: raise NotImplementedError(read_hdf_error_msg) if chunksize <= 0: raise ValueError("Chunksize must be a positive integer") if (start != 0 or stop is not None) and sorted_index: raise ValueError( "When assuming pre-partitioned data, data must be " "read in its entirety using the same chunksizes" ) # Build metadata with pd.HDFStore(paths[0], mode=mode) as hdf: meta_key = _expand_key(key, hdf)[0] meta = pd.read_hdf(paths[0], meta_key, mode=mode, stop=0) if columns is not None: meta = meta[columns] # Common kwargs if meta.ndim == 1: common_kwargs = {"name": meta.name, "mode": mode} else: common_kwargs = {"mode": mode} # Build parts parts, divisions = _build_parts( paths, key, start, stop, chunksize, sorted_index, mode ) # Construct Layer and Collection label = "read-hdf-" name = label + tokenize(paths, key, start, stop, sorted_index, chunksize, mode) layer = DataFrameIOLayer( name, columns, parts, HDFFunctionWrapper(columns, meta.ndim, lock, common_kwargs), label=label, ) graph = HighLevelGraph({name: layer}, {name: set()}) return new_dd_object(graph, name, 
meta, divisions)
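# Usage sketch (not part of the original sources): a wildcard in `key`
# matches multiple datasets within one store, and every matched key
# contributes partitions to the output. Assumes dask[dataframe] and PyTables
# are installed; the file and key names are illustrative.
import pandas as pd
import dask.dataframe as dd

pd.DataFrame({"x": range(4)}).to_hdf("multi.hdf5", key="/a", format="table")
pd.DataFrame({"x": range(4)}).to_hdf("multi.hdf5", key="/b", format="table")

ddf = dd.read_hdf("multi.hdf5", key="/*")  # reads both /a and /b
print(len(ddf))  # 8 rows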