def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, partitioning=None, partition_base_dir=None): """ Create a FileSystemDataset from a `_metadata` file created via `pyarrrow.parquet.write_metadata`. Parameters ---------- metadata_path : path, Path pointing to a single file parquet metadata file schema : Schema, optional Optionally provide the Schema for the Dataset, in which case it will not be inferred from the source. filesystem : FileSystem or URI string, default None If a single path is given as source and filesystem is None, then the filesystem will be inferred from the path. If an URI string is passed, then a filesystem object is constructed using the URI's optional path component as a directory prefix. See the examples below. Note that the URIs on Windows must follow 'file:///C:...' or 'file:/C:...' patterns. format : ParquetFileFormat An instance of a ParquetFileFormat if special options needs to be passed. partitioning : Partitioning, PartitioningFactory, str, list of str The partitioning scheme specified with the ``partitioning()`` function. A flavor string can be used as shortcut, and with a list of field names a DirectionaryPartitioning will be inferred. partition_base_dir : str, optional For the purposes of applying the partitioning, paths will be stripped of the partition_base_dir. Files not matching the partition_base_dir prefix will be skipped for partitioning discovery. The ignored files will still be part of the Dataset, but will not have partition information. Returns ------- FileSystemDataset """ from pyarrow.fs import LocalFileSystem, _ensure_filesystem if format is None: format = ParquetFileFormat() elif not isinstance(format, ParquetFileFormat): raise ValueError("format argument must be a ParquetFileFormat") if filesystem is None: filesystem = LocalFileSystem() else: filesystem = _ensure_filesystem(filesystem) metadata_path = filesystem.normalize_path(_stringify_path(metadata_path)) options = ParquetFactoryOptions( partition_base_dir=partition_base_dir, partitioning=_ensure_partitioning(partitioning) ) factory = ParquetDatasetFactory( metadata_path, filesystem, format, options=options) return factory.finish(schema)
def _ensure_format(obj): if isinstance(obj, FileFormat): return obj elif obj == "parquet": if not _parquet_available: raise ValueError(_parquet_msg) return ParquetFileFormat() elif obj in {"ipc", "arrow", "feather"}: return IpcFileFormat() elif obj == "csv": return CsvFileFormat() elif obj == "orc": if not _orc_available: raise ValueError(_orc_msg) return OrcFileFormat() else: raise ValueError("format '{}' is not supported".format(obj))