def _ensure_fs(fs_or_uri):
    """
    Coerce a FileSystem instance or a filesystem URI string into a
    ``(FileSystem, is_local)`` pair.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        Either an existing FileSystem instance, or a filesystem URI. If the
        URI carries a path component, that path must point to an existing
        directory and is applied as a SubTreeFileSystem prefix.

    Returns
    -------
    (FileSystem, bool)
        The resolved filesystem and whether it is backed by the local
        filesystem (LocalFileSystem or the mock filesystem used in tests).

    Raises
    ------
    ValueError
        If the URI's path component does not point to a directory.
    TypeError
        If ``fs_or_uri`` is neither a FileSystem nor a valid URI string.
    """
    # BUG FIX: _MockFileSystem was referenced below but never imported,
    # causing a NameError whenever a FileSystem instance was passed in.
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, SubTreeFileSystem, FileType,
        _MockFileSystem, _ensure_filesystem
    )

    if isinstance(fs_or_uri, str):
        # instantiate the file system from an uri, if the uri has a path
        # component then it will be treated as a path prefix
        filesystem, prefix = FileSystem.from_uri(fs_or_uri)
        is_local = isinstance(filesystem, LocalFileSystem)
        prefix = filesystem.normalize_path(prefix)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = filesystem.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    "directory but it has a type: `{}`. The path component "
                    "is `{}` and the given filesystem URI is `{}`".format(
                        prefix_info.type.name, prefix_info.path, fs_or_uri))
            filesystem = SubTreeFileSystem(prefix, filesystem)
        return filesystem, is_local

    try:
        filesystem = _ensure_filesystem(fs_or_uri)
    except TypeError:
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI')

    # the mock filesystem counts as "local" so that path validation behaves
    # the same way in tests as it does for real local paths
    if isinstance(filesystem, (LocalFileSystem, _MockFileSystem)):
        return filesystem, True
    else:
        return filesystem, False
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    # default to reading parquet; any other format makes no sense for a
    # `_metadata` file, so reject it explicitly
    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        # accepts either a FileSystem instance or a filesystem URI string
        filesystem = _ensure_filesystem(filesystem)

    # normalize irregular paths (e.g. Windows backslash paths) up front
    metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))

    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )

    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    # finish() materializes the FileSystemDataset, using the provided schema
    # if given, otherwise the schema recorded in the metadata file
    return factory.finish(schema)
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system

    If the file system is local then also validates that all paths
    are referencing existing *files* otherwise any non-file paths will be
    silently skipped (for example on a remote filesystem).

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available or
        not a file.
    """
    from pyarrow.fs import (LocalFileSystem, SubTreeFileSystem,
                            _MockFileSystem, FileType, _ensure_filesystem)

    if filesystem is None:
        # no filesystem given: assume local files
        filesystem = LocalFileSystem()
    else:
        # accepts a FileSystem instance or a filesystem URI string
        filesystem = _ensure_filesystem(filesystem)

    # a filesystem counts as local if it is (or is a subtree view over) the
    # local filesystem; the mock filesystem used in tests also qualifies
    local_fs = isinstance(filesystem, (LocalFileSystem, _MockFileSystem))
    if not local_fs and isinstance(filesystem, SubTreeFileSystem):
        local_fs = isinstance(filesystem.base_fs, LocalFileSystem)

    # normalize irregular paths such as Windows local paths
    normalized = [filesystem.normalize_path(_stringify_path(p))
                  for p in paths]

    # on local filesystems eagerly verify that every path names an existing
    # regular file; remote filesystems skip this (listing may be expensive)
    if local_fs:
        for info in filesystem.get_file_info(normalized):
            kind = info.type
            if kind == FileType.File:
                continue
            if kind == FileType.NotFound:
                raise FileNotFoundError(info.path)
            if kind == FileType.Directory:
                raise IsADirectoryError(
                    'Path {} points to a directory, but only file paths are '
                    'supported. To construct a nested or union dataset pass '
                    'a list of dataset objects instead.'.format(info.path))
            raise IOError(
                'Path {} exists but its type is unknown (could be a '
                'special file such as a Unix socket or character device, '
                'or Windows NUL / CON / ...)'.format(info.path))

    return filesystem, normalized
def write_dataset(data, base_dir, basename_template=None, format=None,
                  partitioning=None, schema=None, filesystem=None,
                  file_options=None, use_threads=True):
    """
    Write a dataset to a given format and partitioning.

    Parameters
    ----------
    data : Dataset, Table/RecordBatch, or list of Table/RecordBatch
        The data to write. This can be a Dataset instance or in-memory Arrow
        data.
    base_dir : str
        The root directory where to write the dataset.
    basename_template : str, optional
        A template string used to generate basenames of written data files.
        The token '{i}' will be replaced with an automatically incremented
        integer. If not specified, it defaults to
        "part-{i}." + format.default_extname
    format : FileFormat or str
        The format in which to write the dataset. Currently supported:
        "parquet", "ipc"/"feather". If a FileSystemDataset is being written
        and `format` is not specified, it defaults to the same format as the
        specified FileSystemDataset. When writing a Table or RecordBatch, this
        keyword is required.
    partitioning : Partitioning, optional
        The partitioning scheme specified with the ``partitioning()``
        function.
    schema : Schema, optional
    filesystem : FileSystem, optional
    file_options : FileWriteOptions, optional
        FileFormat specific write options, created using the
        ``FileFormat.make_write_options()`` function.
    use_threads : bool, default True
        Write files in parallel. If enabled, then maximum parallelism will be
        used, determined by the number of available CPU cores.
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    # normalize `data` to either a Dataset or a non-empty list of
    # Table/RecordBatch, and resolve the schema to write
    if isinstance(data, Dataset):
        schema = schema or data.schema
    elif isinstance(data, (pa.Table, pa.RecordBatch)):
        schema = schema or data.schema
        data = [data]
    elif isinstance(data, list):
        # ROBUSTNESS: an empty list previously failed with an opaque
        # IndexError on data[0].schema; reject it with a clear message
        if not data:
            raise ValueError(
                "Cannot write an empty list of Table/RecordBatch objects.")
        schema = schema or data[0].schema
    else:
        raise ValueError(
            "Only Dataset, Table/RecordBatch or a list of Table/RecordBatch "
            "objects are supported."
        )

    # a FileSystemDataset carries its own format; otherwise `format` must be
    # provided (as a FileFormat or a shortcut string)
    if format is None and isinstance(data, FileSystemDataset):
        format = data.format
    else:
        format = _ensure_format(format)

    if file_options is None:
        file_options = format.make_write_options()

    if format != file_options.format:
        # BUG FIX: the format arguments were previously passed as
        # (format, file_options), printing the FileFormat where the message
        # promises the FileWriteOptions' format (and the whole options object
        # where it promises the FileFormat); pass the right values in order
        raise TypeError("Supplied FileWriteOptions have format {}, "
                        "which doesn't match supplied FileFormat {}".format(
                            file_options.format, format))

    if basename_template is None:
        basename_template = "part-{i}." + format.default_extname

    partitioning = _ensure_write_partitioning(partitioning)

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()
    else:
        # accepts a FileSystem instance or a filesystem URI string
        filesystem = _ensure_filesystem(filesystem)

    _filesystemdataset_write(
        data, base_dir, basename_template, schema,
        filesystem, partitioning, file_options, use_threads,
    )