def _ensure_filesystem(fs_or_uri):
    """
    Resolve a filesystem instance or a filesystem URI into a filesystem.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        A filesystem instance is returned unchanged. A string is parsed as
        an URI; if the URI carries a path component, that path has to be a
        directory and the resulting filesystem is wrapped in a
        SubTreeFileSystem rooted at it.

    Returns
    -------
    (FileSystem, bool)
        The filesystem and a flag telling whether it is treated as local.

    Raises
    ------
    ValueError
        If the path component of the URI does not point to a directory.
    TypeError
        If the argument is neither a string nor a FileSystem instance.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, SubTreeFileSystem, FileType
    )

    if not isinstance(fs_or_uri, str):
        # already a filesystem object: only its locality has to be decided
        if isinstance(fs_or_uri, (LocalFileSystem, _MockFileSystem)):
            return fs_or_uri, True
        if isinstance(fs_or_uri, FileSystem):
            return fs_or_uri, False
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI'
        )

    # build the filesystem from the URI; a path component, if present, is
    # used as a path prefix
    base_fs, root = FileSystem.from_uri(fs_or_uri)
    # locality is determined on the base filesystem, before any wrapping
    local = isinstance(base_fs, LocalFileSystem)
    root = _normalize_path(base_fs, root)
    if not root:
        return base_fs, local

    # the prefix is only usable as a subtree root when it is a directory
    root_info = base_fs.get_file_info([root])[0]
    if root_info.type != FileType.Directory:
        raise ValueError(
            "The path component of the filesystem URI must point to a "
            "directory but it has a type: `{}`. The path component "
            "is `{}` and the given filesystem URI is `{}`".format(
                root_info.type.name, root_info.path, fs_or_uri
            )
        )
    return SubTreeFileSystem(root, base_fs), local
def _ensure_fs(filesystem, path):
    """
    Validate the given filesystem or infer one from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        When None, the path is first looked up on the local filesystem and,
        if it is not found there, parsed as an URI.
    path : str

    Returns
    -------
    (FileSystem, str)
        Either the result of ``FileSystem.from_uri(path)`` or the filesystem
        together with the normalized path.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # start by probing the path as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            local_info = filesystem.get_file_info([path])[0]
        except OSError:
            found_locally = False
        else:
            found_locally = local_info.type != FileType.NotFound

        if not found_locally:
            # nothing there locally, so the path may be an URI instead
            try:
                return FileSystem.from_uri(path)
            except ValueError as exc:
                if "empty scheme" not in str(exc):
                    raise
                # ARROW-8213: not an URI either, keep going with the local
                # filesystem so that a nicer error message is produced.

    # make sure the path is well-formed (eg no backslashes on Windows)
    return filesystem, _normalize_path(filesystem, path)
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, the local filesystem is used. If an URI string is passed,
        then a filesystem object is constructed using the URI's optional path
        component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options needs to be
        passed.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If `format` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    # reject a wrong format type up front, then fall back to the default
    if format is not None and not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")
    file_format = ParquetFileFormat() if format is None else format

    if filesystem is None:
        resolved_fs = LocalFileSystem()
    else:
        resolved_fs, _ = _ensure_filesystem(filesystem)

    normalized_path = _normalize_path(
        resolved_fs, _stringify_path(metadata_path))
    return ParquetDatasetFactory(
        normalized_path, resolved_fs, file_format).finish(schema)
def _ensure_fs(filesystem, path):
    """
    Validate the given filesystem or infer one from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        When None, the path is first looked up on the local filesystem and,
        if it is not found there, parsed as an URI.
    path : str

    Returns
    -------
    (FileSystem, str)
        Either the result of ``FileSystem.from_uri(path)`` or the filesystem
        together with the normalized path.

    Raises
    ------
    ValueError
        If the path does not exist locally and parsing it as an URI fails
        for a reason other than a missing scheme.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # not found locally, so the path may be an URI
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: a plain path without a scheme is not an URI;
                # keep the local filesystem so that the caller reports a
                # missing file rather than a confusing URI parsing error
                if "empty scheme" not in str(e):
                    raise

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
def _ensure_single_source(path, filesystem=None):
    """
    Interpret path as a single file or as a directory to traverse
    recursively.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths. If omitted, the path is looked up on the local
        filesystem first and parsed as an URI when it is not found there.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file
        or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # file info gathered while probing; None means "query it again later"
    info = None
    if filesystem is None:
        # no filesystem given: probe the path locally before anything else
        filesystem = LocalFileSystem()
        try:
            info = filesystem.get_file_info([path])[0]
        except OSError:
            found_locally = False
        else:
            found_locally = info.type != FileType.NotFound

        if not found_locally:
            # the path is not available locally, so it may be an URI that
            # describes the filesystem as well
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as exc:
                # ARROW-8213: neither an URI nor an existing local path;
                # treat it as a local path so that a file not found error is
                # reported instead of a scheme parsing error
                if "empty scheme" not in str(exc):
                    raise
            else:
                # the info belongs to the local filesystem, discard it so it
                # is fetched again from the new filesystem
                info = None

    # turn an URI string into a filesystem object if needed
    filesystem, _ = _ensure_filesystem(filesystem)

    # dataset discovery expects a normalized path
    path = _normalize_path(filesystem, path)

    if info is None:
        info = filesystem.get_file_info([path])[0]

    # directories are scanned recursively, files are returned as a single
    # item list, anything else counts as missing
    kind = info.type
    if kind == FileType.Directory:
        return filesystem, FileSelector(path, recursive=True)
    if kind == FileType.File:
        return filesystem, [path]
    raise FileNotFoundError(path)
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Interpret a list of paths as files that live on one file system.

    For a local file system every path is checked to reference an existing
    *file*. For any other file system no such check is made here.

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths. Defaults to the local file system.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available
        or not a file.
    """
    from pyarrow.fs import LocalFileSystem, FileType

    if filesystem is None:
        # the local file system is the default
        filesystem = LocalFileSystem()

    # turn an URI string into a filesystem object if needed
    filesystem, is_local = _ensure_filesystem(filesystem)

    # normalization takes care of irregular paths such as Windows local paths
    paths = [
        _normalize_path(filesystem, _stringify_path(item)) for item in paths
    ]

    if not is_local:
        return filesystem, paths

    # every path has to reference an existing *file*; the first offending
    # path raises (grouping the errors per category would be an improvement)
    for entry in filesystem.get_file_info(paths):
        kind = entry.type
        if kind == FileType.File:
            continue
        if kind == FileType.NotFound:
            raise FileNotFoundError(entry.path)
        if kind == FileType.Directory:
            raise IsADirectoryError(
                'Path {} points to a directory, but only file paths are '
                'supported. To construct a nested or union dataset pass '
                'a list of dataset objects instead.'.format(entry.path)
            )
        raise IOError(
            'Path {} exists but its type is unknown (could be a '
            'special file such as a Unix socket or character device, '
            'or Windows NUL / CON / ...)'.format(entry.path)
        )

    return filesystem, paths
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, the local filesystem is used. If an URI string is passed,
        then a filesystem object is constructed using the URI's optional path
        component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options needs to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If `format` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        # `_ensure_fs` takes (filesystem, path) and infers a filesystem from
        # a path; resolving a filesystem object or URI string on its own is
        # the job of `_ensure_filesystem`
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )

    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    return factory.finish(schema)