def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileSystem, FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        if filesystem is None:
            # infer from first path
            filesystem, _ = FileSystem.from_uri(paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        if filesystem is None:
            filesystem, path = FileSystem.from_uri(path)
        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
def _resolve_filesystem_and_path(
    path, filesystem=None, allow_legacy_filesystem=False
):
    """
    Return filesystem/path from path which could be a URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem
        )
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        if not allow_legacy_filesystem:
            path = filesystem.normalize_path(path)
        return filesystem, path

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    filesystem = LocalFileSystem()
    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:
        # ValueError means path is likely a URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)

    # if the file or directory doesn't exist locally, then assume that
    # the path is a URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            # neither a URI nor a locally existing path, so assume that a
            # local path was given and propagate a nicer file not found
            # error instead of a more confusing scheme parsing error
            if "empty scheme" not in str(e):
                raise
    else:
        path = filesystem.normalize_path(path)

    return filesystem, path
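# A minimal usage sketch for _resolve_filesystem_and_path above, assuming
# the helper and its module-scope imports (LocalFileSystem, FileSystem,
# FileType from pyarrow.fs) are available in the current namespace. A
# locally existing plain path resolves to a LocalFileSystem directly,
# while a 'file://' URI goes through FileSystem.from_uri.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    local = os.path.join(tmp, 'data.parquet')
    open(local, 'wb').close()

    fs, p = _resolve_filesystem_and_path(local)
    print(type(fs).__name__, p)   # LocalFileSystem, normalized local path

    fs, p = _resolve_filesystem_and_path('file://' + local)
    print(type(fs).__name__, p)   # LocalFileSystem again, scheme stripped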
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    if filesystem is not None:
        filesystem = _ensure_filesystem(filesystem)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(where)
        elif not isinstance(where, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        else:
            path = where
        return filesystem, path

    path = _stringify_path(where)

    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs._connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = path

    return fs, fs_path
def __init__(self, dirpath, open_file_func=None, filesystem=None,
             pathsep='/', partition_scheme='hive', metadata_nthreads=1):
    filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath)
    self.filesystem = filesystem
    self.open_file_func = open_file_func
    self.pathsep = pathsep
    self.dirpath = _stringify_path(dirpath)
    self.partition_scheme = partition_scheme
    self.partitions = ParquetPartitions()
    self.pieces = []
    self._metadata_nthreads = metadata_nthreads
    self._thread_pool = futures.ThreadPoolExecutor(
        max_workers=metadata_nthreads)

    self.common_metadata_path = None
    self.metadata_path = None

    self._visit_level(0, self.dirpath, [])

    # Due to concurrency, pieces will potentially be out of order if the
    # dataset is partitioned, so we sort them to yield stable results
    self.pieces.sort(key=lambda piece: piece.path)

    if self.common_metadata_path is None:
        # _common_metadata is a subset of _metadata
        self.common_metadata_path = self.metadata_path

    self._thread_pool.shutdown()
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs, parsed_uri.path
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it
        will not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then
        the filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See
        the examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning
        discovery. The ignored files will still be part of the Dataset,
        but will not have partition information.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )
    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    return factory.finish(schema)
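# A hedged end-to-end sketch of the `_metadata` round trip this factory
# consumes, following the documented write_metadata pattern from the
# pyarrow docs; the '/tmp/ds' root is a placeholder path.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

table = pa.table({'year': [2020, 2021], 'n': [1, 2]})

# collect per-file metadata while writing, then persist it as _metadata
collector = []
pq.write_to_dataset(table, '/tmp/ds', metadata_collector=collector)
pq.write_metadata(table.schema, '/tmp/ds/_metadata',
                  metadata_collector=collector)

# reopen the dataset through the metadata file instead of re-crawling it
dataset = ds.parquet_dataset('/tmp/ds/_metadata')
print(dataset.to_table().num_rows)  # 2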
def disk_usage(self, path):
    """
    Compute bytes used by all contents under indicated path in file tree.

    Parameters
    ----------
    path : string
        Can be a file path or directory

    Returns
    -------
    usage : int
    """
    path = _stringify_path(path)
    path_info = self.stat(path)
    if path_info['kind'] == 'file':
        return path_info['size']

    total = 0
    for root, directories, files in self.walk(path):
        for child_path in files:
            abspath = self._path_join(root, child_path)
            total += self.stat(abspath)['size']

    return total
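# Illustrative usage only: wrappers exposing disk_usage() let callers size
# a single file or an entire directory tree with one call. The '/data'
# prefix below is a placeholder path, not part of any real deployment.
#
#   fs.disk_usage('/data/table.parquet')   # size of one file, in bytes
#   fs.disk_usage('/data')                 # recursive total for a directory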
def walk(self, path):
    """
    Directory tree generator, like os.walk.

    Generator version of what is in gcsfs, which yields a flattened list
    of files.
    """
    from gcsfs.core import norm_path
    path = norm_path(_stringify_path(path))

    directories = set()
    files = set()

    for obj in self.fs.ls(path, detail=True):
        # each info name must be at least [path]/part, but here
        # we check also for names like [path]/part/
        obj_path = obj['name']
        if obj_path.strip('/') == path:
            continue

        if obj['type'] == 'directory':
            directories.add(obj_path)
        elif obj['type'] == 'file':
            files.add(obj_path)

    rel_files = sorted(
        [posixpath.split(f)[1] for f in files if f not in directories])
    rel_directories = sorted(
        [posixpath.split(x[:-1])[1] for x in directories])

    yield path, rel_directories, rel_files

    for directory in directories:
        for tup in self.walk(directory):
            yield tup
def walk(self, path, refresh=False):
    """
    Directory tree generator, like os.walk.

    Generator version of what is in s3fs, which yields a flattened list
    of files.
    """
    path = _sanitize_s3(_stringify_path(path))
    directories = set()
    files = set()

    for key in list(self.fs._ls(path, refresh=refresh)):
        path = key['Key']
        if key['StorageClass'] == 'DIRECTORY':
            directories.add(path)
        elif key['StorageClass'] == 'BUCKET':
            pass
        else:
            files.add(path)

    # s3fs creates duplicate 'DIRECTORY' entries
    files = sorted([posixpath.split(f)[1] for f in files
                    if f not in directories])
    directories = sorted([posixpath.split(x)[1] for x in directories])

    yield path, directories, files

    for directory in directories:
        for tup in self.walk(directory, refresh=refresh):
            yield tup
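# The generator above mirrors os.walk's (root, directories, files)
# contract, so existing traversal code ports over directly. A sketch with
# placeholder bucket/key names:
#
#   for root, dirs, files in fs.walk('bucket/prefix'):
#       for name in files:
#           print(posixpath.join(root, name))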
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
def __init__(self, dirpath, filesystem=None, pathsep='/',
             partition_scheme='hive', metadata_nthreads=1):
    self.filesystem = filesystem or _get_fs_from_path(dirpath)
    self.pathsep = pathsep
    self.dirpath = _stringify_path(dirpath)
    self.partition_scheme = partition_scheme
    self.partitions = ParquetPartitions()
    self.pieces = []
    self._metadata_nthreads = metadata_nthreads
    self._thread_pool = futures.ThreadPoolExecutor(
        max_workers=metadata_nthreads)

    self.common_metadata_path = None
    self.metadata_path = None

    self._visit_level(0, self.dirpath, [])

    # Due to concurrency, pieces will potentially be out of order if the
    # dataset is partitioned, so we sort them to yield stable results
    self.pieces.sort(key=lambda piece: piece.path)

    if self.common_metadata_path is None:
        # _common_metadata is a subset of _metadata
        self.common_metadata_path = self.metadata_path

    self._thread_pool.shutdown()
def isfile(self, path):
    path = _sanitize_s3(_stringify_path(path))
    try:
        contents = self.fs.ls(path)
        return len(contents) == 1 and contents[0] == path
    except OSError:
        return False
def walk(self, path, refresh=False):
    """
    Directory tree generator, like os.walk.

    Generator version of what is in s3fs, which yields a flattened list
    of files.
    """
    path = _sanitize_s3(_stringify_path(path))
    directories = set()
    files = set()

    for key in list(self.fs._ls(path, refresh=refresh)):
        path = key['Key']
        if key['StorageClass'] == 'DIRECTORY':
            directories.add(path)
        elif key['StorageClass'] == 'BUCKET':
            pass
        else:
            files.add(path)

    # s3fs creates duplicate 'DIRECTORY' entries
    files = sorted([posixpath.split(f)[1] for f in files
                    if f not in directories])
    directories = sorted([posixpath.split(x)[1] for x in directories])

    yield path, directories, files

    for directory in directories:
        for tup in self.walk(directory, refresh=refresh):
            yield tup
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, filesystem=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                filesystem=filesystem,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
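# Standard usage of the public API wrapped above; note that on failure the
# partially written file is removed when 'where' is a path (see the except
# block). The output file name is illustrative.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': [1, 2, 3]})
pq.write_table(table, '/tmp/example.parquet',
               compression='snappy', row_group_size=2)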
def walk(self, path):
    """
    Directory tree generator, like os.walk.

    Generator version of what is in gcsfs, which yields a flattened list
    of files.
    """
    from gcsfs.core import norm_path
    path = norm_path(_stringify_path(path))

    directories = set()
    files = set()

    for key in self.fs.ls(path, detail=True):
        # each info name must be at least [path]/part, but here
        # we check also for names like [path]/part/
        path = key['name']
        if key['storageClass'] == 'DIRECTORY':
            if path.endswith('/'):
                directories.add(path[:-1])
            else:
                directories.add(path)
        elif key['storageClass'] == 'BUCKET':
            pass
        else:
            files.add(path)

    files = sorted([posixpath.split(f)[1] for f in files
                    if f not in directories])
    directories = sorted([posixpath.split(x)[1] for x in directories])

    yield path, directories, files

    for directory in directories:
        for tup in self.walk(directory):
            yield tup
def isfile(self, path):
    from gcsfs.core import norm_path
    path = norm_path(_stringify_path(path))
    try:
        contents = self.fs.ls(path)
        return len(contents) == 1 and contents[0] == path
    except OSError:
        return False
def __init__(self, path, open_file_func=partial(open, mode='rb'),
             row_group=None, partition_keys=None):
    self.path = _stringify_path(path)
    self.open_file_func = open_file_func
    self.row_group = row_group
    self.partition_keys = partition_keys or []
def _parse_uri(path):
    path = _stringify_path(path)
    parsed_uri = urlparse(path)
    if parsed_uri.scheme in _URI_STRIP_SCHEMES:
        return parsed_uri.path
    else:
        # ARROW-4073: On Windows returning the path with the scheme
        # stripped removes the drive letter, if any
        return path
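# Why the ARROW-4073 guard matters: urlparse treats a Windows drive letter
# as a URI scheme, so naively stripping the scheme would drop the drive.
# A quick standard-library check:
from urllib.parse import urlparse

parsed = urlparse('C:/data/file.parquet')
print(parsed.scheme)  # 'c'  -- parsed as a scheme, but it's a drive letter
print(parsed.path)    # '/data/file.parquet' -- the drive letter is lost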
def isdir(self, path):
    path = _sanitize_s3(_stringify_path(path))
    try:
        contents = self.fs.ls(path)
        if len(contents) == 1 and contents[0] == path:
            return False
        else:
            return True
    except OSError:
        return False
def isdir(self, path):
    from gcsfs.core import norm_path
    path = norm_path(_stringify_path(path))
    try:
        contents = self.fs.ls(path)
        if len(contents) == 1 and contents[0] == path:
            return False
        else:
            return True
    except OSError:
        return False
def _ensure_fs_and_paths(path, filesystem=None):
    # Return filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileType, FileSelector

    filesystem, path = _ensure_fs(filesystem, _stringify_path(path))
    infos = filesystem.get_file_info([path])[0]
    if infos.type == FileType.Directory:
        # for directory, pass a selector
        paths_or_selector = FileSelector(path, recursive=True)
    elif infos.type == FileType.File:
        # for a single file path, pass it as a list
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
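# A minimal sketch of the FileSelector contract used above: a recursive
# selector stands in for "every file under this directory" when handed to
# dataset discovery. The '/tmp' base directory is a placeholder.
from pyarrow.fs import FileSelector, LocalFileSystem

fs = LocalFileSystem()
selector = FileSelector('/tmp', recursive=True)
for info in fs.get_file_info(selector):
    print(info.path, info.type)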
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it
        will not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then
        the filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See
        the examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem,
                                    _stringify_path(metadata_path))
    factory = ParquetDatasetFactory(metadata_path, filesystem, format)
    return factory.finish(schema)
def _resolve_filesystem_and_path(path, filesystem=None,
                                 allow_legacy_filesystem=False):
    """
    Return filesystem/path from path which could be a URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'.")
        return filesystem, path

    path = _stringify_path(path)
    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem)
        return filesystem, path
    else:
        return FileSystem.from_uri(path)
def _get_fs_from_path(path):
    """
    Return filesystem from path which could be an HDFS URI.
    """
    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(path)

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        # (return the stringified path so path-like inputs stay consistent)
        fs = LocalFileSystem.get_instance()
        fs_path = path

    return fs, fs_path
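# Resolution examples for the legacy helper above. Hosts and paths are
# illustrative, and pa.hdfs.connect requires a reachable HDFS deployment,
# so this is shown as a comment sketch rather than runnable code:
#
#   resolve_filesystem_and_path('hdfs://namenode:8020/data/x.parquet')
#   # -> (HadoopFileSystem(host='namenode', port=8020), '/data/x.parquet')
#
#   resolve_filesystem_and_path('file:///home/user/x.parquet')
#   # -> (LocalFileSystem, '/home/user/x.parquet')
#
#   resolve_filesystem_and_path('/home/user/x.parquet')
#   # -> (LocalFileSystem, '/home/user/x.parquet')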
def open(self, path, mode='rb'):
    """
    Open file for reading or writing.
    """
    path = _stringify_path(path)
    return self.fs.open(path, mode=mode)
def ls(self, path, detail=False):
    path = _stringify_path(path)
    return self.fs.ls(path, detail=detail)
def walk(self, path):
    """
    Directory tree generator, like os.walk.
    """
    path = _stringify_path(path)
    return self.fs.walk(path)
def isfile(self, path):
    path = _stringify_path(path)
    return os.path.isfile(path)
def mkdir(self, path, create_parents=True):
    path = _stringify_path(path)
    if create_parents:
        return self.fs.mkdirs(path)
    else:
        return self.fs.mkdir(path)
def exists(self, path):
    path = _stringify_path(path)
    return self.fs.exists(path)
def delete(self, path, recursive=False):
    path = _stringify_path(path)
    return self.fs.rm(path, recursive=recursive)
def exists(self, path):
    path = _stringify_path(path)
    return os.path.exists(path)
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single
    file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix
        for the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a
        file or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume
        # that the path is a URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither a URI nor a locally existing path,
                # so assume that a local path was given and propagate a
                # nicer file not found error instead of a more confusing
                # scheme parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
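# Behavior sketch for _ensure_single_source, assuming the helper and its
# module-scope dependencies (e.g. _ensure_fs) are importable: a directory
# yields a recursive FileSelector, a file yields a one-element list, and a
# missing path raises FileNotFoundError.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    fs, sel = _ensure_single_source(tmp)
    print(type(sel).__name__)          # FileSelector

    f = os.path.join(tmp, 'part.parquet')
    open(f, 'wb').close()
    fs, paths = _ensure_single_source(f)
    print(paths)                       # one-element list with the file path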
def mkdir(self, path, create_parents=True):
    path = _stringify_path(path)
    if create_parents:
        os.makedirs(path)
    else:
        os.mkdir(path)
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system.

    If the file system is local then also validates that all paths are
    referencing existing *files*, otherwise any non-file paths will be
    silently skipped (for example on a remote filesystem).

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix
        for the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available
        or not a file.
    """
    from pyarrow.fs import (
        LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileType,
        _ensure_filesystem
    )

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()
    else:
        # construct a filesystem if it is a valid URI
        filesystem = _ensure_filesystem(filesystem)

    is_local = (
        isinstance(filesystem, (LocalFileSystem, _MockFileSystem)) or
        (isinstance(filesystem, SubTreeFileSystem) and
         isinstance(filesystem.base_fs, LocalFileSystem))
    )

    # allow normalizing irregular paths such as Windows local paths
    paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths]

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    if is_local:
        for info in filesystem.get_file_info(paths):
            file_type = info.type
            if file_type == FileType.File:
                continue
            elif file_type == FileType.NotFound:
                raise FileNotFoundError(info.path)
            elif file_type == FileType.Directory:
                raise IsADirectoryError(
                    'Path {} points to a directory, but only file paths '
                    'are supported. To construct a nested or union dataset '
                    'pass a list of dataset objects instead.'.format(
                        info.path))
            else:
                raise IOError(
                    'Path {} exists but its type is unknown (could be a '
                    'special file such as a Unix socket or character '
                    'device, or Windows NUL / CON / ...)'.format(info.path))

    return filesystem, paths
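# Behavior sketch for _ensure_multiple_sources, assuming the helper is in
# scope: on a local filesystem every path must point at an existing file,
# and a directory in the list raises IsADirectoryError.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    a = os.path.join(tmp, 'a.parquet')
    b = os.path.join(tmp, 'b.parquet')
    for f in (a, b):
        open(f, 'wb').close()

    fs, paths = _ensure_multiple_sources([a, b])
    print(paths)                       # normalized string paths

    try:
        _ensure_multiple_sources([tmp])
    except IsADirectoryError as exc:
        print('rejected directory:', exc)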
def isdir(self, path):
    path = _stringify_path(path)
    return os.path.isdir(path)
def ls(self, path):
    path = _stringify_path(path)
    return sorted(pjoin(path, x) for x in os.listdir(path))