def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single-item list pointing to a file
        or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has the wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import (
        FileType, FileSelector, _resolve_filesystem_and_path)

    # at this point we already checked that `path` is a path-like
    filesystem, path = _resolve_filesystem_and_path(path, filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    file_info = filesystem.get_file_info(path)

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
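# Hedged usage sketch (not part of the original source): the same
# directory-vs-file dispatch can be reproduced with the public pyarrow.fs
# API. The helper name and the example path below are illustrative
# assumptions, not pyarrow APIs.
from pyarrow import fs as pafs


def example_single_source(path):
    filesystem = pafs.LocalFileSystem()
    info = filesystem.get_file_info(path)
    if info.type == pafs.FileType.Directory:
        # a directory expands into a recursive selector
        return filesystem, pafs.FileSelector(path, recursive=True)
    if info.type == pafs.FileType.File:
        # a single file is returned as a one-element list
        return filesystem, [path]
    raise FileNotFoundError(path)

# e.g. example_single_source("/tmp/dataset") would return
# (LocalFileSystem, FileSelector) when /tmp/dataset is a directory.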
def get_schema(self, uri: str):
    fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
    selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
    first_parquet = None
    for finfo in fs.get_file_info(selector):
        if finfo.path.endswith(".parquet"):
            first_parquet = finfo.path
            break
    if first_parquet is None:
        # fail early with a clear message instead of passing None to
        # open_input_file below
        raise ValueError(f"No parquet files found in dataset {uri}")
    metadata_file = fs.open_input_file(first_parquet)
    metadata = pq.read_metadata(metadata_file)
    # key-value metadata may be absent entirely
    kv_metadata = metadata.metadata or {}
    try:
        return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
    except KeyError as exp:
        raise ValueError(
            f"Parquet dataset {uri} was not created via Spark") from exp
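# Companion sketch (assumption-laden): Spark typically stores its schema JSON
# in the Parquet key-value metadata under
# b"org.apache.spark.sql.parquet.row.metadata", which is presumably what
# SPARK_PARQUET_ROW_METADATA refers to above. The function name and the
# parquet path are hypothetical.
import json
import pyarrow.parquet as pq

SPARK_ROW_METADATA_KEY = b"org.apache.spark.sql.parquet.row.metadata"


def read_spark_row_metadata(parquet_path):
    # key-value metadata is a dict of bytes -> bytes (or None if absent)
    kv = pq.read_metadata(parquet_path).metadata or {}
    raw = kv.get(SPARK_ROW_METADATA_KEY)
    if raw is None:
        raise ValueError(
            "{} does not carry Spark row metadata".format(parquet_path))
    return json.loads(raw)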
def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    from pyarrow.fs import S3FileSystem

    default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or
                      'us-east-1')
    fs = S3FileSystem(anonymous=True)
    assert fs.region == default_region

    fs = S3FileSystem(anonymous=True, region='us-east-2')
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
    with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
        md = f.metadata()
        assert 'Content-Type' in md
        assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
        # For some reason, the header value is quoted
        # (both with AWS and Minio)
        assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128*1024**2
    assert options.default_block_size == 128*1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64*1024
    assert options.buffer_size == 64*1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
def _get_dataset(self, asset):
    # FIXME(roee88): bypass https://issues.apache.org/jira/browse/ARROW-7867
    selector = FileSelector(asset.path, allow_not_found=True, recursive=True)
    try:
        data_files = [
            f.path for f in asset.filesystem.get_file_info(selector) if f.size
        ]
    except NotADirectoryError:
        data_files = None
    if not data_files:
        data_files = [asset.path]  # asset.path is probably a single file

    if asset.format == "csv" or asset.format == "parquet":
        return ds.dataset(data_files, format=asset.format,
                          filesystem=asset.filesystem)
    raise ValueError("unsupported format {}".format(asset.format))
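# Minimal sketch of the ds.dataset() call pattern used above, assuming a
# local filesystem; the function name and any paths passed in are
# illustrative only.
import pyarrow.dataset as ds
import pyarrow.fs as pafs


def load_files(paths, fmt="parquet", filesystem=None):
    # an explicit list of file paths skips directory discovery entirely,
    # which is the workaround applied in _get_dataset above
    return ds.dataset(paths, format=fmt,
                      filesystem=filesystem or pafs.LocalFileSystem())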
def _expand_directory(
    path: str,
    filesystem: "pyarrow.fs.FileSystem",
    exclude_prefixes: Optional[List[str]] = None,
) -> List[str]:
    """
    Expand the provided directory path to a list of file paths.

    Args:
        path: The directory path to expand.
        filesystem: The filesystem implementation that should be used for
            reading these files.
        exclude_prefixes: The file relative path prefixes that should be
            excluded from the returned file set. Default excluded prefixes
            are "." and "_".

    Returns:
        A list of file paths contained in the provided directory.
    """
    if exclude_prefixes is None:
        exclude_prefixes = [".", "_"]

    from pyarrow.fs import FileSelector

    selector = FileSelector(path, recursive=True)
    files = filesystem.get_file_info(selector)
    base_path = selector.base_dir
    filtered_paths = []
    for file_ in files:
        if not file_.is_file:
            continue
        file_path = file_.path
        if not file_path.startswith(base_path):
            continue
        relative = file_path[len(base_path):]
        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
            continue
        filtered_paths.append((file_path, file_))
    # We sort the paths to guarantee a stable order.
    return zip(*sorted(filtered_paths, key=lambda x: x[0]))
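# Illustrative call (the directory name and wrapper are hypothetical): the
# function above returns zip(*...), i.e. an iterator that yields one tuple of
# sorted paths and one tuple of the matching FileInfo objects, so unpack it
# before use.
from pyarrow.fs import LocalFileSystem


def expanded_paths(directory):
    expanded = list(_expand_directory(directory, LocalFileSystem()))
    if not expanded:
        # an empty directory (or everything excluded) yields an empty zip
        return [], []
    paths, file_infos = expanded
    return list(paths), list(file_infos)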
def test_get_file_info_with_selector(fs, pathfn):
    skip_fsspec_s3fs(fs)

    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)

        selector = FileSelector(base_dir, allow_not_found=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        infos = fs.get_file_info(selector)
        assert len(infos) == 3

        for info in infos:
            if info.path.endswith(file_a):
                assert info.type == FileType.File
            elif info.path.endswith(file_b):
                assert info.type == FileType.File
            elif info.path.rstrip("/").endswith(dir_a):
                assert info.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(info.path))
            check_mtime_or_absent(info)
    finally:
        fs.delete_file(file_a)
        fs.delete_file(file_b)
        fs.delete_dir(dir_a)
        fs.delete_dir(base_dir)
def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        # infer from first path
        filesystem = _ensure_fs(filesystem, paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        filesystem = _ensure_fs(filesystem, path)
        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
def list(
    self,
    path: str,
) -> List["pyarrow.fs.FileInfo"]:
    """List blobs and sub-dirs in the given path, if possible.

    Examples:
        # List created blobs and dirs at <storage_prefix>/my_app/path
        >>> client = storage.get_client("my_app")
        >>> client.list("path")
        [<FileInfo for '/tmp/storage/my_app/path/foo.txt' type=FileType.File>,
         <FileInfo for '/tmp/storage/my_app/path/subdir' type=FileType.Directory>]

        # Non-existent path.
        >>> client.list("does_not_exist")
        FileNotFoundError: ...

        # Not a directory.
        >>> client.list("path/foo.txt")
        NotADirectoryError: ...

    Args:
        path: Relative directory to list from.

    Returns:
        List of file-info objects for the directory contents.

    Raises:
        FileNotFoundError if the given path is not found.
        NotADirectoryError if the given path isn't a valid directory.
    """
    from pyarrow.fs import FileSelector

    full_path = self._resolve_path(path)
    selector = FileSelector(full_path, recursive=False)
    files = self.fs.get_file_info(selector)
    return files
def test_get_target_stats_with_selector(fs, pathfn):
    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)

        selector = FileSelector(base_dir, allow_non_existent=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        stats = fs.get_target_stats(selector)
        assert len(stats) == 3

        for st in stats:
            if st.path.endswith(file_a):
                assert st.type == FileType.File
            elif st.path.endswith(file_b):
                assert st.type == FileType.File
            elif st.path.endswith(dir_a):
                assert st.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(st.path))
    finally:
        fs.delete_file(file_a)
        fs.delete_file(file_b)
        fs.delete_dir(dir_a)
        fs.delete_dir(base_dir)
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single-item list pointing to a file
        or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has the wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is a URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither a URI nor a locally existing path,
                # so assume that a local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
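# Hedged illustration of the URI branch above: FileSystem.from_uri() splits a
# URI into a filesystem instance and its path component, which is then
# normalized before discovery. The helper name and the example URI are made
# up for this sketch.
from pyarrow.fs import FileSystem


def split_uri(uri):
    filesystem, path = FileSystem.from_uri(uri)
    return filesystem, filesystem.normalize_path(path)

# e.g. split_uri("s3://some-bucket/some/prefix") is expected to return an
# S3FileSystem together with "some-bucket/some/prefix".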
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host, port, user='libhdfs',
                             replication=replication, buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'libhdfs', replication, buffer_size, default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication, buffer_size, default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(uri.format(
        host, port, 'me', replication + 1, buffer_size, default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='******')
    hdfs8 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host, port, user='******',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host, port, user='******',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8,
               hdfs9, hdfs10, hdfs11]:
        assert pickle.loads(pickle.dumps(fs)) == fs

    host, port, user = hdfs_connection
    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))
    hdfs = HadoopFileSystem.from_uri(
        "hdfs://{}:{}/?user={}".format(host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
def test_get_file_info_with_selector(fs, pathfn):
    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')
    file_c = pathfn('selector-dir/test_dir_a/test_file_c')
    dir_b = pathfn('selector-dir/test_dir_b')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)
        with fs.open_output_stream(file_c):
            pass
        fs.create_dir(dir_b)

        # recursive selector
        selector = FileSelector(base_dir, allow_not_found=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        infos = fs.get_file_info(selector)
        if fs.type_name == "py::fsspec+s3":
            # s3fs only lists directories if they are not empty, but depending
            # on the s3fs/fsspec version combo, it includes the base_dir
            # (https://github.com/dask/s3fs/issues/393)
            assert (len(infos) == 4) or (len(infos) == 5)
        else:
            assert len(infos) == 5

        for info in infos:
            if (info.path.endswith(file_a) or info.path.endswith(file_b) or
                    info.path.endswith(file_c)):
                assert info.type == FileType.File
            elif (info.path.rstrip("/").endswith(dir_a) or
                  info.path.rstrip("/").endswith(dir_b)):
                assert info.type == FileType.Directory
            elif (fs.type_name == "py::fsspec+s3" and
                  info.path.rstrip("/").endswith("selector-dir")):
                # s3fs can include base dir, see above
                assert info.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(info.path))
            check_mtime_or_absent(info)

        # non-recursive selector -> not selecting the nested file_c
        selector = FileSelector(base_dir, recursive=False)

        infos = fs.get_file_info(selector)
        if fs.type_name == "py::fsspec+s3":
            # s3fs only lists directories if they are not empty
            # + for s3fs 0.5.2 all directories are dropped because of buggy
            # side-effect of previous find() call
            # (https://github.com/dask/s3fs/issues/410)
            assert (len(infos) == 3) or (len(infos) == 2)
        else:
            assert len(infos) == 4

    finally:
        fs.delete_dir(base_dir)
def listdir(self, path):
    return [f.path for f in
            self._get_client().get_file_info(FileSelector(path))]