def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileSystem, FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        if filesystem is None:
            # infer from first path
            filesystem, _ = FileSystem.from_uri(paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        if filesystem is None:
            filesystem, path = FileSystem.from_uri(path)
        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
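For context, a runnable sketch of the resolution logic this helper wraps, using only public pyarrow.fs APIs (the temporary directory is illustrative, not from the source; newer pyarrow spells `get_target_stats` as `get_file_info`):

import tempfile
from pyarrow.fs import FileSystem, FileSelector

# An empty directory, created only for illustration.
tmpdir = tempfile.mkdtemp()

# Infer the filesystem from a URI, then wrap the directory in a
# recursive selector, as the helper above does for directory inputs.
fs, path = FileSystem.from_uri("file://" + tmpdir)
selector = FileSelector(path, recursive=True)
print(fs.get_file_info(selector))  # [] for an empty directory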
def _get_filesystem_path(path, filesystem=None, storage_options=None):
    """
    Get the filesystem and path for a given filesystem and path.

    If the filesystem is not None then it's just returned as is.
    """
    import pyarrow

    if (isinstance(path, str) and storage_options is None
            and filesystem is None
            and Version(pyarrow.__version__) >= Version("5.0.0")):
        # Use the native pyarrow filesystem if possible.
        try:
            from pyarrow.fs import FileSystem

            filesystem, path = FileSystem.from_uri(path)
        except Exception:
            # fallback to use get_handle / fsspec for filesystems
            # that pyarrow doesn't support
            pass

    if _is_fsspec_url(path) and filesystem is None:
        fsspec = import_optional_dependency(
            "fsspec", extra="fsspec is required for 'storage_options'.")
        filesystem, path = fsspec.core.url_to_fs(
            path, **(storage_options or {}))

    if filesystem is None and storage_options:
        raise ValueError(
            "Cannot provide 'storage_options' with non-fsspec path '{}'"
            .format(path))

    return filesystem, path
def __iter__(self):
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    shuffler = RandomShuffler(
        self.shuffler_capacity if self.shuffle else 1, self.seed)
    group_count = 0
    for filepath in self.files:
        fs, path = FileSystem.from_uri(filepath)
        with fs.open_input_file(path) as fobj:
            parquet = pq.ParquetFile(fobj)
            for group_idx in range(parquet.num_row_groups):
                # A simple form of row-group-level bucketing without
                # memory overhead.
                # Pros:
                #  - Requires zero communication to initialize the
                #    distributed policy.
                #  - Uses little memory and has no startup overhead
                #    (no upfront collecting of row groups).
                # Cons:
                #  - If the world size is much larger than the average
                #    number of row groups, many of the file open
                #    operations are wasted.
                group_count += 1
                if group_count % self.world_size != self.rank:
                    continue
                row_group = parquet.read_row_group(
                    group_idx, columns=self.columns)
                for batch in row_group.to_batches():  # type: RecordBatch
                    # TODO: read batches without going through pandas
                    for _, row in batch.to_pandas().iterrows():
                        shuffler.append(row)
                        # Maintain the shuffler buffer around its capacity.
                        while shuffler.full():
                            yield self._convert(
                                shuffler.pop().to_dict(),
                                self.spark_row_metadata)
    while shuffler:
        yield self._convert(shuffler.pop().to_dict(), self.spark_row_metadata)
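The modulo bucketing in the loop above can be shown standalone; a minimal sketch with made-up `world_size` and row-group counts:

# Illustration of the round-robin sharding above: worker `rank` keeps a
# row group when the running count lands on its slot (counts start at 1,
# matching the `group_count += 1` before the check).
world_size = 4
num_row_groups = 10
for rank in range(world_size):
    owned = [g for g in range(1, num_row_groups + 1)
             if g % world_size == rank]
    print(f"rank {rank} reads row groups {owned}")
# Each group is read by exactly one worker, with no coordination needed.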
def _ensure_fs(fs_or_uri):
    from pyarrow.fs import (FileSystem, LocalFileSystem, SubTreeFileSystem,
                            FileType, _ensure_filesystem, _MockFileSystem)

    if isinstance(fs_or_uri, str):
        # instantiate the file system from an uri, if the uri has a path
        # component then it will be treated as a path prefix
        filesystem, prefix = FileSystem.from_uri(fs_or_uri)
        is_local = isinstance(filesystem, LocalFileSystem)
        prefix = filesystem.normalize_path(prefix)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = filesystem.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    "directory but it has a type: `{}`. The path component "
                    "is `{}` and the given filesystem URI is `{}`".format(
                        prefix_info.type.name, prefix_info.path, fs_or_uri))
            filesystem = SubTreeFileSystem(prefix, filesystem)
        return filesystem, is_local

    try:
        filesystem = _ensure_filesystem(fs_or_uri)
    except TypeError:
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI')

    if isinstance(filesystem, (LocalFileSystem, _MockFileSystem)):
        return filesystem, True
    else:
        return filesystem, False
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    if filesystem is None:
        # First check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # Perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: not a URI, fall through and treat it as a
                # local path to get a nicer error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
def test_open_dataset_from_uri_s3(s3_connection, s3_server):
    # open dataset from non-localfs string path
    import pyarrow as pa
    import pyarrow.dataset as ds
    import pyarrow.parquet as pq
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_connection
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
        .format(access_key, secret_key, host, port)
    )

    fs, path = FileSystem.from_uri(uri)

    fs.create_dir("mybucket")
    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream("mybucket/data.parquet") as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem object
    dataset = ds.dataset(path, format="parquet", filesystem=fs)
    assert dataset.to_table().equals(table)
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import FileSystem

    if filesystem is not None:
        return filesystem, path

    return FileSystem.from_uri(path)
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            return FileSystem.from_uri(path)
        if infos.type == FileType.NotFound:
            return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
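The variants above share a pattern: try the string as an existing local path first, then fall back to URI parsing. A self-contained sketch of that pattern (the `/tmp` paths are illustrative and assume a POSIX system; the real code lives in pyarrow's private helpers):

from pyarrow.fs import FileSystem, FileType, LocalFileSystem

def resolve(path):
    # Sketch only: prefer an existing local path, else treat as a URI.
    local = LocalFileSystem()
    try:
        info = local.get_file_info([path])[0]
    except Exception:
        info = None
    if info is not None and info.type != FileType.NotFound:
        return local, path
    return FileSystem.from_uri(path)

print(resolve("/tmp"))         # existing local path wins
print(resolve("file:///tmp"))  # URI fallback -> (LocalFileSystem, '/tmp')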
def test_s3_real_aws_region_selection():
    # Taken from a registry of open S3-hosted datasets
    # at https://github.com/awslabs/open-data-registry
    fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt')
    assert fs.region == 'eu-west-1'
    with fs.open_input_stream(path) as f:
        assert b"Meteo-France Atmospheric models on AWS" in f.read(50)

    # Passing an explicit region disables auto-selection
    fs, path = FileSystem.from_uri(
        's3://mf-nwp-models/README.txt?region=us-east-2')
    assert fs.region == 'us-east-2'
    # Reading from the wrong region may still work for public buckets...

    # Non-existent bucket (hopefully, otherwise need to fix this test)
    with pytest.raises(IOError, match="Bucket '.*' not found"):
        FileSystem.from_uri('s3://x-arrow-non-existent-bucket')
    fs, path = FileSystem.from_uri(
        's3://x-arrow-non-existent-bucket?region=us-east-3')
    assert fs.region == 'us-east-3'
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import FileSystem, LocalFileSystem

    if filesystem is None:
        try:
            filesystem, _ = FileSystem.from_uri(path)
        except Exception:
            # when path is not found, we fall back to local file system
            filesystem = LocalFileSystem()
    return filesystem
def s3_example_fs(s3_connection, s3_server):
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_connection
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
        .format(access_key, secret_key, host, port)
    )
    fs, path = FileSystem.from_uri(uri)

    fs.create_dir("mybucket")

    yield fs, uri, path
def resolve(self, uri: str) -> Iterable[str]:
    """Resolve dataset via a filesystem URI."""
    uri = normalize_uri(uri)
    parsed = urlparse(uri)
    fs, base_dir = FileSystem.from_uri(uri)
    # base_dir = parsed.netloc + parsed.path
    selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
    scheme = parsed.scheme if parsed.scheme else "file"
    return [
        scheme + "://" + finfo.path
        for finfo in fs.get_file_info(selector)
        if finfo.path.endswith(".parquet")
    ]
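The same list-by-suffix pattern in a self-contained form (the dataset directory name is illustrative; `allow_not_found=True` keeps the sketch runnable even when the directory is missing):

from pyarrow.fs import LocalFileSystem, FileSelector

fs = LocalFileSystem()
selector = FileSelector("/tmp/dataset",  # hypothetical dataset root
                        allow_not_found=True, recursive=True)
parquet_files = [info.path for info in fs.get_file_info(selector)
                 if info.path.endswith(".parquet")]
print(parquet_files)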
def test_filesystem_from_uri_s3(minio_server):
    from pyarrow.fs import S3FileSystem

    address, access_key, secret_key = minio_server

    uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}" \
        .format(access_key, secret_key, urllib.parse.quote(address))

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    fs.create_dir(path)
    [st] = fs.get_target_stats([path])
    assert st.path == path
    assert st.type == FileType.Directory
def test_filesystem_from_uri_s3(s3_connection, s3_server):
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection

    uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}" \
        .format(access_key, secret_key, host, port)

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    fs.create_dir(path)
    [info] = fs.get_file_info([path])
    assert info.path == path
    assert info.type == FileType.Directory
def get_schema(self, uri: str):
    fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
    selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
    first_parquet = None
    for finfo in fs.get_file_info(selector):
        if finfo.path.endswith(".parquet"):
            first_parquet = finfo.path
            break
    if first_parquet is None:
        # no parquet file found under the base directory
        raise ValueError(f"No parquet files found in dataset {uri}")
    metadata_file = fs.open_input_file(first_parquet)
    metadata = pq.read_metadata(metadata_file)
    kv_metadata = metadata.metadata
    try:
        return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
    except KeyError as exp:
        raise ValueError(
            f"Parquet dataset {uri} was not created via Spark") from exp
def __init__(self, filesystem: FileSystem, path: str, mode: str = "rb",
             encoding: Optional[str] = "utf-8"):
    """HDFSFile constructor.

    Args:
        filesystem : FileSystem instance
        path : Path to file
        mode : read or write mode. Supported: "r", "rb" (default),
            "w", "wb".
    """
    self.filesystem = filesystem
    self.path = path
    self.mode = mode
    # Text modes decode via `encoding`; binary modes pass bytes through.
    self.encoding = None if "b" in mode else encoding
    # The underlying handle is always opened in binary mode ("r" -> "rb",
    # "w" -> "wb"); this `open(path, mode)` call matches the legacy
    # pyarrow HDFS client API rather than the newer pyarrow.fs streams.
    self._file = filesystem.open(self.path, mode={
        "r": "rb",
        "w": "wb",
    }.get(mode, mode))
def test_filesystem_from_uri(uri, expected_klass, expected_path):
    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, expected_klass)
    assert path == expected_path
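The excerpt omits this test's parameter list; what the assertions check can be seen directly for the local scheme (the path is illustrative):

from pyarrow.fs import FileSystem, LocalFileSystem

# file:// URIs map to LocalFileSystem plus the plain path component.
fs, path = FileSystem.from_uri("file:///tmp/data")
assert isinstance(fs, LocalFileSystem)
assert path == "/tmp/data"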
def test_cannot_instantiate_base_filesystem():
    with pytest.raises(TypeError):
        FileSystem()
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    import pyarrow as pa
    import pyarrow.dataset as ds
    import pyarrow.parquet as pq
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    fs.create_dir(bucket)

    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem as an uri
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, path in cases:
        uri = template.format(prefix)
        dataset = ds.dataset(path, filesystem=uri, format="parquet")
        assert dataset.to_table().equals(table)

    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        uri = template.format('/')
        ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    path = 'theirbucket/doesnt/exist'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('NotFound', path, uri)

    path = 'theirbucket/nested/folder/data.parquet'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('File', path, uri)
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix
        for the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file
        or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is an URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
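A hedged usage sketch of this helper, assuming it and its private dependencies (`_stringify_path`, `_ensure_fs`) are importable from the surrounding module:

import tempfile

# A single existing file should come back as a one-element path list.
with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
    fs, paths_or_selector = _ensure_single_source(f.name)
    assert paths_or_selector == [fs.normalize_path(f.name)]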
def __init__(self):
    self._client, _ = FileSystem.from_uri(Envs.HDFS_SERVER)
def test_filesystem_from_path_object(path):
    p = pathlib.Path(path)
    fs, path = FileSystem.from_uri(p)
    assert isinstance(fs, LocalFileSystem)
    assert path == p.resolve().absolute().as_posix()