def test_open_dataset_from_uri_s3(s3_connection, s3_server):
    """Open a Parquet dataset on S3 from a URI and from a filesystem object.

    A small table is written to ``mybucket/data.parquet`` and then read back
    twice: once by handing the full ``s3://`` URI string to ``ds.dataset``
    and once by passing the parsed path together with the filesystem object.
    ``s3_server`` is requested as a fixture but not referenced in the body.
    """
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    uri_template = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
    )
    uri = uri_template.format(access_key, secret_key, host, port)

    fs, path = FileSystem.from_uri(uri)
    fs.create_dir("mybucket")

    expected = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream("mybucket/data.parquet") as sink:
        pq.write_table(expected, sink)

    # Non-local filesystem addressed purely through a string URI.
    from_uri = ds.dataset(uri, format="parquet")
    assert from_uri.to_table().equals(expected)

    # Same data addressed through an explicit filesystem object.
    from_fs = ds.dataset(path, format="parquet", filesystem=fs)
    assert from_fs.to_table().equals(expected)
def _ensure_directory(uri: str):
    """Create a directory at the remote URI on a best-effort basis.

    Some external filesystems require directories to already exist, or at
    least the ``netloc`` to be created (e.g. PyArrow's ``mock://``
    filesystem).

    Generally this should be done before and outside of Ray applications.
    This utility is thus primarily used in testing, e.g. of ``mock://`` URIs.

    Any exception raised while creating the directory is suppressed; it is
    only recorded at debug level so failures remain diagnosable.

    Args:
        uri: URI whose path component should be created as a directory.
    """
    import logging

    fs, path = get_fs_and_path(uri)
    try:
        fs.create_dir(path)
    except Exception:
        # Deliberately best-effort: never propagate the failure. Only the
        # parsed path is logged, not the URI, since a URI may embed
        # credentials.
        logging.getLogger(__name__).debug(
            "Could not create directory at path %r; ignoring.",
            path,
            exc_info=True,
        )
def create_dir(path, fs_options, fs=None):
    """Create the directory ``path``, including any missing parents.

    ``path``, ``fs_options`` and ``fs`` are handed to ``parse`` to obtain a
    filesystem and a path. When ``parse`` yields no filesystem, a
    ``pa.fs.LocalFileSystem`` is used instead.
    """
    resolved_fs, resolved_path = parse(path, fs_options=fs_options, fs=fs)
    target_fs = (
        pa.fs.LocalFileSystem() if resolved_fs is None else resolved_fs
    )
    target_fs.create_dir(resolved_path, recursive=True)
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    """Open an S3 Parquet dataset when ``filesystem`` is given as a URI.

    A table is written to ``theirbucket/nested/folder/data.parquet``. The
    test then checks that the file can be read through the full URI, through
    several splits of the location between a filesystem URI prefix and a
    relative path, and that invalid filesystem URIs raise the expected
    errors. ``s3_server`` is requested as a fixture but not referenced in
    the body.
    """
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    fs.create_dir(bucket)

    expected = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as sink:
        pq.write_table(expected, sink)

    # Whole location supplied as a single string URI.
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(expected)

    # Filesystem supplied as a URI; the doubled braces leave one placeholder
    # for the bucket/prefix part after the credentials are filled in.
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )

    # (filesystem URI prefix, path passed to ds.dataset) pairs.
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for fs_prefix, relative_path in cases:
        fs_uri = template.format(fs_prefix)
        dataset = ds.dataset(relative_path, filesystem=fs_uri, format="parquet")
        assert dataset.to_table().equals(expected)

    # A filesystem URI without a bucket is rejected.
    bucketless_uri = template.format('/')
    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        ds.dataset(
            '/theirbucket/nested/folder/data.parquet',
            filesystem=bucketless_uri,
        )

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    # The path component of the filesystem URI does not exist.
    missing_path = 'theirbucket/doesnt/exist'
    missing_uri = template.format(missing_path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=missing_uri)
    assert str(exc.value) == error.format(
        'NotFound', missing_path, missing_uri
    )

    # The path component of the filesystem URI is a file, not a directory.
    file_path = 'theirbucket/nested/folder/data.parquet'
    file_uri = template.format(file_path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=file_uri)
    assert str(exc.value) == error.format('File', file_path, file_uri)