import argparse
import os
from urllib.parse import urlparse

import pyarrow.parquet as pq
from pyarrow.fs import LocalFileSystem, S3FileSystem


def main():
    parser = argparse.ArgumentParser(description="Generate sample parquet data")
    parser.add_argument(
        'path',
        type=str,
        nargs='?',
        help='path to save data to',
        default="./data/data.parquet",
    )
    parser.add_argument(
        '--source',
        type=str,
        help='local path to import data from (optional; can be csv, json or parquet)',
    )
    parser.add_argument(
        '--endpoint',
        type=str,
        help='S3 endpoint (e.g.: https://s3.eu-de.cloud-object-storage.appdomain.cloud)',
    )
    parser.add_argument('--access_key', type=str, help='S3 access key')
    parser.add_argument('--secret_key', type=str, help='S3 secret key')
    args = parser.parse_args()

    if args.endpoint:
        print("Using S3 file system")
        parsed_endpoint = urlparse(args.endpoint)
        fs = S3FileSystem(
            endpoint_override=parsed_endpoint.netloc,
            scheme=parsed_endpoint.scheme,
            access_key=args.access_key,
            secret_key=args.secret_key,
            background_writes=False,
        )
    else:
        print("Using local file system")
        os.makedirs(os.path.dirname(args.path), exist_ok=True)
        fs = LocalFileSystem()

    # import_table is expected to be defined elsewhere in this script; it
    # loads the --source file (csv, json or parquet) into a pyarrow Table.
    table = import_table(args.source)
    with fs.open_output_stream(args.path) as f:
        pq.write_table(table, f)
    print("Table written to", args.path)
    print(table.to_pandas())
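# A minimal usage sketch, assuming the usual entry-point guard and a script
# named generate_data.py (the filename is hypothetical; the flags come from
# the parser above, and the endpoint/credentials are placeholders):
if __name__ == '__main__':
    main()

# Local write:
#   python generate_data.py ./data/data.parquet
#
# Write to S3-compatible object storage (path is "bucket/key"):
#   python generate_data.py my-bucket/data.parquet \
#       --endpoint https://s3.eu-de.cloud-object-storage.appdomain.cloud \
#       --access_key <ACCESS_KEY> --secret_key <SECRET_KEY>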
import os
from pathlib import Path


class FakeHadoopFileSystem:
    def __init__(self, *args, **kwargs):
        from pyarrow.fs import LocalFileSystem

        # _hdfs_root is assumed to be a module-level temporary directory
        # (defined elsewhere; its .name attribute suggests a
        # tempfile.TemporaryDirectory) that backs the fake HDFS tree.
        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        from pyarrow.fs import FileSelector

        # Map an HDFS-style absolute path onto the local root, preserving
        # FileSelector attributes when a selector is passed in.
        if isinstance(path, FileSelector):
            return FileSelector(
                os.fspath(self._root / path.base_dir.lstrip("/")),
                path.allow_not_found,
                path.recursive,
            )
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path):
        return self._fs.create_dir(self._path(path))

    def open_input_stream(self, path):
        return self._fs.open_input_stream(self._path(path))

    def open_output_stream(self, path):
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        self.create_dir(posixpath.dirname(path))
        return self._fs.open_output_stream(self._path(path))

    def get_file_info(self, path):
        return self._fs.get_file_info(self._path(path))

    def move(self, from_path, to_path):
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        self._fs.delete_file(self._path(path))
class FakeHadoopFileSystem:
    def __init__(self, *args, **kwargs):
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        from pyarrow.fs import FileSelector

        if isinstance(path, FileSelector):
            return FileSelector(
                os.fspath(self._root / path.base_dir.lstrip("/")),
                path.allow_not_found,
                path.recursive,
            )
        if isinstance(path, list):
            return [self._path(sub_path) for sub_path in path]
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path, **kwargs):
        return self._fs.create_dir(self._path(path), **kwargs)

    def open_input_stream(self, path, **kwargs):
        return self._fs.open_input_stream(self._path(path), **kwargs)

    def open_output_stream(self, path, **kwargs):
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        self.create_dir(posixpath.dirname(path))
        return self._fs.open_output_stream(self._path(path), **kwargs)

    def get_file_info(self, path, **kwargs):
        from pyarrow.fs import FileInfo

        entries = self._fs.get_file_info(self._path(path), **kwargs)
        if isinstance(entries, FileInfo):
            ret = self._adjust_entry(entries)
        else:
            assert isinstance(entries, list)
            ret = list(map(self._adjust_entry, entries))
        return ret

    def _adjust_entry(self, entry):
        import posixpath

        from pyarrow.fs import FileInfo

        # Rewrite the real local path back into the HDFS-style path the
        # caller asked about, so the fake stays transparent.
        mocked_path = os.path.relpath(entry.path, self._root)
        mocked_parts = mocked_path.split(os.path.sep)
        return FileInfo(
            path=posixpath.join(*mocked_parts),
            type=entry.type,
            mtime=entry.mtime,
            size=entry.size,
        )

    def move(self, from_path, to_path):
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        self._fs.delete_file(self._path(path))
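# A minimal sketch of how this fake might be wired into tests. The
# tempfile-backed _hdfs_root and the pytest fixture below are assumptions
# for illustration, not part of the excerpt above:
import tempfile

import pytest

# Module-level temporary directory the fake treats as the HDFS root
# (assumed; the class above only requires an object with a .name path).
_hdfs_root = tempfile.TemporaryDirectory()


@pytest.fixture
def hdfs(monkeypatch):
    import pyarrow.fs

    # Route any code that constructs pyarrow.fs.HadoopFileSystem to the
    # local fake instead of a real Hadoop cluster.
    monkeypatch.setattr(pyarrow.fs, "HadoopFileSystem", FakeHadoopFileSystem)
    yield FakeHadoopFileSystem()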