class GcsUnstructuredProvider(UnstructuredStorageProvider): """This class allows you to upload arbitrary bytes to GCS. They will be stored under bucket_name/base_path/filename """ file_system: GCSFileSystem def __init__( self, project: str, bucket_name: str, base_path: str, token: str = None, ) -> None: super().__init__() self.project = project self.bucket_name = bucket_name self.base_path = base_path self.token = token self.base_path = f"{bucket_name}/{base_path}/{{filename}}" self.file_name_cache: Set[str] = set() """The set of all filenames ever uploaded, checked before uploading""" self.logger = logging.getLogger("openwpm") async def init(self) -> None: await super(GcsUnstructuredProvider, self).init() self.file_system = GCSFileSystem(project=self.project, token=self.token, access="read_write") async def store_blob(self, filename: str, blob: bytes, overwrite: bool = False) -> None: target_path = self.base_path.format(filename=filename) if not overwrite and (filename in self.file_name_cache or self.file_system.exists(target_path)): self.logger.info("Not saving out file %s as it already exists", filename) return self.file_system.start_transaction() with self.file_system.open(target_path, mode="wb") as f: f.write(blob) self.file_system.end_transaction() self.file_name_cache.add(filename) async def flush_cache(self) -> None: pass async def shutdown(self) -> None: pass
class GcsStructuredProvider(ArrowProvider): """This class allows you to upload Parquet files to GCS. This might not actually be the thing that we want to do long term but seeing as GCS is the S3 equivalent of GCP it is the easiest way forward. Inspired by the old S3Aggregator structure the GcsStructuredProvider will by default store into base_path/visits/table_name in the given bucket. Pass a different sub_dir to change this. """ file_system: GCSFileSystem def __init__( self, project: str, bucket_name: str, base_path: str, token: str = None, sub_dir: str = "visits", ) -> None: super().__init__() self.project = project self.token = token self.base_path = f"{bucket_name}/{base_path}/{sub_dir}/{{table_name}}" def __str__(self) -> str: return f"GCS:{self.base_path.removesuffix('/{table_name}')}" async def init(self) -> None: await super(GcsStructuredProvider, self).init() self.file_system = GCSFileSystem(project=self.project, token=self.token, access="read_write") async def write_table(self, table_name: TableName, table: Table) -> None: self.file_system.start_transaction() pq.write_to_dataset( table, self.base_path.format(table_name=table_name), filesystem=self.file_system, ) self.file_system.end_transaction() async def shutdown(self) -> None: pass