import contextlib
import json
import os

import fastavro
from gcsfs import GCSFileSystem
from google.cloud import storage
from google.oauth2 import service_account

# `BlobStorage` and `Blob` are assumed to be defined elsewhere in this module.


class GoogleCloudStorage(BlobStorage):
    """Google Cloud Storage storage provider.

    Args:
        project_id: the ID of the Google Cloud project.
        bucket_name: the name of the Cloud Storage bucket to use for all blobs.
        service_account_file: the filename of the GCP service account JSON key file.
    """

    def __init__(self, project_id: str, bucket_name: str, service_account_file: str):
        self._bucket_name = bucket_name
        credentials = service_account.Credentials.from_service_account_file(
            service_account_file)
        self._client = storage.Client(project=project_id, credentials=credentials)
        self._fs = GCSFileSystem(token=service_account_file, check_connection=True)

    def get_object(self, blob_name):
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise ValueError("Cannot find blob: " + blob_name)
        return json.loads(blob.download_as_string().decode("utf-8"))

    @contextlib.contextmanager
    def get_file(self, blob_name):
        path = os.path.join(self._bucket_name, blob_name)
        # Opening inside `with` guarantees the handle is closed even if the
        # open itself fails or the caller raises while using the file.
        with self._fs.open(path, "rb") as fileobj:
            yield fileobj

    def put_object(self, obj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        data = json.dumps(obj).encode("utf-8")
        blob.upload_from_string(data, content_type="application/json")
        blob.reload()  # refresh metadata so `size` is populated
        return Blob(blob_name, blob.size)

    def put_file(self, fileobj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        blob.upload_from_file(fileobj)
        blob.reload()
        return Blob(blob_name, blob.size)

    def put_avro(self, schema, records, blob_name, codec="snappy"):
        path = os.path.join(self._bucket_name, blob_name)
        # Write to a temporary sibling first, then move it into place, so
        # readers never observe a partially written Avro file.
        tmp_path = os.path.join(
            os.path.dirname(path), "~{}".format(os.path.basename(path)))
        with self._fs.open(tmp_path, "wb") as of:
            fastavro.writer(of, schema, records, codec)
        self._fs.mv(tmp_path, path)
        self._fs.setxattrs(path, content_type="avro/binary")
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise RuntimeError("Cannot find new avro blob: " + blob_name)
        return Blob(blob_name, blob.size)
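# A minimal usage sketch of the provider above. The project, bucket, key
# file, and blob names are hypothetical; the bucket is assumed to exist and
# the key file to grant it read/write access.
store = GoogleCloudStorage(
    project_id="my-project",
    bucket_name="my-blobs",
    service_account_file="key.json",
)

store.put_object({"answer": 42}, "answers/latest.json")
assert store.get_object("answers/latest.json") == {"answer": 42}

# Stream the raw bytes back out through gcsfs.
with store.get_file("answers/latest.json") as f:
    raw = f.read()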
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread",
                      acl="publicReadWrite")
        except gcsfs.utils.HttpError:
            pass

        # ensure we're empty.
        gcs.rm(TEST_BUCKET, recursive=True)
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except FileNotFoundError:
                pass

        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + "/" + fname, "wb") as f:
                        f.write(data)
        gcs.invalidate_cache()
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except:  # noqa: E722
                pass
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread",
                      acl="publicReadWrite")
        except Exception:  # the bucket may already exist
            pass
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except Exception:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except Exception:
                pass
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    try:
        if not gcs.exists(TEST_BUCKET):
            gcs.mkdir(TEST_BUCKET)
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except Exception:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        # A plain loop is clearer than a list comprehension run for side effects.
        for f in gcs.walk(TEST_BUCKET):
            gcs.rm(f)
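# All three gcs_maker variants above are generator functions, so a test can
# consume them either through contextlib or as a pytest fixture. A sketch,
# assuming TEST_PROJECT, GOOGLE_TOKEN, and TEST_BUCKET are defined in the
# test module:
import contextlib

import pytest

gcs_context = contextlib.contextmanager(gcs_maker)

def test_populated_bucket():
    with gcs_context(populate=True) as gcs:
        assert gcs.exists(TEST_BUCKET)

@pytest.fixture
def gcs():
    # `yield from` forwards both the setup (yield) and the teardown (finally).
    yield from gcs_maker(populate=True)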
# Process the known (hardcoded) tables
for table_name in table_names:
    log.info(f"PROCESSING {table_name}")
    primary_key_gen = PrimaryKeyGenerator(table_name, args.firecloud)

    # set to hold all columns for this table, list to hold all the rows
    column_set = set()
    row_list = []

    # generated PK column headers for Firecloud compatibility
    entity_name = primary_key_gen.generate_entity_name()

    # read json data
    for path in gcs.ls(os.path.join(args.input_dir, table_name)):
        log.info(f"...Opening {path}")
        with gcs.open(path, 'r') as json_file:
            for line in json_file:
                row = json.loads(line)
                if not row:
                    raise RuntimeError(
                        f'Encountered invalid JSON "{line}", aborting.')
                row[entity_name] = primary_key_gen.generate_primary_key(row)
                column_set.update(row.keys())
                row_list.append(row)

    # make sure pk is the first column (Firecloud req.)
    # pop out the PK, will be splitting this set out later
    column_set.discard(entity_name)
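# The loop above depends on a PrimaryKeyGenerator whose implementation is
# not shown. A hypothetical sketch of the implied interface -- the naming
# scheme and the hashing are assumptions, not the original implementation:
import hashlib
import json

class PrimaryKeyGenerator:
    def __init__(self, table_name, firecloud=False):
        self.table_name = table_name
        self.firecloud = firecloud

    def generate_entity_name(self):
        # Firecloud/Terra load files name their first column "entity:<type>_id";
        # a plain run could fall back to "<table>_id".
        if self.firecloud:
            return f"entity:{self.table_name}_id"
        return f"{self.table_name}_id"

    def generate_primary_key(self, row):
        # One plausible scheme: a deterministic hash of the row contents, so
        # re-running the pipeline yields stable keys.
        digest = hashlib.sha1(
            json.dumps(row, sort_keys=True, default=str).encode("utf-8")
        ).hexdigest()
        return f"{self.table_name}.{digest[:12]}"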