Example #1
class GoogleCloudStorage(BlobStorage):
    """Google Cloud Storage storage provider.

    Args:
        project_id: the ID of the Google Cloud project.
        bucket_name: the name of the Cloud Storage bucket to use for all blobs.
        service_account_file: the filename of the GCP service account JSON key 
            file.
    """
    def __init__(self, project_id: str, bucket_name: str,
                 service_account_file: str):
        self._bucket_name = bucket_name
        credentials = service_account.Credentials.from_service_account_file(
                service_account_file)
        self._client = storage.Client(project=project_id, credentials=credentials)
        self._fs = GCSFileSystem(token=service_account_file,
                                 check_connection=True)
    
    def get_object(self, blob_name):
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise ValueError("Cannot find blob: "+blob_name)
        return json.loads(blob.download_as_string().decode("utf-8"))

    @contextlib.contextmanager
    def get_file(self, blob_name):
        path = os.path.join(self._bucket_name, blob_name)
        # Open before the try block so a failed open does not reach the
        # finally clause with `fileobj` unbound.
        fileobj = self._fs.open(path, 'rb')
        try:
            yield fileobj
        finally:
            fileobj.close()

    def put_object(self, obj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        data = json.dumps(obj).encode("utf-8")
        blob.upload_from_string(data, content_type="application/json")
        blob.reload()
        return Blob(blob_name, blob.size)

    def put_file(self, fileobj, blob_name):
        blob = self._client.bucket(self._bucket_name).blob(blob_name)
        blob.upload_from_file(fileobj)
        blob.reload()
        return Blob(blob_name, blob.size)
    
    def put_avro(self, schema, records, blob_name, codec='snappy'):
        path = os.path.join(self._bucket_name, blob_name)
        tmp_path = os.path.join(os.path.dirname(path),
                "~{}".format(os.path.basename(path)))
        with self._fs.open(tmp_path, "wb") as of:
            fastavro.writer(of, schema, records, codec)
        self._fs.mv(tmp_path, path)
        self._fs.setxattrs(path, content_type="avro/binary")
        blob = self._client.bucket(self._bucket_name).get_blob(blob_name)
        if blob is None:
            raise RuntimeError("Cannot find new avro blob: "+blob_name)
        return Blob(blob_name, blob.size)
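
The class above exposes a small blob-store API (get_object/put_object for JSON, get_file/put_file for streams, put_avro for Avro files) on top of both the google-cloud-storage client and gcsfs. The snippet relies on the usual imports (contextlib, json, os, fastavro, gcsfs.GCSFileSystem, google.cloud.storage, google.oauth2.service_account) plus the module's own BlobStorage and Blob types. A minimal usage sketch follows; the project, bucket, and key-file names are placeholders, not part of the original code.

# Minimal usage sketch for the provider above; "my-project", "my-bucket", and
# "service-account.json" are placeholder names, and GoogleCloudStorage comes
# from the module shown in this example.
provider = GoogleCloudStorage(
        project_id="my-project",
        bucket_name="my-bucket",
        service_account_file="service-account.json")

# Round-trip a small JSON document as a blob.
provider.put_object({"status": "ok"}, "health/check.json")
assert provider.get_object("health/check.json") == {"status": "ok"}

# Stream an existing blob without loading it fully into memory.
with provider.get_file("exports/data.avro") as fileobj:
    header = fileobj.read(4)
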
Example #2
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET,
                      default_acl="authenticatedread",
                      acl="publicReadWrite")
        except gcsfs.utils.HttpError:
            pass

        # ensure we're empty.
        gcs.rm(TEST_BUCKET, recursive=True)

        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except FileNotFoundError:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + "/" + fname, "wb") as f:
                        f.write(data)
        gcs.invalidate_cache()
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except:  # noqa: E722
                pass
Example #3
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread",
                      acl="publicReadWrite")
        except:
            pass
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET+'/'+fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except:
                pass
Example #4
def gcs_maker(populate=False):
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    try:
        if not gcs.exists(TEST_BUCKET):
            gcs.mkdir(TEST_BUCKET)
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        for f in gcs.walk(TEST_BUCKET):
            gcs.rm(f)
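
Examples #2 through #4 are variants of the same test fixture: gcs_maker creates (or reuses) a test bucket, optionally populates it with sample files, yields the filesystem to the test body, and removes everything again in the finally block. The sketch below shows one way such a generator fixture is typically consumed; it assumes the gcs_maker and TEST_BUCKET defined above and is not part of the original test suite.

import contextlib

# Hypothetical consumption of the generator fixture above: wrapping it with
# contextlib.contextmanager guarantees the cleanup in its finally block runs
# even if the test body raises.
@contextlib.contextmanager
def gcs_context(populate=False):
    yield from gcs_maker(populate=populate)

def test_populated_bucket_has_files():
    with gcs_context(populate=True) as gcs:
        assert gcs.find(TEST_BUCKET)  # sample files were written during setup
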
Example #5
# Process the known (hardcoded) tables
for table_name in table_names:
    log.info(f"PROCESSING {table_name}")
    primary_key_gen = PrimaryKeyGenerator(table_name, args.firecloud)
    # set to hold all columns for this table, list to hold all the rows
    column_set = set()
    row_list = []
    # generated PK column headers for Firecloud compatibility
    entity_name = primary_key_gen.generate_entity_name()

    # read json data
    for path in gcs.ls(os.path.join(args.input_dir, table_name)):
        log.info(f"...Opening {path}")

        with gcs.open(path, 'r') as json_file:
            print(path, json_file)
            for line in json_file:
                row = json.loads(line)

                if not row:
                    raise RuntimeError(f'Encountered invalid JSON "{line}", aborting.')

                row[entity_name] = primary_key_gen.generate_primary_key(row)

                column_set.update(row.keys())
                row_list.append(row)

    # make sure pk is the first column (Firecloud req.)
    # pop out the PK, will be splitting this set out later
    column_set.discard(entity_name)
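
The listing stops here, but the comments above imply the next step: after the entity column is discarded from column_set, the header is presumably rebuilt with the generated primary-key column first, roughly as follows (a sketch, not part of the original script).

# Hypothetical continuation: Firecloud expects the entity (primary-key) column
# to lead the header row, so put it first and keep the rest in a stable order.
columns = [entity_name] + sorted(column_set)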