def backup_table(self, table_name):
    # Find the month of the most recent data in the table.
    client = Client("hscic")
    sql = "SELECT max(month) FROM {hscic}.%s" % table_name
    latest_date = client.query(sql).rows[0][0]
    latest_year_and_month = latest_date.strftime("%Y_%m")
    table = client.get_table(table_name)

    # Collect the YYYY_MM stamps of backups already in the bucket.
    storage_client = StorageClient()
    bucket = storage_client.bucket()
    year_and_months = set()
    prefix_base = "backups/{}/".format(table_name)
    for blob in bucket.list_blobs(prefix=prefix_base):
        match = re.search(r"/(\d{4}_\d{2})/", blob.name)
        if match:  # ignore blobs that don't follow the naming scheme
            year_and_months.add(match.groups()[0])

    # Nothing to do if the latest month is already backed up.
    if latest_year_and_month in year_and_months:
        print("{} table already backed up for {}".format(
            table_name, latest_year_and_month))
        return

    storage_prefix = "{}/{}/{}-".format(
        prefix_base, latest_year_and_month, table_name)
    exporter = TableExporter(table, storage_prefix)
    exporter.export_to_storage()
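# Usage sketch (not in the original module; the table names below are
# illustrative). Assuming backup_table is defined on a command object as
# above, a monthly backup run might look like this:

def backup_all_tables(command):
    # Safe to re-run: months that already have blobs under
    # backups/<table>/<YYYY_MM>/ are detected and skipped.
    for table_name in ("prescribing", "practice_statistics"):
        command.backup_table(table_name)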
def create_storage_backed_table(self, table_id, schema, gcs_path):
    gcs_client = StorageClient()
    bucket = gcs_client.bucket()
    if bucket.get_blob(gcs_path) is None:
        raise RuntimeError("Could not find blob at {}".format(gcs_path))

    gcs_uri = "gs://{}/{}".format(self.project, gcs_path)
    schema_as_dict = [
        {"name": s.name, "type": s.field_type.lower()} for s in schema
    ]
    # Describe an external table backed by the CSV in Cloud Storage.
    resource = {
        "tableReference": {"tableId": table_id},
        "externalDataConfiguration": {
            "csvOptions": {"skipLeadingRows": "1"},
            "sourceFormat": "CSV",
            "sourceUris": [gcs_uri],
            "schema": {"fields": schema_as_dict},
        },
    }

    path = "/projects/{}/datasets/{}/tables".format(
        self.project, self.dataset_id)

    try:
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource)
    except NotFound as e:
        if not dataset_is_missing(e):
            raise
        # The dataset didn't exist yet: create it and retry once.
        self.create_dataset()
        self.gcbq_client._connection.api_request(
            method="POST", path=path, data=resource)

    return self.get_table(table_id)
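# Usage sketch (not in the original module; the table name, schema, and
# path are illustrative). SchemaField is the standard google.cloud.bigquery
# schema type whose .name and .field_type attributes the method reads:

def example_create_external_table(client):
    from google.cloud.bigquery import SchemaField

    schema = [
        SchemaField("code", "STRING"),
        SchemaField("name", "STRING"),
    ]
    # The blob must already exist in the project bucket, or a RuntimeError
    # is raised before any BigQuery request is made.
    return client.create_storage_backed_table(
        "practice_codes", schema, "data/practice_codes.csv")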
def __init__(self, table, storage_prefix):
    self.table = table
    self.storage_prefix = storage_prefix
    storage_client = StorageClient()
    self.bucket = storage_client.bucket()
def upload_to_storage(self, local_path, storage_path):
    client = StorageClient()
    bucket = client.bucket()
    blob = bucket.blob(storage_path)
    # Open in binary mode so non-text files upload intact.
    with open(local_path, "rb") as f:
        blob.upload_from_file(f)
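# Usage sketch (not in the original module; paths are illustrative).
# Uploading a CSV to the path a storage-backed table reads from means the
# table reflects the new contents on the next query, since external tables
# read from GCS directly:

def example_upload(client):
    client.upload_to_storage(
        "/tmp/practice_codes.csv",   # local file
        "data/practice_codes.csv")   # destination path within the bucket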