def test_find_bucket_found_with_prefix(self):
    """Tests that bucket is found when given prefix."""
    bucket_foo = storage.Bucket(self.gcs_client, self.foo_name)
    bucket_bar = storage.Bucket(self.gcs_client, self.bar_name)
    bucket_iterator = iter([bucket_foo, bucket_bar])
    actual_output = gcs_transcript_utils.find_bucket_with_prefix(
        bucket_iterator, self.foo_name)
    expected_output = self.foo_name
    self.assertEqual(actual_output, expected_output)
def publish_daily_cost(
    billing_bucket_name,
    target_bucket_name,
    target_object_name,
    kind="json",
    debug=False,
    dry_run=False,
):
    totals = {}
    client = storage.Client()
    bucket = storage.Bucket(client, billing_bucket_name)

    if kind == "csv":
        prefix = "report-"
    else:
        prefix = "billing-"

    blobs = bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        buffer = io.StringIO(blob.download_as_string().decode())
        if kind == "csv":
            current_totals = totals_from_csv(buffer)
        else:
            current_totals = totals_from_json(buffer)
        for time_range, cost in current_totals.items():
            totals[time_range] = totals.get(time_range, 0) + cost

    # We want to push out sorted jsonl
    sorted_items = [{
        "version": 1,
        "start_time": start_time,
        "end_time": end_time,
        "cost": cost
    } for (start_time, end_time), cost in totals.items()]
    sorted_items.sort(key=lambda d: d["start_time"])

    if debug:
        for item in sorted_items:
            print(json.dumps(item))

    if not dry_run:
        target_bucket = storage.Bucket(client, target_bucket_name)
        blob = target_bucket.blob(target_object_name)
        target_buffer = io.StringIO()
        for item in sorted_items:
            target_buffer.write(json.dumps(item) + "\n")
        target_buffer.seek(0)
        blob.upload_from_file(target_buffer)

    return sorted_items
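# A minimal usage sketch for publish_daily_cost above; the bucket and object
# names are placeholders. With dry_run=True nothing is uploaded, but the
# aggregated, sorted items are still returned.
items = publish_daily_cost(
    billing_bucket_name='example-billing-export',
    target_bucket_name='example-public-bucket',
    target_object_name='daily-cost.jsonl',
    kind='json',
    dry_run=True,
)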
def archive_events(project, log_name, source_bucket, destination_bucket, date,
                   object_name_template='events-{date}.jsonl',
                   debug=False, dry_run=False):
    storage_client = storage.Client()
    src_bucket = storage.Bucket(storage_client, source_bucket)
    dest_bucket = storage.Bucket(storage_client, destination_bucket)

    prefix = log_name + '/' + date.strftime('%Y/%m/%d')
    print(f'Finding blobs with prefix {prefix}')
    src_blobs = src_bucket.list_blobs(prefix=prefix)

    count = 0
    all_events = []
    for src_blob in src_blobs:
        with tempfile.TemporaryFile(mode='wb+') as temp:
            src_blob.download_to_file(temp)
            temp.seek(0)
            for line in temp:
                event = json.loads(json.loads(line)['jsonPayload']['message'])
                # Account for time when 'event' was nested
                if 'event' in event:
                    event.update(event['event'])
                    del event['event']
                event = process_event(event)
                if debug:
                    print(event)
                if not dry_run:
                    all_events.append(event)
                count += 1

    if not dry_run:
        # Timestamp is ISO8601 in UTC, so can be sorted lexicographically
        all_events.sort(key=lambda event: event['timestamp'])
        with tempfile.TemporaryFile(mode='w+') as out:
            for event in all_events:
                out.write(json.dumps(event) + '\n')
            out.seek(0)
            blob_name = object_name_template.format(
                date=date.strftime('%Y-%m-%d'))
            blob = dest_bucket.blob(blob_name)
            # Set metadata on the object so we know when this archive is for
            # & how many events there are
            blob.metadata = {
                'Events-Date': date.strftime('%Y-%m-%d'),
                'Events-Count': len(all_events)
            }
            blob.upload_from_file(out)
            print(
                f'Uploaded {destination_bucket}/{blob_name} with {count} events'
            )
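# A hedged usage sketch for archive_events above; the project, log name, and
# bucket names are placeholders. dry_run=True walks the source blobs without
# writing an archive.
import datetime

archive_events(
    project='example-project',
    log_name='example-events-log',
    source_bucket='example-events-raw',
    destination_bucket='example-events-archive',
    date=datetime.date(2020, 1, 1),
    dry_run=True,
)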
def __init__(self, config):
    self.config = config
    self.client = storage.Client()
    self.input_bucket = storage.Bucket(self.client,
                                       self.config['input_bucket'])
    self.output_bucket = storage.Bucket(self.client,
                                        self.config['output_bucket'])
    self.reg = ProcessedRegister(self.output_bucket,
                                 self.config['symbol'],
                                 self.config['name'])
    self.old_processed_bnames = [p[0] for p in self.reg.processed]
    self.new_processed_bnames = []
    self.input_blobs = []
def download_blob(self, bucket_name, blob_name, local_path=None):
    """
    Gets a blob from a bucket

    `Args:`
        bucket_name: str
            The name of the bucket
        blob_name: str
            The name of the blob
        local_path: str
            The local path where the file will be downloaded. If not
            specified, a temporary file will be created and returned, and
            that file will be removed automatically when the script is
            done running.

    `Returns:`
        str
            The path of the downloaded file
    """
    if not local_path:
        local_path = files.create_temp_file_for_path('TEMPTHING')

    bucket = storage.Bucket(self.client, name=bucket_name)
    blob = storage.Blob(blob_name, bucket)

    logger.info(f'Downloading {blob_name} from {bucket_name} bucket.')
    with open(local_path, 'wb') as f:
        blob.download_to_file(f, client=self.client)
    logger.info(f'{blob_name} saved to {local_path}.')

    return local_path
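# A hedged usage sketch: `connector` is assumed to be an instance of the class
# defining download_blob above; the bucket and blob names are placeholders.
path = connector.download_blob('example-bucket', 'exports/contacts.csv')
print(f'Downloaded to {path}')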
def update(self):
    source_bucket = storage.Bucket(self.staging.client,
                                   self.config['source_bucket'])
    blobs = source_bucket.list_blobs(prefix=self.config['source_prefix'])
    blobs = {b.name: b for b in blobs}

    # check against blob names
    input_bnames = set(blobs)
    logging.debug('input blobs = %s', len(input_bnames))
    old_processed = set(p[0] for p in self.processed)
    logging.debug('blobs already processed = %s', len(old_processed))
    bnames_to_process = list(input_bnames - old_processed)
    logging.debug('non processed blobs = %s', len(bnames_to_process))

    # check against timestamps
    bnames_to_process2 = [b for b, t in self.processed
                          if blobs[b].time_created > t]
    logging.debug('outdated blobs to process = %s', len(bnames_to_process2))

    # select blobs to process
    bnames_to_process = set(bnames_to_process + bnames_to_process2)
    logging.debug('blobs to process = %s', len(bnames_to_process))

    procs = []
    for bname in bnames_to_process:
        blob = blobs[bname]
        fname = os.path.split(bname)[1]
        fname = os.path.join(self.data_dir, fname)
        with open(fname, 'wb+') as fp:
            blob.download_to_file(fp)
        procs.append((blob.name, dtfmt(blob.time_created)))

    if procs:
        logging.debug('updating processed %s', self.processed_fname)
        self.processed = self.processed + procs
        with open(self.processed_fname, 'w+') as fp:
            fp.write(json.dumps(self.processed))
def maybe_upload_file(local_path):
    '''Upload a file to remote cloud storage
    if the path starts with gs:// or s3://
    '''
    if local_path.startswith(('s3://', 'gs://')):
        prefix = local_path.split(':')[0]
        remote_bucket_path = local_path[len("s3://"):]  # same length
        bp = remote_bucket_path.split("/")
        bucket = bp[0]
        path = remote_bucket_path[1 + len(bucket):]
        # s3://example/file becomes s3:/example/file in Linux
        local_path = prefix + ':/' + remote_bucket_path

        if prefix == 's3':
            import boto3
            s3 = boto3.client('s3',
                              endpoint_url=os.environ.get('S3_ENDPOINT'))
            s3.upload_file(local_path, bucket, path)
        elif prefix == 'gs':
            from google.cloud import storage
            client = storage.Client()
            Hbucket = storage.Bucket(client, bucket)
            blob = storage.Blob(path, Hbucket)
            blob.upload_from_filename(local_path)
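# A hedged usage sketch for maybe_upload_file above; the gs:// path is a
# placeholder. Paths without an s3:// or gs:// scheme are left untouched.
maybe_upload_file('gs://example-bucket/checkpoints/model.ckpt')
maybe_upload_file('/tmp/model.ckpt')  # no scheme prefix, so this is a no-op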
def get_configs(service_account_json_file,
                zip_filename='board_and_stream_cfg.zip',
                extract_dir='cfg'):
    '''Downloads and extracts board configurations from GCP'''
    with open(service_account_json_file, 'r') as sa_file:
        bucket_name = json.load(sa_file)['bucket']
    # Use the function's own parameter rather than the undefined `args`
    # referenced in the original.
    credentials = service_account.Credentials.from_service_account_file(
        service_account_json_file)
    client = storage.Client(project=PROJECT_ID, credentials=credentials)
    bucket = storage.Bucket(client, bucket_name)
    configs = bucket.get_blob(zip_filename)
    configs.download_to_filename(zip_filename)
    with ZipFile(zip_filename) as zip_file:
        try:
            for member in zip_file.namelist():
                path, file_name = os.path.split(member)
                if os.path.splitext(
                        file_name)[1] in EXTENSIONS and path == 'board_config':
                    file_loc = zip_file.extract(member, extract_dir)
                    shutil.move(
                        file_loc,
                        os.path.join(extract_dir,
                                     os.path.basename(file_loc)))
                else:
                    zip_file.extract(member, extract_dir)
        finally:
            shutil.rmtree(os.path.join(extract_dir, 'board_config'))
def handler(event, context):
    # Get secret key
    secret_id = os.environ['secret_id']
    print("Access AWS Secrets Manager")
    secretsmanager = boto3.client('secretsmanager')
    response = secretsmanager.get_secret_value(SecretId=secret_id)

    key_file_name = "/tmp/api.json"
    print("Write API key to temporary file")
    with open(key_file_name, "w") as key_file:
        key_file.write(response['SecretString'])

    print("Connect to Google Play GCP Storage service via API key")
    storage_client = storage.Client.from_service_account_json(key_file_name)
    cloud_storage_bucket = 'some-bucket'
    print("Connect to GCP bucket")
    source_bucket = storage.Bucket(storage_client, cloud_storage_bucket)

    print("Connect to AWS S3 service")
    s3 = boto3.resource('s3')
    target_bucket_name = os.environ['target_bucket_name']
    print("Connect to AWS bucket " + target_bucket_name)
    target_bucket = s3.Bucket(target_bucket_name)

    prefix = 'stats/installs/installs_uk.nhs.covid19.production_'
    suffix = '_overview.csv'

    # Determine dates to get Google reports
    months = get_months_covering_data_as_of_today()
    length = len(months)
    i = 0
    while i < length:
        object_name = prefix + months[i] + suffix
        print(str(i + 1) + "/" + str(length) + ": " + object_name)
        try:
            # Download Google reports
            print("Download from GCP bucket")
            blob = source_bucket.get_blob(object_name)
            blob_text = blob.download_as_text(encoding='utf16')
            blob_bytes_utf8 = blob_text.encode('utf-8')
            # Upload reports to our AWS S3 bucket
            print("Upload to AWS bucket")
            target_bucket.put_object(Key=object_name, Body=blob_bytes_utf8)
        except Exception as e:
            print(e)
        i += 1
def create_bucket(bucket, bucket_name):
    """Create and return a new bucket in the object store called
    'bucket_name'. This will raise an ObjectStoreError if this bucket
    already exists
    """
    new_bucket = _copy.copy(bucket)

    try:
        from google.cloud import storage as _storage
        client = new_bucket["client"]
        bucket_name = _sanitise_bucket_name(bucket_name,
                                            bucket["unique_suffix"])
        bucket_obj = _storage.Bucket(client, name=bucket_name)
        bucket_obj.location = bucket["bucket"].location
        bucket_obj.storage_class = "REGIONAL"
        new_bucket["bucket"] = client.create_bucket(bucket_obj)
        new_bucket["bucket_name"] = str(bucket_name)
    except Exception as e:
        # couldn't create the bucket - likely because it already
        # exists - try to connect to the existing bucket
        from Acquire.ObjectStore import ObjectStoreError
        raise ObjectStoreError(
            "Unable to create the bucket '%s', likely because it "
            "already exists: %s" % (bucket_name, str(e)))

    return new_bucket
def dump_features_to_gcs(ft_tables, dest, project, client):
    """Dump generated tables as files on Google Cloud Storage"""
    logger.info('Dumping {} tables to {}...'.format(len(ft_tables), dest))
    gs_client = gcs.Client(project)
    split_uri = dest.split('/')
    filepath = '/'.join(split_uri[3:])
    bucket_name = split_uri[2]
    bucket = gcs.Bucket(gs_client, bucket_name)
    jobs = []
    for ft_table in ft_tables:
        filename_shard = ft_table.name + '{0:012d}'.format(0)
        blob = gcs.Blob(name=os.path.join(filepath, filename_shard),
                        bucket=bucket)
        if blob.exists():
            count = 0
            while blob.exists():
                logger.info(' -- Removing blob {}'.format(blob.path))
                blob.delete()
                count += 1
                filename_shard = ft_table.name + '{0:012d}'.format(count)
                blob = gcs.Blob(name=os.path.join(filepath, filename_shard),
                                bucket=bucket)
        path = dest + '/' + ft_table.name + '*'
        jobname = 'features_dump_job_' + str(uuid.uuid4())
        job = client.extract_table_to_storage(jobname, ft_table, path)
        job.destination_format = 'NEWLINE_DELIMITED_JSON'
        job.begin()
        jobs.append(job)
    return jobs
def create_regional_bucket(bucketname, region):
    '''
    Creates a storage bucket in the current region.
    '''
    client = storage.Client()
    # Bucket's first positional argument is the client; the original passed
    # the bucket name there.
    b = storage.Bucket(client, name=bucketname)
    b.location = region
    try:
        final_bucket = client.create_bucket(b)
        return final_bucket
    except google.api_core.exceptions.Conflict as ex:
        message = '''
            An attempt was made to create a bucket at %s. However, the
            storage API indicated that this was an existing bucket.
            Exception reported: %s
        ''' % (bucketname, ex)
    except google.api_core.exceptions.BadRequest as ex:
        message = '''
            An attempt was made to create a bucket at %s. However, the
            storage API indicated that there was an error during creation.
            Exception reported: %s
        ''' % (bucketname, ex)
    except Exception as ex:
        message = '''
            An attempt was made to create a bucket at %s. However, there
            was an unexpected exception raised.
            Exception reported: %s
        ''' % (bucketname, ex)
    subject = 'Error with bucket creation'
    notify_admins(message, subject)
def create_gcs_bucket(self):
    bucket = storage.Bucket(self.storage_client, name=self.bucket_name)
    if not bucket.exists():
        bucket.create(location=self.bucket_region)
        print('GCS bucket created.')
    else:
        print('GCS bucket found.')
def _check_bucket_exists_and_delete(spinner, storage_client, bucket_name,
                                    project_name):
    """Checks if the bucket exists and deletes it.

    If it already exists, prompt the user to make sure they want to delete
    it and everything in it.

    Args:
        spinner: The spinner displayed in the console
        storage_client: The storage client object used to access GCS
        bucket_name: The name of the bucket to check if it exists
        project_name: The name of the project to check the bucket exists in

    Raises:
        SystemExit: If the bucket already exists and the user does not
            choose to delete it
    """
    bucket = storage.Bucket(
        client=storage_client, name=bucket_name, user_project=project_name)
    if bucket.exists():
        spinner.hide()
        answer = raw_input(
            '\nWARNING!!! Bucket {} already exists in project {}\nType YES'
            ' to confirm you want to delete it: '.format(
                bucket_name, project_name))
        spinner.show()
        if answer != 'YES':
            spinner.fail('X')
            raise SystemExit()
        spinner.write('')
        bucket.delete(force=True)
        spinner.write('{} TESTING: Bucket {} deleted from project {}'.format(
            _CHECKMARK, bucket_name, project_name))
def test_find_bucket_not_found_with_prefix(self):
    """Tests that exception is raised if bucket is not found."""
    bucket_foo = storage.Bucket(self.gcs_client, self.foo_name)
    bucket_iterator = iter([bucket_foo])
    self.assertRaises(NotFound,
                      gcs_transcript_utils.find_bucket_with_prefix,
                      bucket_iterator, self.bar_name)
def listAllFilesInFolder(self, folder, useCacheBucket=False):
    if self['data_file_storage_method'] == 'local':
        dir = os.path.join(self.configurationDirectory, folder)
        if os.path.exists(dir):
            return os.listdir(dir)
        else:
            return []
    elif self['data_file_storage_method'] == 'gcs':
        if 'applicationId' not in self or self.applicationId is None:
            raise RuntimeError(
                "Can't load object from google cloud storage without an applicationId, which is used to indicate the bucket."
            )
        storageClient = getSharedGCSStorageClient()
        bucketId = "kwola-testing-run-data-" + self.applicationId
        if useCacheBucket:
            bucketId += "-cache"
        applicationStorageBucket = storage.Bucket(storageClient, bucketId)
        blobs = applicationStorageBucket.list_blobs(prefix=folder,
                                                    delimiter="")
        return [blob.name[len(folder) + 1:] for blob in blobs]
    else:
        raise RuntimeError(
            f"Unexpected value {self['data_file_storage_method']} for configuration data_file_storage_method"
        )
def deleteKwolaFileData(self, folder, fileName, useCacheBucket=False):
    filePath = os.path.join(folder, fileName)
    if self['data_serialization_encryption_key']:
        filePath += ".enc"
    try:
        if self['data_file_storage_method'] == 'local':
            os.unlink(os.path.join(self.configurationDirectory, filePath))
        elif self['data_file_storage_method'] == 'gcs':
            if 'applicationId' not in self or self.applicationId is None:
                raise RuntimeError(
                    "Can't load object from google cloud storage without an applicationId, which is used to indicate the bucket."
                )
            storageClient = getSharedGCSStorageClient()
            bucketId = "kwola-testing-run-data-" + self.applicationId
            if useCacheBucket:
                bucketId += "-cache"
            applicationStorageBucket = storage.Bucket(
                storageClient, bucketId)
            objectBlob = storage.Blob(filePath, applicationStorageBucket)
            objectBlob.delete()
            return
        else:
            raise RuntimeError(
                f"Unexpected value {self['data_file_storage_method']} for configuration data_file_storage_method"
            )
    except FileNotFoundError:
        return
    except google.cloud.exceptions.NotFound:
        return
def is_updated_after(self, bucket, object, ts):
    """
    Checks if an object is updated in Google Cloud Storage.

    :param bucket: The Google cloud storage bucket where the object is.
    :type bucket: str
    :param object: The name of the object to check in the Google cloud
        storage bucket.
    :type object: str
    :param ts: The timestamp to check against.
    :type ts: datetime.datetime
    """
    client = self.get_conn()
    bucket = storage.Bucket(client=client, name=bucket)
    blob = bucket.get_blob(blob_name=object)
    blob.reload()

    blob_update_time = blob.updated

    if blob_update_time is not None:
        import dateutil.tz

        if not ts.tzinfo:
            ts = ts.replace(tzinfo=dateutil.tz.tzutc())

        self.log.info("Verify object date: %s > %s", blob_update_time, ts)

        if blob_update_time > ts:
            return True

    return False
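# A hedged usage sketch: `hook` is assumed to be an instance of the hook class
# defining is_updated_after above; the bucket and object names are
# placeholders.
import datetime

cutoff = datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)
if hook.is_updated_after('example-bucket', 'data/report.csv', cutoff):
    print('Object was modified after the cutoff.')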
def get_bucket(self):
    """Get the bucket defined by 'bucket_name' from the storage_client.

    Throws a ValueError when bucket_name is not set. If the bucket does
    not exist in GCS, a new bucket will be created.
    """
    if self._bucket:
        return self._bucket

    if not self.bucket_name:
        raise ValueError("The 'bucket_name' needs to be set.")

    try:
        self._bucket = self.storage_client.get_bucket(self.bucket_name)
    except (exceptions.NotFound, exceptions.Forbidden):
        bucket = storage.Bucket(self.storage_client, name=self.bucket_name)
        bucket.versioning_enabled = True
        bucket.lifecycle_rules = [{
            'action': {
                'type': 'SetStorageClass',
                'storageClass': 'NEARLINE'
            },
            'condition': {
                'numNewerVersions': 1,
                'matchesStorageClass': ['REGIONAL', 'STANDARD'],
                'age': 30
            }
        }]
        try:
            bucket.create(location='europe-west4')
        except exceptions.Conflict:
            raise
        self._bucket = self.storage_client.get_bucket(self.bucket_name)

    return self._bucket
def delete_blob(self, audioName: str):
    """Deletes a blob from the bucket."""
    # Assumes the target bucket name is stored on the instance; the
    # original referenced an undefined `bucket_name`.
    bucket = storage.Bucket(self.storage_client, name=self.bucket_name)
    blob = bucket.blob(audioName)
    blob.delete()
    print("Blob {} deleted.".format(audioName))
def _get_blob(self, bucket_name: str, blob_name: str) -> gcs.Blob:
    """Get a Blob object by name."""
    with aws.service_account_credentials():
        client = gcs.Client()
        bucket = gcs.Bucket(client, bucket_name)
        return bucket.get_blob(blob_name)
def create_bucket(client, bucket_name, location, storage_class):
    # Set properties on a plain resource object.
    bucket = storage.Bucket(client, name=bucket_name)
    bucket.location = location
    bucket.storage_class = storage_class
    bucket = client.create_bucket(bucket)
    return bucket
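# A minimal usage sketch for create_bucket above, assuming application
# default credentials are available; the bucket name, location, and storage
# class are placeholders.
from google.cloud import storage

client = storage.Client()
bucket = create_bucket(client, 'example-bucket-name', 'us-central1',
                       'STANDARD')
print(f'Created bucket {bucket.name} in {bucket.location}')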
def get_gcs_bucket(d_pl_options):
    path_segments = d_pl_options[fidscs_globals.OPT_NAME_WORK_DIR][5:].split('/')
    gcs_bucket = path_segments[0]
    beam_gcp_project = d_pl_options[fidscs_globals.OPT_NAME_PROJECT]
    return gcs.Bucket(get_gcs_client(),
                      name=gcs_bucket,
                      user_project=beam_gcp_project)
def create_bucket(self):
    """Creates a new bucket."""
    # Assumes the bucket name is stored on the instance; the original
    # referenced an undefined `bucket_name`.
    bucket = storage.Bucket(self.storage_client, name=self.bucket_name)
    bucket.location = "europe-west1"
    self.storage_client.create_bucket(bucket)
def list_bucket_contents():
    files = []
    client = storage.Client()
    bucket = storage.Bucket(client, 'broad-dsp-monster-hca-prod-lattice')
    all_blobs = list(client.list_blobs(bucket))
    for blob in all_blobs:
        files.append(blob.name)
    return files
def _uri_to_blob(creds, uri, conn=None):
    assert uri.startswith('gs://')
    url_tup = urlparse(uri)
    bucket_name = url_tup.netloc
    if conn is None:
        conn = calling_format.connect(creds)
    b = storage.Bucket(conn, name=bucket_name)
    return storage.Blob(url_tup.path.lstrip('/'), b)
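# A hedged usage sketch for _uri_to_blob above: passing an explicit client as
# `conn` bypasses calling_format.connect, so creds can be None here. The URI
# is a placeholder.
from google.cloud import storage

client = storage.Client()
blob = _uri_to_blob(None, 'gs://example-bucket/wal/000000010000000000000001',
                    conn=client)
print(blob.exists())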
def is_dir(self, cred_id, bucket, path):
    client = self.__get_client(cred_id)
    bucket = storage.Bucket(client, bucket)
    prefix = f"{path}/"
    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.startswith(prefix):
            return True
    return False
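# A hedged usage sketch: `store` is assumed to be an instance of the class
# defining is_dir above; the credential id, bucket, and path are placeholders.
if store.is_dir('example-cred-id', 'example-bucket', 'datasets/2020'):
    print('Prefix contains at least one object.')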
def _uri_to_blob(creds, uri, conn=None):
    assert uri.startswith('gs://')
    url_tup = urlparse(uri)
    bucket_name = url_tup.netloc
    if conn is None:
        conn = calling_format.connect(creds)
    b = storage.Bucket(conn, name=bucket_name)
    # Use 10MB chunk size
    return storage.Blob(url_tup.path, b, chunk_size=1048576 * 10)
def saveKwolaFileData(self, folder, fileName, fileData, useCacheBucket=False):
    filePath = os.path.join(folder, fileName)

    if self['data_serialization_encryption_key']:
        nonceData = os.urandom(16)
        keyHash = hashlib.sha256()
        keyHash.update(
            bytes(
                self['data_serialization_encryption_key'] + folder + fileName,
                "utf8"))
        cipher = Cipher(algorithms.AES(keyHash.digest()),
                        modes.CTR(nonceData))
        encryptor = cipher.encryptor()
        fileData = nonceData + encryptor.update(
            fileData) + encryptor.finalize()
        filePath += ".enc"
    else:
        cipher = None

    if self['data_file_storage_method'] == 'local':
        # Todo - we shouldn't be making these os.path.exists calls every
        # single time we save file data. Its inefficient.
        if not os.path.exists(
                os.path.join(self.configurationDirectory, folder)):
            try:
                os.mkdir(os.path.join(self.configurationDirectory, folder))
            except FileExistsError:
                # This just means there is a race condition and multiple
                # threads attempted to create this folder at the same time.
                pass
        with open(os.path.join(self.configurationDirectory, filePath),
                  'wb') as f:
            f.write(fileData)
    elif self['data_file_storage_method'] == 'gcs':
        if 'applicationId' not in self or self.applicationId is None:
            raise RuntimeError(
                "Can't load object from google cloud storage without an applicationId, which is used to indicate the bucket."
            )
        storageClient = getSharedGCSStorageClient()
        bucketId = "kwola-testing-run-data-" + self.applicationId
        if useCacheBucket:
            bucketId += "-cache"
        applicationStorageBucket = storage.Bucket(storageClient, bucketId)
        objectBlob = storage.Blob(filePath, applicationStorageBucket)
        objectBlob.upload_from_string(fileData)
    else:
        raise RuntimeError(
            f"Unexpected value {self['data_file_storage_method']} for configuration data_file_storage_method"
        )
def up(self):
    self.project = self.config.get('project', os.environ['GCLOUD_PROJECT'])
    self.gs = storage.Client(project=self.project)
    self.bucket_name = self.config['bucket']
    self.bucket = storage.Bucket(self.gs, name=self.bucket_name)
    self.ensure_bucket()
    super().up()