def read_chunk(report: dict, chunk: int = 4096, credentials=None, start: int = 0) -> str:
    client = storage.Client(
        credentials=(credentials.get_credentials() if credentials else None))
    path_segments = report['current_path'].split('/')
    report_bucket = path_segments[-2]
    report_blob_name = path_segments[-1].split('?')[0]
    source_bucket = Bucket(client, report_bucket)
    blob = source_bucket.blob(report_blob_name)
    # `end` is an inclusive byte offset, so read `chunk` bytes from `start`
    # (the original `end=chunk` only worked when start was 0).
    data = blob.download_as_string(
        start=start, end=start + chunk - 1, raw_download=True).decode('utf-8')
    return data

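# Usage sketch for read_chunk above (hypothetical bucket/object names;
# assumes a report dict whose 'current_path' URL ends in "<bucket>/<blob>"
# and default application credentials):
def _example_read_chunk():
    report = {'current_path': 'https://storage.googleapis.com/example-bucket/report.csv'}
    header = read_chunk(report, chunk=1024)  # first 1 KiB of the object
    print(header.splitlines()[0])
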
def __getBlobUpdated(bucket: Bucket, blobName):
    """
    Takes a bucket object and blob name and returns the blob's last-updated
    timestamp.
    :param bucket: Bucket
    :param blobName: String
    :return: datetime
    """
    return bucket.get_blob(blobName).updated

def upload_from_file(bkt: storage.Bucket, blob: str, file: str, delete_file: bool = True) -> str:
    """Upload a local file to the given blob name and return its public URL,
    optionally deleting the local copy afterwards."""
    b = bkt.blob(blob_name=blob)
    b.upload_from_filename(file)
    if delete_file:
        os.remove(file)
    return b.public_url

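# Usage sketch for upload_from_file above (hypothetical bucket and paths):
def _example_upload_from_file():
    client = storage.Client()
    bkt = client.bucket('example-bucket')
    # Keep the local file around by passing delete_file=False.
    url = upload_from_file(bkt, 'reports/latest.csv', '/tmp/latest.csv', delete_file=False)
    print(url)
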
def download_metadata_from_gcs(bucket: storage.Bucket, local_sample_path: ComparisonPath) -> None:
    (local_sample_path / "operations").mkdir_p()
    prefix = str(local_sample_path)
    blobs = bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        if not blob.name.endswith('/digest.json'):
            logging.info(f'Downloading blob: {blob.name}')
            blob.download_to_filename(blob.name)

def _UploadDirectory(local_dir: str, gcs_bucket: storage.Bucket, gcs_dir: str):
    """Upload the contents of a local directory to a GCS Bucket."""
    for file_name in os.listdir(local_dir):
        path = os.path.join(local_dir, file_name)
        if not os.path.isfile(path):
            logging.info("Skipping %s as it's not a file.", path)
            continue
        logging.info("Uploading: %s", path)
        gcs_blob = gcs_bucket.blob(f"{gcs_dir}/{file_name}")
        gcs_blob.upload_from_filename(path)

def revoke_expiring_gcs_access(
    bucket: storage.Bucket, role: str, user_email: str, prefix: Optional[str] = None
):
    """Revoke a bucket IAM policy change made by calling `grant_expiring_gcs_access`."""
    # see https://cloud.google.com/storage/docs/access-control/using-iam-permissions#code-samples_3
    policy = bucket.get_iam_policy(requested_policy_version=3)
    policy.version = 3

    # find and remove all matching policy bindings for this user if any exist
    for i in range(GOOGLE_MAX_DOWNLOAD_PERMISSIONS):
        removed_binding = _find_and_pop_binding(policy, prefix, role, user_email)
        if removed_binding is None:
            if i == 0:
                warnings.warn(
                    f"Tried to revoke a non-existent download IAM permission for {user_email}/{prefix}"
                )
            break

    bucket.set_iam_policy(policy)

def read_weather_for_state_for_date(bucket: Bucket, bucket_raw_base_path: str,
                                    selected_state: str, date: datetime.date):
    yyyymmdd: str = date.strftime("%Y%m%d")
    blob = bucket.blob(
        f"{bucket_raw_base_path.format(date=yyyymmdd)}/{selected_state}.json.gz"
    )
    try:
        return json.loads(gunzip_bytes(blob.download_as_string()))
    except NotFound:
        return None

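# Usage sketch for read_weather_for_state_for_date above (hypothetical bucket
# and path template; the template must contain a "{date}" placeholder):
def _example_read_weather():
    client = storage.Client()
    bucket = client.bucket('example-weather-bucket')
    data = read_weather_for_state_for_date(
        bucket, 'raw/{date}', 'CA', datetime.date(2021, 6, 1))
    if data is None:
        print('No weather data for that date.')
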
def storage(request):
    # create a random test bucket name
    bucket_name = "test_bucket_" + get_random_string(6, string.ascii_lowercase)

    storage = DjangoGCloudStorage(
        project=request.config.getoption("--gcs-project-name"),
        bucket=bucket_name,
        credentials_file_path=request.config.getoption(
            "--gcs-credentials-file"))

    # Make sure the bucket exists
    bucket = Bucket(storage.client, bucket_name)
    bucket.create(location=request.config.getoption("--gcs-bucket-location"))

    yield storage

    storage.bucket.delete_blobs(storage.bucket.list_blobs())
    storage.bucket.delete(force=True)

def process_sqs_message(sqs_client, sqs_queue_url: str,
                        gcs_output_bucket: storage.Bucket,
                        gcs_output_prefix: str) -> None:
    """
    Process SQS Message
    :param sqs_client: AWS SQS client
    :param sqs_queue_url: AWS SQS Queue URL
    :param gcs_output_bucket: GCP GCS bucket
    :param gcs_output_prefix: GCP GCS object prefix
    """
    # Receive one message
    response = sqs_client.receive_message(QueueUrl=sqs_queue_url,
                                          MaxNumberOfMessages=1,
                                          VisibilityTimeout=0,
                                          WaitTimeSeconds=0)

    # Process message
    if 'Messages' in response and len(response['Messages']) > 0:
        message = response['Messages'][0]
        receipt_handle = message['ReceiptHandle']
        message_body = json.loads(message['Body'])
        message_id = message['MessageId']
        text = message_body['text']
        timestamp = message_body['timestamp']
        parsed_text = process_text(text)
        logger.info('Message received successfully!')

        # Send result to GCS
        result = {
            'id': message_id,
            'timestamp': timestamp,
            'text': text,
            'parsed_text': parsed_text
        }
        # json.dumps has no `encoding` argument in Python 3;
        # ensure_ascii=False already keeps non-ASCII text intact.
        result_string = json.dumps(result, ensure_ascii=False, indent=2)
        object_key = (
            f'{gcs_output_prefix.rstrip("/")}'
            f'/result_{message_id}_{datetime.fromisoformat(timestamp).strftime("%Y%m%dT%H%M%S")}.json'
        )
        blob = gcs_output_bucket.blob(object_key)
        blob.upload_from_string(result_string)
        logger.info(
            f'Sent result to `{object_key}` in `{gcs_output_bucket.name}`')

        # Delete message
        sqs_client.delete_message(QueueUrl=sqs_queue_url,
                                  ReceiptHandle=receipt_handle)
        logger.info('Message deleted successfully!')
    else:
        logger.info('No messages in queue')

def start_backfill_subscriber_if_not_running(
        gcs_client: Optional[storage.Client], bkt: storage.Bucket,
        table_prefix: str) -> Optional[storage.Blob]:
    """Start the backfill subscriber if it is not already running for this
    table prefix. Creates a backfill file for the table prefix if one does
    not exist.
    """
    if not gcs_client:
        gcs_client = storage.Client(client_info=constants.CLIENT_INFO)
    start_backfill = True
    # Do not start subscriber until START_BACKFILL_FILENAME has been dropped
    # at the table prefix.
    if constants.START_BACKFILL_FILENAME:
        start_backfill_blob = bkt.blob(
            f"{table_prefix}/{constants.START_BACKFILL_FILENAME}")
        start_backfill = start_backfill_blob.exists(client=gcs_client)
        if not start_backfill:
            print("not triggering backfill because "
                  f"gs://{start_backfill_blob.bucket.name}/"
                  f"{start_backfill_blob.name} was not found.")
    if start_backfill:
        # Create a _BACKFILL file for this table if not exists
        backfill_blob = bkt.blob(
            f"{table_prefix}/{constants.BACKFILL_FILENAME}")
        try:
            backfill_blob.upload_from_string("",
                                             if_generation_match=0,
                                             client=gcs_client)
            print("triggered backfill with "
                  f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
                  f"created at {backfill_blob.time_created}.")
            return backfill_blob
        except google.api_core.exceptions.PreconditionFailed:
            backfill_blob.reload(client=gcs_client)
            print("backfill already in progress due to: "
                  f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
                  f"created at {backfill_blob.time_created}. exiting.")
            return backfill_blob
    else:
        return None

def source_bucket(gcs_bucket: storage.Bucket, sts_service_account: str):
    """
    Yields and auto-cleans up a GCS bucket preconfigured with necessary
    STS service account read perms
    """
    # Setup policy for STS
    member: str = f"serviceAccount:{sts_service_account}"
    objectViewer = "roles/storage.objectViewer"
    bucketReader = "roles/storage.legacyBucketReader"

    # Prepare policy
    policy = gcs_bucket.get_iam_policy(requested_policy_version=3)
    policy.bindings.append({"role": objectViewer, "members": {member}})
    policy.bindings.append({"role": bucketReader, "members": {member}})

    # Set policy
    gcs_bucket.set_iam_policy(policy)

    yield gcs_bucket

def __delete_blob(bucket: Bucket, blob_name: str):
    """Deletes a blob from the bucket."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    blob = bucket.blob(blob_name)
    try:
        blob.delete()
        print("Blob {} deleted from bucket: {}.".format(blob_name, bucket))
    except NotFound:
        print("File:", blob_name, "doesn't exist in bucket:", bucket)

def storage(request):
    # create a random test bucket name
    bucket_name = "test_bucket_" + get_random_string(6, string.ascii_lowercase)

    storage = DjangoGCloudStorage(
        project=request.config.getoption("--gcs-project-name"),
        bucket=bucket_name,
        credentials_file_path=request.config.getoption("--gcs-credentials-file")
    )

    # Make sure the bucket exists
    bucket = Bucket(storage.client, bucket_name)
    bucket.location = request.config.getoption("--gcs-bucket-location")
    bucket.create()

    yield storage

    storage.bucket.delete_blobs(storage.bucket.list_blobs())
    storage.bucket.delete(force=True)

def upload_blob(source: str, destination: str):
    """Uploads a file to the bucket."""
    # source = "local/path/to/file"
    # destination = "gs://your-bucket-name/storage-object-name"
    storage_client = storage.Client()
    # Blob.from_string resolves the bucket from the gs:// URI, so a separate
    # Bucket.from_string call is unnecessary.
    blob = Blob.from_string(destination, storage_client)
    blob.upload_from_filename(source)
    print(f"File {source} uploaded to {destination}.")

def grant_expiring_gcs_access(
    bucket: storage.Bucket, role: str, user_email: str, prefix: Optional[str] = None
):
    """
    Grant `user_email` the provided `role` on a `bucket`, expiring after
    `INACTIVE_USER_DAYS` days have elapsed. By default, permissions apply to
    the whole bucket. Optionally, provide an object URL `prefix` to restrict
    this permission grant to only a portion of the objects in the given bucket.
    """
    # see https://cloud.google.com/storage/docs/access-control/using-iam-permissions#code-samples_3
    policy = bucket.get_iam_policy(requested_policy_version=3)
    policy.version = 3

    # remove the existing binding if one exists so that we can recreate it with
    # an updated TTL.
    _find_and_pop_binding(policy, prefix, role, user_email)

    binding = _build_binding_with_expiry(bucket.name, prefix, role, user_email)

    # (re)insert the binding into the policy
    policy.bindings.append(binding)
    bucket.set_iam_policy(policy)

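# Usage sketch pairing grant_expiring_gcs_access with
# revoke_expiring_gcs_access above (hypothetical bucket, role, and email):
def _example_expiring_access():
    client = storage.Client()
    bucket = client.bucket('example-data-bucket')
    grant_expiring_gcs_access(bucket, 'roles/storage.objectViewer',
                              'user@example.com', prefix='exports/')
    # ... later, revoke the same grant:
    revoke_expiring_gcs_access(bucket, 'roles/storage.objectViewer',
                               'user@example.com', prefix='exports/')
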
def timeSort(bucket: Bucket, prefix: str, num: Optional[int] = None) -> List[Image]:
    blobs = bucket.list_blobs(prefix=prefix)
    imgs = [
        Image(el.public_url) for el in blobs if el.public_url.endswith(".png")
    ]
    simgs = sorted(imgs, key=lambda x: (x.date, x.seq), reverse=True)
    if num:
        return simgs[:num]
    return simgs

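# Usage sketch for timeSort above (hypothetical bucket and prefix; Image is
# the project's own wrapper that parses date/seq from the public URL):
def _example_time_sort():
    client = storage.Client()
    bucket = client.bucket('example-images')
    latest_five = timeSort(bucket, prefix='renders/', num=5)
    for img in latest_five:
        print(img.date, img.seq)
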
def create(ctx, *args, **kwargs):
    admin_check(ctx.obj["user_id"])
    bucket = Bucket(ctx.obj["client"], name=ctx.obj["name"])
    bucket.location = kwargs["location"].upper()
    bucket.storage_class = kwargs["class"].upper()
    bucket.create()
    return f"Bucket `{bucket.name}` created."

def blob_path(bucket_object: storage.Bucket) -> str:
    """ Path of a file placed in the GCS Bucket for tests """
    filename = "sample_file.json"
    local_path = path.join(path.dirname(__file__), f"fixtures/{filename}")

    blob: storage.Blob = bucket_object.blob(filename)
    blob.upload_from_filename(local_path)
    assert blob.exists()

    yield filename

    blob.delete()

def _prepare_sync_plan(
    source_bucket: storage.Bucket,
    destination_bucket: storage.Bucket,
    source_object: Optional[str],
    destination_object: Optional[str],
    recursive: bool,
) -> Tuple[Set[storage.Blob], Set[storage.Blob], Set[storage.Blob]]:
    # Calculate the number of characters to strip from each blob name,
    # because the leading characters carry the parent object's path
    source_object_prefix_len = len(source_object) if source_object else 0
    destination_object_prefix_len = len(destination_object) if destination_object else 0
    delimiter = "/" if not recursive else None

    # Fetch blobs list
    source_blobs = list(source_bucket.list_blobs(prefix=source_object, delimiter=delimiter))
    destination_blobs = list(
        destination_bucket.list_blobs(prefix=destination_object, delimiter=delimiter)
    )

    # Create indexes that allow you to identify blobs based on their name
    source_names_index = {a.name[source_object_prefix_len:]: a for a in source_blobs}
    destination_names_index = {a.name[destination_object_prefix_len:]: a for a in destination_blobs}

    # Create sets with names without parent object name
    source_names = set(source_names_index.keys())
    destination_names = set(destination_names_index.keys())

    # Determine objects to copy and delete
    to_copy = source_names - destination_names
    to_delete = destination_names - source_names
    to_copy_blobs = {source_names_index[a] for a in to_copy}  # type: Set[storage.Blob]
    to_delete_blobs = {destination_names_index[a] for a in to_delete}  # type: Set[storage.Blob]

    # Find names that are in both buckets
    names_to_check = source_names.intersection(destination_names)
    to_rewrite_blobs = set()  # type: Set[storage.Blob]

    # Compare objects based on crc32c checksums
    for current_name in names_to_check:
        source_blob = source_names_index[current_name]
        destination_blob = destination_names_index[current_name]
        # if the objects differ, mark the source blob for rewrite
        if source_blob.crc32c != destination_blob.crc32c:
            to_rewrite_blobs.add(source_blob)

    return to_copy_blobs, to_delete_blobs, to_rewrite_blobs

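# Usage sketch for _prepare_sync_plan above (hypothetical bucket names):
# compute which blobs must be copied, deleted, and rewritten so that the
# destination mirrors the source.
def _example_sync_plan():
    client = storage.Client()
    src = client.bucket('example-src')
    dst = client.bucket('example-dst')
    to_copy, to_delete, to_rewrite = _prepare_sync_plan(
        src, dst, source_object=None, destination_object=None, recursive=True)
    print(f'copy={len(to_copy)} delete={len(to_delete)} rewrite={len(to_rewrite)}')
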
def save_arrays(arrays: Dict[str, np.ndarray], filename: str, bucket: storage.Bucket):
    """
    Saves .npz arrays (compressed in groups of 10) to cloud
    :param arrays: dict mapping IDs --> arrays to be saved
    :param filename: new filename
    :param bucket: bucket to be saved within
    :return:
    """
    out_stream = io.BytesIO()
    np.savez_compressed(out_stream, **arrays)
    out_stream.seek(0)
    out_blob = bucket.blob(filename)
    out_blob.upload_from_file(out_stream)

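# Usage sketch for save_arrays above (hypothetical bucket and array IDs):
def _example_save_arrays():
    client = storage.Client()
    bucket = client.bucket('example-arrays')
    arrays = {'sample_001': np.zeros((64, 64)), 'sample_002': np.ones((64, 64))}
    save_arrays(arrays, 'batches/batch_000.npz', bucket)
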
def download_from_bucket(
    bucket: storage.Bucket,
    bucket_file_path: str,
    local_file_path: Optional[Union[Path, str]] = None,
    force: bool = False,
) -> None:
    """
    Download the file from the bucket to the local machine.

    If local_file_path is specified, the file is downloaded to that path;
    otherwise the structure of the bucket file path is preserved under
    CACHE_DIRECTORY.

    Args:
        bucket: bucket from which to download the file
        bucket_file_path: file path in the bucket
        local_file_path: path to which we save locally
        force: whether to force the download or not

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    gs_blob = bucket.blob(bucket_file_path)
    if not gs_blob.exists():
        raise FileNotFoundError(
            f"The file {bucket_file_path} does not exist in Google Bucket '{bucket.name}'"
        )

    if local_file_path is None:
        local_file_path = CACHE_DIRECTORY / bucket_file_path
    else:
        local_file_path = _convert_file_path(local_file_path).resolve()

    should_download = force or not local_file_path.exists()
    if should_download:
        local_file_path.parent.mkdir(exist_ok=True, parents=True)
        url = gs_blob.public_url
        filename = url.split("/")[-1]
        # gs_blob.download_to_filename(local_file_path) has no progress bar,
        # so fetch via the public URL with a tqdm report hook instead.
        with TqdmUpTo(unit="B", unit_scale=True, unit_divisor=1024,
                      miniters=1, desc=filename) as t:
            urlretrieve(url, filename=local_file_path, reporthook=t.update_to)
        log.info(f"File {bucket_file_path} downloaded from Google Bucket "
                 f"'{bucket.name}' at {local_file_path}")

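# Usage sketch for download_from_bucket above (hypothetical bucket and paths):
def _example_download_from_bucket():
    client = storage.Client()
    bucket = client.bucket('example-models')
    # Cached under CACHE_DIRECTORY unless local_file_path is given;
    # force=True re-downloads even if a local copy exists.
    download_from_bucket(bucket, 'weights/model-v1.bin', force=False)
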
def sync_gcs_to_box(bucket: Bucket, box: BoxClient, cache: dict) -> List[Future]:
    # construct an executor for copy tasks
    executor = ThreadPoolExecutor(max_workers=cpu_count())
    futures = []

    for blob in bucket.list_blobs():
        if cache.get(blob.name, False):
            # Found the blob in Box
            LOG.debug("Blob {} already in Box.".format(blob.name))
        else:
            # Did not find the Blob in box
            if blob.metadata and blob.metadata.get(BOX_MTIME_KEY):
                LOG.info(
                    "Found blob {} in bucket that was synced, but no longer exists in Box. Deleting."
                    .format(blob.name))
                blob.delete()
            else:
                if blob.name[-1] == '/':
                    LOG.info(
                        "Found new folder {} not in Box. Creating.".format(
                            blob.name))
                    path = blob.name.split("/")[:-1]
                    # do this serially, as there should be few.
                    # Ideally, box_mkdir_p never misses cache when making files as the folder will sort first
                    box_mkdir_p(box, path, cache)
                else:
                    # Found a file that doesn't seem to be in Box.
                    blob_name = blob.name
                    LOG.info("Found new blob {} not in Box. Uploading.".format(
                        blob_name))
                    # split name by slashes; last item is file, the previous are folders
                    tokens = blob.name.split("/")
                    path, filename = tokens[:-1], tokens[-1]
                    target_folder = box_mkdir_p(box, path, cache)

                    # prepare the copy; bind the loop variables as lambda
                    # defaults so the deferred tasks don't capture a later
                    # iteration's values
                    temp_file = BytesIO()
                    reader = blob.download_to_file
                    writer = lambda temp, folder=target_folder, name=filename: \
                        folder.upload_stream(temp, name)
                    transfer_callback = lambda bf, name=blob_name: \
                        patch_blob_metadata(bucket, name, bf)

                    # submit the copy work
                    future = executor.submit(concurrent_upload, reader, writer,
                                             temp_file, transfer_callback)
                    futures.append(future)

    return futures

def copy_id_set(production_bucket: Bucket, build_bucket: Bucket, storage_base_path: str,
                build_bucket_base_path: str):
    """ Copies the id_set.json artifact from the build bucket to the production bucket.

    Args:
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied from.
        storage_base_path (str): the path to upload the id_set.json to.
        build_bucket_base_path (str): the path in the build bucket of the id_set.json.
    """
    build_id_set_path = os.path.join(os.path.dirname(build_bucket_base_path), 'id_set.json')
    build_id_set_blob = build_bucket.blob(build_id_set_path)

    if not build_id_set_blob.exists():
        logging.error(
            f"id_set.json file does not exist in build bucket in path: {build_id_set_path}"
        )
        sys.exit(1)

    prod_id_set_path = os.path.join(os.path.dirname(storage_base_path), 'id_set.json')
    try:
        copied_blob = build_bucket.copy_blob(
            blob=build_id_set_blob,
            destination_bucket=production_bucket,
            new_name=prod_id_set_path)
        if not copied_blob.exists():
            logging.error(
                f"Failed to upload id_set.json to {prod_id_set_path}")
            sys.exit(1)
        else:
            logging.success("Finished uploading id_set.json to storage.")
    except Exception as e:
        logging.exception(f"Failed copying ID Set. Additional Info: {str(e)}")
        sys.exit(1)

def download_and_extract_index(build_bucket: Bucket, extract_destination_path: str,
                               build_bucket_base_path: str):
    """Downloads and extracts the build index zip from cloud storage.

    Args:
        build_bucket (google.cloud.storage.bucket.Bucket): google storage bucket where build index.zip is stored.
        extract_destination_path (str): the full path of extract folder.
        build_bucket_base_path (str): the path in the build bucket of the index.
    Returns:
        str: extracted build index folder full path.
        Blob: google cloud storage object that represents build index.zip blob.
        str: downloaded build index generation.
    """
    build_index_storage_path = os.path.join(build_bucket_base_path, f"{GCPConfig.INDEX_NAME}.zip")
    download_build_index_path = os.path.join(extract_destination_path, f"{GCPConfig.INDEX_NAME}.zip")
    build_index_blob = build_bucket.blob(build_index_storage_path)
    build_index_folder_path = os.path.join(extract_destination_path, GCPConfig.INDEX_NAME)

    if not os.path.exists(extract_destination_path):
        os.mkdir(extract_destination_path)

    if not build_index_blob.exists():
        logging.error(f"No build index was found in path: {build_index_storage_path}")
        sys.exit(1)

    build_index_blob.reload()
    build_index_generation = build_index_blob.generation
    build_index_blob.download_to_filename(download_build_index_path,
                                          if_generation_match=build_index_generation)

    if os.path.exists(download_build_index_path):
        with ZipFile(download_build_index_path, 'r') as index_zip:
            index_zip.extractall(extract_destination_path)

        if not os.path.exists(build_index_folder_path):
            logging.error(f"Failed creating build {GCPConfig.INDEX_NAME} folder with extracted data.")
            sys.exit(1)

        os.remove(download_build_index_path)
        logging.success(f"Finished downloading and extracting build {GCPConfig.INDEX_NAME} file to "
                        f"{extract_destination_path}")

        return build_index_folder_path, build_index_blob, build_index_generation
    else:
        logging.error(f"Failed to download build {GCPConfig.INDEX_NAME}.zip file from cloud storage.")
        sys.exit(1)

def upload(bucket: storage.Bucket, thumb: Thumbnail) -> bool:
    blob = bucket.blob(str(thumb.path))
    blob.upload_from_string(thumb.content, thumb.mimetype)
    logger.info('Uploaded {}.', thumb.path)
    # TODO: Copy ACL from original image
    try:
        blob.make_public()
    except ServiceUnavailable as e:
        logger.error('Failed to make {} public.\nError: {}', blob.path, e)
    meta = {'Generator': f'Thunagen v{__version__}'}
    blob.metadata = meta
    try:
        blob.patch()
        logger.debug('Made {} public and set metadata {}', thumb.path, meta)
    except NotFound:
        logger.error('{} was deleted by someone.', blob.path)
    return True

def get_files(client: storage.Client, bucket: storage.Bucket) -> List[dict]:
    """Retrieves all files in a given GCS bucket

    Args:
        client: Object representing Python GCS client
        bucket: google.cloud.storage.Bucket holding bucket name

    Returns:
        List of dicts
        [{
            name: String holding file name,
            type: String representing type of file, e.g. 'audio/flac'.
        }]
    """
    bucket = client.get_bucket(bucket)
    return [{
        'name': blob.name,
        'type': blob.content_type
    } for blob in list(bucket.list_blobs())]

def _UploadBuildResults(gcs_bucket: storage.Bucket, gcs_build_results_dir: str):
    """Uploads all build results to Google Cloud Storage."""
    logging.info("Will upload build results to gs://%s/%s.",
                 os.environ[_GCS_BUCKET], gcs_build_results_dir)
    for build_result in os.listdir(flags.FLAGS.build_results_dir):
        path = os.path.join(flags.FLAGS.build_results_dir, build_result)
        if not os.path.isfile(path):
            logging.info("Skipping %s as it's not a file.", path)
            continue
        logging.info("Uploading: %s", path)
        gcs_blob = gcs_bucket.blob("{}/{}".format(gcs_build_results_dir, build_result))
        gcs_blob.upload_from_filename(path)
    logging.info("GCS upload done.")

def download_job_manifest(bucket: Bucket, job_id: str) -> JobManifest:
    """
    Download the JobManifest associated with job_id in given bucket.

    Parameters
    ----------
    bucket : google.cloud.storage.Bucket
        The GCS bucket where job data is stored.
    job_id : str
        The ID of the job.

    Returns
    -------
    JobManifest
    """
    path = f"thor_jobs/v1/job-{job_id}/manifest.json"
    as_str = bucket.blob(path).download_as_string()
    return JobManifest.from_str(as_str)

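# Usage sketch for download_job_manifest above (hypothetical bucket name and
# job ID):
def _example_download_job_manifest():
    client = storage.Client()
    bucket = client.bucket('example-thor-jobs')
    manifest = download_job_manifest(bucket, 'a1b2c3')
    print(manifest)
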
def testGetPathContentsSubDir(self):
    requests = ['dummy_bucket1/subdir/', 'dummy_bucket1/subdir']
    dummy_bucket1 = Bucket(client=Mock(), name='dummy_bucket1')
    gcs_buckets = [
        dummy_bucket1,
    ]
    gcs_blobs = [
        Blob(name='subdir/dummy_file', bucket=dummy_bucket1),
        Blob(name='subdir/dummy_dir/', bucket=dummy_bucket1),
    ]
    storage_client = Mock()
    storage_client.list_buckets = MagicMock(return_value=gcs_buckets)
    storage_client.list_blobs = MagicMock(return_value=gcs_blobs)
    wanted = {
        'type': 'directory',
        'content': [
            {
                'name': 'dummy_file',
                'path': 'dummy_bucket1/subdir/dummy_file',
                'type': 'file',
                'last_modified': '',
            },
            {
                'name': 'dummy_dir/',
                'path': 'dummy_bucket1/subdir/dummy_dir/',
                'type': 'directory',
                'last_modified': '',
            },
        ]
    }
    for req in requests:
        got = handlers.getPathContents(req, storage_client)
        self.assertEqual(wanted['content'], got['content'])

    with self.assertRaises(handlers.FileNotFound):
        req = 'dummy_bucket1/sub'
        handlers.getPathContents(req, storage_client)

def testGetPathContentsDirEmpty(self):
    requests = ['dummy_bucket1/', 'dummy_bucket1']
    dummy_bucket1 = Bucket(client=Mock(), name='dummy_bucket1')
    gcs_buckets = [
        dummy_bucket1,
    ]
    gcs_blobs = []
    storage_client = Mock()
    storage_client.list_buckets = MagicMock(return_value=gcs_buckets)
    storage_client.list_blobs = MagicMock(return_value=gcs_blobs)
    wanted = {'type': 'directory', 'content': []}
    for req in requests:
        got = handlers.getPathContents(req, storage_client)
        self.assertDictEqual(wanted, got)

def to_public_png(npy_blob: storage.Blob, public_bucket: storage.Bucket):
    """
    Converts a .npy blob into a png file and uploads it to the public bucket.
    :param npy_blob: source blob holding the .npy array
    :param public_bucket: destination bucket for the rendered .png
    :return:
    """
    npy_filepath = f'/tmp/{npy_blob.name.split("/")[-1]}'
    npy_blob.download_to_filename(npy_filepath)
    arr = np.load(npy_filepath)

    png_filepath = npy_filepath.replace('.npy', '.png')
    plt.imsave(png_filepath, arr)

    png_blob_name = npy_blob.name.replace('.npy', '.png')
    png_blob = public_bucket.blob(png_blob_name)
    png_blob.upload_from_filename(png_filepath)

    os.remove(npy_filepath)
    os.remove(png_filepath)

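# Usage sketch for to_public_png above (hypothetical bucket and object names):
def _example_to_public_png():
    client = storage.Client()
    private_bucket = client.bucket('example-private-arrays')
    public_bucket = client.bucket('example-public-images')
    npy_blob = private_bucket.blob('heatmaps/run_42.npy')
    to_public_png(npy_blob, public_bucket)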