def _UpdateLatestServerDebDirectory(gcs_bucket: storage.Bucket,
                                    gcs_build_results_dir: str):
    """Updates the '_latest_server_deb' GCS directory with the latest results."""
    logging.info("Updating latest server deb directory.")

    old_build_results = list(
        gcs_bucket.list_blobs(prefix=_LATEST_SERVER_DEB_GCS_DIR))
    new_build_results = list(
        gcs_bucket.list_blobs(prefix=gcs_build_results_dir))
    if not new_build_results:
        raise GCSUploadError(
            "Failed to find build results for the server-deb Travis job.")

    for gcs_blob in old_build_results:
        logging.info("Deleting previous blob: %s", gcs_blob)
        gcs_blob.delete()

    for gcs_blob in new_build_results:
        build_result_filename = gcs_blob.name.split("/")[-1]
        latest_build_result_path = (
            f"{_LATEST_SERVER_DEB_GCS_DIR}/{build_result_filename}")
        logging.info("Copying blob %s (%s) -> %s", gcs_blob, gcs_bucket,
                     latest_build_result_path)
        gcs_bucket.copy_blob(gcs_blob, gcs_bucket,
                             new_name=latest_build_result_path)
def _prepare_sync_plan(
    source_bucket: storage.Bucket,
    destination_bucket: storage.Bucket,
    source_object: Optional[str],
    destination_object: Optional[str],
    recursive: bool,
) -> Tuple[Set[storage.Blob], Set[storage.Blob], Set[storage.Blob]]:
    # Calculate the number of characters to strip from each blob name,
    # because that prefix only carries information about the parent's path.
    source_object_prefix_len = len(source_object) if source_object else 0
    destination_object_prefix_len = len(destination_object) if destination_object else 0
    delimiter = "/" if not recursive else None

    # Fetch the blob lists
    source_blobs = list(
        source_bucket.list_blobs(prefix=source_object, delimiter=delimiter))
    destination_blobs = list(
        destination_bucket.list_blobs(prefix=destination_object, delimiter=delimiter))

    # Create indexes that allow blobs to be looked up by name
    source_names_index = {
        a.name[source_object_prefix_len:]: a for a in source_blobs
    }
    destination_names_index = {
        a.name[destination_object_prefix_len:]: a for a in destination_blobs
    }

    # Create sets of names without the parent object name
    source_names = set(source_names_index.keys())
    destination_names = set(destination_names_index.keys())

    # Determine objects to copy and delete
    to_copy = source_names - destination_names
    to_delete = destination_names - source_names
    to_copy_blobs = {source_names_index[a] for a in to_copy}  # type: Set[storage.Blob]
    to_delete_blobs = {destination_names_index[a] for a in to_delete}  # type: Set[storage.Blob]

    # Find names that exist in both buckets
    names_to_check = source_names.intersection(destination_names)
    to_rewrite_blobs = set()  # type: Set[storage.Blob]
    # Compare objects based on their crc32c checksums
    for current_name in names_to_check:
        source_blob = source_names_index[current_name]
        destination_blob = destination_names_index[current_name]
        # If the objects differ, mark the source blob for rewrite
        if source_blob.crc32c != destination_blob.crc32c:
            to_rewrite_blobs.add(source_blob)

    return to_copy_blobs, to_delete_blobs, to_rewrite_blobs
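A minimal sketch (not part of the source) of how the three sets returned by _prepare_sync_plan could be consumed: new blobs are copied, stale ones deleted, and changed ones copied over their destination counterparts. The bucket names are placeholders, and both prefixes are assumed identical so source blob names can be reused as destination names.

from google.cloud import storage

client = storage.Client()
source_bucket = client.bucket("example-source-bucket")            # placeholder name
destination_bucket = client.bucket("example-destination-bucket")  # placeholder name

to_copy, to_delete, to_rewrite = _prepare_sync_plan(
    source_bucket, destination_bucket,
    source_object="data/", destination_object="data/", recursive=True)

for blob in to_copy | to_rewrite:
    # copy_blob overwrites an existing destination object of the same name
    source_bucket.copy_blob(blob, destination_bucket, new_name=blob.name)
for blob in to_delete:
    blob.delete()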
def get_latest_version_from_bucket(pack_id: str, production_bucket: Bucket) -> str:
    """ Retrieves the latest version of pack in the bucket

    Args:
        pack_id (str): The pack id to retrieve the latest version
        production_bucket (Bucket): The GCS production bucket

    Returns:
        The latest version of the pack as it is in the production bucket
    """
    pack_bucket_path = os.path.join(GCPConfig.STORAGE_BASE_PATH, pack_id)
    logging.debug(
        f'Trying to get latest version for pack {pack_id} from bucket path {pack_bucket_path}')
    # Add a '/' at the end of the prefix to search for the exact pack id
    pack_versions_paths = [
        f.name for f in production_bucket.list_blobs(prefix=f'{pack_bucket_path}/')
        if f.name.endswith('.zip')
    ]
    pack_versions = [
        LooseVersion(PACK_PATH_VERSION_REGEX.findall(path)[0]) for path in pack_versions_paths
    ]
    logging.debug(f'Found the following zips for {pack_id} pack: {pack_versions}')
    if pack_versions:
        pack_latest_version = max(pack_versions).vstring
        return pack_latest_version
    else:
        logging.error(
            f'Could not find any versions for pack {pack_id} in bucket path {pack_bucket_path}')
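Neither PACK_PATH_VERSION_REGEX nor GCPConfig.STORAGE_BASE_PATH is defined in this excerpt. The short sketch below uses a hypothetical pattern with the same intent, to illustrate why LooseVersion (rather than plain string comparison) is used to pick the latest pack version.

import re
from distutils.version import LooseVersion

# Hypothetical stand-in for PACK_PATH_VERSION_REGEX: capture the "x.y.z" path segment.
PACK_PATH_VERSION_REGEX = re.compile(r'/(\d+\.\d+\.\d+)/')

paths = ['content/packs/Example/1.9.0/Example.zip',
         'content/packs/Example/1.10.0/Example.zip']
versions = [LooseVersion(PACK_PATH_VERSION_REGEX.findall(path)[0]) for path in paths]
print(max(versions).vstring)  # '1.10.0' -- plain string comparison would pick '1.9.0'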
def clear_remote_dags_bucket(bucket: Bucket):
    i = 0
    for blob in bucket.list_blobs(prefix='dags'):
        if blob.name not in ['dags/', 'dags/airflow_monitoring.py']:
            print(f"deleting file {_blob_uri(blob)}")
            blob.delete()
            i += 1
    print(f"{i} files deleted")
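The _blob_uri helper is not defined in this excerpt; a plausible one-liner that formats the gs:// URI printed above might look like this.

def _blob_uri(blob) -> str:
    # Hypothetical helper: build a gs:// URI from the blob's bucket and object name.
    return f"gs://{blob.bucket.name}/{blob.name}"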
def download_metadata_from_gcs(bucket: storage.Bucket,
                               local_sample_path: ComparisonPath) -> None:
    (local_sample_path / "operations").mkdir_p()
    prefix = str(local_sample_path)
    blobs = bucket.list_blobs(prefix=prefix)

    for blob in blobs:
        if not blob.name.endswith('/digest.json'):
            logging.info(f'Downloading blob: {blob.name}')
            blob.download_to_filename(blob.name)
def timeSort(bucket: Bucket, prefix: str, num: Optional[int] = None) -> List[Image]:
    blobs = bucket.list_blobs(prefix=prefix)
    imgs = [Image(el.public_url) for el in blobs if el.public_url.endswith(".png")]
    simgs = sorted(imgs, key=lambda x: (x.date, x.seq), reverse=True)
    if num:
        return simgs[:num]
    return simgs
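A short usage sketch (bucket name and prefix are placeholders) that fetches the ten most recent PNGs under a prefix, newest first.

from google.cloud import storage

bucket = storage.Client().bucket("example-images-bucket")  # placeholder bucket
latest = timeSort(bucket, prefix="captures/", num=10)
for img in latest:
    print(img.date, img.seq)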
def sync_gcs_to_box(bucket: Bucket, box: BoxClient, cache: dict) -> List[Future]:
    # construct an executor for copy tasks
    executor = ThreadPoolExecutor(max_workers=cpu_count())
    futures = []

    for blob in bucket.list_blobs():
        if cache.get(blob.name, False):
            # Found the blob in Box
            LOG.debug("Blob {} already in Box.".format(blob.name))
        else:
            # Did not find the blob in Box
            if blob.metadata and blob.metadata[BOX_MTIME_KEY]:
                LOG.info(
                    "Found blob {} in bucket that was synced, but no longer exists in Box. Deleting."
                    .format(blob.name))
                blob.delete()
            else:
                if blob.name[-1] == '/':
                    LOG.info("Found new folder {} not in Box. Creating.".format(blob.name))
                    path = blob.name.split("/")[:-1]
                    # Do this serially, as there should be few.
                    # Ideally, box_mkdir_p never misses the cache when making files,
                    # since the folder sorts first.
                    box_mkdir_p(box, path, cache)
                else:
                    # Found a file that doesn't seem to be in Box.
                    blob_name = blob.name
                    LOG.info("Found new blob {} not in Box. Uploading.".format(blob_name))
                    # Split the name by slashes; the last item is the file, the rest are folders.
                    tokens = blob.name.split("/")
                    path, filename = tokens[:-1], tokens[-1]
                    target_folder = box_mkdir_p(box, path, cache)

                    # Prepare the copy
                    temp_file = BytesIO()
                    reader = blob.download_to_file
                    writer = lambda temp: target_folder.upload_stream(temp, filename)
                    transfer_callback = lambda bf: patch_blob_metadata(bucket, blob_name, bf)

                    # Submit the copy work
                    future = executor.submit(concurrent_upload, reader, writer,
                                             temp_file, transfer_callback)
                    futures.append(future)

    return futures
def get_files(client: storage.Client, bucket: storage.Bucket) -> List[dict]:
    """Retrieves all files in a given GCS bucket

    Args:
        client: Object representing Python GCS client
        bucket: google.cloud.storage.Bucket holding bucket name

    Returns:
        List of dicts [{
            name: String holding file name,
            type: String representing type of file, e.g. 'audio/flac'.
        }]
    """
    bucket = client.get_bucket(bucket)
    return [{
        'name': blob.name,
        'type': blob.content_type
    } for blob in list(bucket.list_blobs())]
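A hedged usage sketch: although the parameter is annotated storage.Bucket, get_files passes it straight to client.get_bucket(), which also accepts a bucket name, so a string works as well; the name below is a placeholder.

from google.cloud import storage

client = storage.Client()
for entry in get_files(client, "example-audio-bucket"):  # placeholder bucket name
    print(entry['name'], entry['type'])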
def get_latest_version_from_bucket(pack_id: str, production_bucket: Bucket) -> str:
    """ Retrieves the latest version of pack in the bucket

    Args:
        pack_id (str): The pack id to retrieve the latest version
        production_bucket (Bucket): The GCS production bucket

    Returns:
        The latest version of the pack as it is in the production bucket
    """
    pack_bucket_path = os.path.join(GCPConfig.STORAGE_BASE_PATH, pack_id)
    # Add a '/' at the end of the prefix to search for the exact pack id
    pack_versions_paths = [
        f.name for f in production_bucket.list_blobs(prefix=f'{pack_bucket_path}/')
        if f.name.endswith('.zip')
    ]
    pack_versions = [
        LooseVersion(PACK_PATH_VERSION_REGEX.findall(path)[0]) for path in pack_versions_paths
    ]
    pack_latest_version = max(pack_versions).vstring
    return pack_latest_version