def download_and_extract_index(build_bucket: Bucket,
                               extract_destination_path: str,
                               build_bucket_base_path: str):
    """Downloads and extracts production and build indexes zip from cloud storage.

    Args:
        build_bucket (google.cloud.storage.bucket.Bucket): google storage bucket where build index.zip is stored.
        extract_destination_path (str): the full path of extract folder.
        build_bucket_base_path (str): the path in the build bucket of the index.
    Returns:
        str: extracted build index folder full path.
        Blob: google cloud storage object that represents prod index.zip blob.
        Blob: google cloud storage object that represents build index.zip blob.
        str: downloaded prod index generation.
        str: downloaded build index generation.

    """
    build_index_storage_path = os.path.join(build_bucket_base_path,
                                            f"{GCPConfig.INDEX_NAME}.zip")
    download_build_index_path = os.path.join(extract_destination_path,
                                             f"{GCPConfig.INDEX_NAME}.zip")

    build_index_blob = build_bucket.blob(build_index_storage_path)
    build_index_folder_path = os.path.join(extract_destination_path,
                                           GCPConfig.INDEX_NAME)

    if not os.path.exists(extract_destination_path):
        os.mkdir(extract_destination_path)

    if not build_index_blob.exists():
        logging.error(
            f"No build index was found in path: {build_index_storage_path}")
        sys.exit(1)

    build_index_blob.reload()
    build_index_generation = build_index_blob.generation
    build_index_blob.download_to_filename(
        download_build_index_path, if_generation_match=build_index_generation)

    if os.path.exists(download_build_index_path):
        with ZipFile(download_build_index_path, 'r') as index_zip:
            index_zip.extractall(extract_destination_path)

        if not os.path.exists(build_index_folder_path):
            logging.error(
                f"Failed creating build {GCPConfig.INDEX_NAME} folder with extracted data."
            )
            sys.exit(1)

        os.remove(download_build_index_path)
        logging.success(
            f"Finished downloading and extracting build {GCPConfig.INDEX_NAME} file to "
            f"{extract_destination_path}")

        return build_index_folder_path, build_index_blob, build_index_generation
    else:
        logging.error(
            f"Failed to download build {GCPConfig.INDEX_NAME}.zip file from cloud storage."
        )
        sys.exit(1)
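
# Usage sketch (not from the original project); the bucket name and paths below
# are placeholders. Assumes a google-cloud-storage client with access to the
# build bucket and that download_and_extract_index above is importable.
from google.cloud import storage

client = storage.Client()
build_bucket = client.bucket("my-build-bucket")
index_folder, index_blob, index_generation = download_and_extract_index(
    build_bucket=build_bucket,
    extract_destination_path="/tmp/extract",
    build_bucket_base_path="builds/1234/content/packs",
)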
Example 2
def handle_duplicate_notification(bkt: storage.Bucket,
                                  success_blob: storage.Blob, gsurl: str):
    """
    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success file
    with the same creation time.
    """
    success_blob.reload()
    success_created_unix_timestamp = success_blob.time_created.timestamp()

    claim_blob: storage.Blob = bkt.blob(
        success_blob.name.replace(
            SUCCESS_FILENAME, f"_claimed_{success_created_unix_timestamp}"))
    try:
        claim_blob.upload_from_string("", if_generation_match=0)
    except google.api_core.exceptions.PreconditionFailed as err:
        raise RuntimeError(
            f"The prefix {gsurl} appears to already have been claimed for "
            f"{gsurl}{SUCCESS_FILENAME} with created timestamp "
            f"{success_created_unix_timestamp}. "
            "This means that another invocation of this cloud function has "
            "claimed the ingestion of this batch. "
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err
Example 3
    def copy_to_gcs(bucket_name: str,
                    report: Dict[str, Any],
                    credentials: Credentials = None):
        """copy from one bucket to another

    This is a copy from the bucket defined in the report definition (as DV360
    stores its reports in GCS) into the monitored bucket for upload. It's
    BLAZING fast, to the extent that there is essentially no limit on the
    maximum size of a DV360 report we can handle.

    The destination file name is the report's id.

    Arguments:
        bucket_name (str):  destination bucket name
        report (Dict[str, Any]):  report definition
        credentials (Credentials):  authentication, if needed
    """
        client = storage.Client(
            credentials=(credentials.credentials if credentials else None))

        path_segments = report['current_path'].split('/')
        report_bucket = path_segments[-2]
        report_blob_name = path_segments[-1].split('?')[0]
        output_blob_name = report['id']

        source_bucket = Bucket(client, report_bucket)
        source_blob = source_bucket.blob(report_blob_name)

        destination_bucket = client.get_bucket(bucket_name)
        source_bucket.copy_blob(source_blob, destination_bucket,
                                '{id}.csv'.format(id=output_blob_name))

        logging.info('File {report} copied from {source} to {bucket}.'.format(
            report=report_blob_name, bucket=bucket_name, source=report_bucket))
Example 4
    def rename(bucket: str,
               source: str,
               destination: str,
               credentials: Credentials = None):
        """Rename a file.

    This is a copy/delete action as GCS has no actual rename option, however as
    it is all within GCS it is BLAZING fast, to the extent that there is
    essentially no limit on the maximum size of file we can rename.

    Arguments:
        bucket (str):  destination bucket name
        source (str):  current name
        destination (str):  new name
        credentials (Credentials):  authentication, if needed
    """
        client = storage.Client(
            credentials=(credentials.credentials if credentials else None))

        source_bucket = Bucket(client, name=bucket)
        source_blob = source_bucket.blob(blob_name=source)

        destination_bucket = client.get_bucket(bucket)
        source_bucket.copy_blob(source_blob, destination_bucket, destination)
        source_blob.delete()

        logging.info('Renamed file %s as %s in %s.', source, destination,
                     bucket)
Example 5
def upload_labels(chunks: List[Chunk], data_name: str,
                  data_bucket: storage.Bucket) -> None:
    """
    Uploads the labels for a single patient's chunks to
    gs://elvos/processed3d/{data_name}/labels/ as a
    csv file named {patient_id}.csv.

    Note that the position tuple is converted into a hyphen-delimited
    string as in upload_chunks_npz

    :param chunks:
    :param data_name:
    :param data_bucket:
    :return:
    """
    patient_id = chunks[0].patient_id
    labels = []
    for chunk in chunks:
        position_str = '-'.join([str(coord) for coord in chunk.position])
        labels.append((chunk.patient_id, position_str, chunk.label))

    df = pd.DataFrame(data=labels, columns=['patient_id', 'position', 'label'])
    tmp_filepath = f'/tmp/labels-{patient_id}.csv'
    df.to_csv(tmp_filepath, index_label=False)

    blob = data_bucket.blob(f'processed3d/{data_name}/labels/{patient_id}.csv')
    logging.info(f'uploading csv file: {blob.name}')
    blob.upload_from_filename(tmp_filepath)
    os.remove(tmp_filepath)
Example 6
    def get_report_file(report: dict,
                        credentials: Credentials = None) -> storage.Blob:
        """get_report_file

    Find and return just the blob. We'll use this in DV360 to be able to stream
    the file in pieces so we can drop out the footer.

    Arguments:
        report (dict):  report definition

    Keyword Arguments:
        credentials (Credentials):  authentication, if needed (default: {None})

    Returns:
        storage.Blob: the report file blob
    """
        client = storage.Client(
            credentials=(credentials.credentials if credentials else None))

        path_segments = report['current_path'].split('/')
        report_bucket = path_segments[-2]
        report_blob_name = path_segments[-1].split('?')[0]

        source_bucket = Bucket(client, report_bucket)
        blob = source_bucket.blob(report_blob_name)
        return blob
Example 7
def upload_chunks_npz(chunks: List[Chunk], data_name: str,
                      data_bucket: storage.Bucket) -> None:
    """
    Uploads chunks to gs://elvos/processed3d/{data_name}/arrays/
    as a .npz file ({patient_id}.npz). The NPZ files contain the position
    tuple as a key and the array as the value.

    Example:

    The chunk with position (64, 96, 128) will be saved as:
        '64-96-128': array(...

    :param chunks: a list of chunks with the same patient id.
    :param data_name:
    :param data_bucket:
    :return:
    """
    patient_id = chunks[0].patient_id
    blob = data_bucket.blob(f'processed3d/{data_name}/arrays/{patient_id}.npz')
    chunk_dict = {}
    for c in chunks:
        position_str = '-'.join([str(coord) for coord in c.position])
        chunk_dict[position_str] = c.array

    stream = io.BytesIO()
    np.savez(stream, **chunk_dict)
    stream.seek(0)

    logging.info(f'uploading npz file: {blob.name}')
    blob.upload_from_file(stream)
    stream.close()
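
# Companion sketch (assumption, not shown in the original): reading the .npz
# back from the same bucket layout used by upload_chunks_npz. Assumes
# google-cloud-storage >= 1.31 for Blob.download_as_bytes.
import io

import numpy as np
from google.cloud import storage


def download_chunks_npz(data_bucket: storage.Bucket, data_name: str,
                        patient_id: str) -> dict:
    blob = data_bucket.blob(f'processed3d/{data_name}/arrays/{patient_id}.npz')
    stream = io.BytesIO(blob.download_as_bytes())
    with np.load(stream) as npz:
        # Keys are the hyphen-delimited positions, e.g. '64-96-128'.
        return {key: npz[key] for key in npz.files}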
Example 8
def copy_id_set(production_bucket: Bucket, build_bucket: Bucket, storage_base_path: str, build_bucket_base_path: str):
    """ Copies the id_set.json artifact from the build bucket to the production bucket.

    Args:
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied from.
        storage_base_path (str): the path to upload the id_set.json to.
        build_bucket_base_path (str): the path in the build bucket of the id_set.json.
    """

    build_id_set_path = os.path.join(os.path.dirname(build_bucket_base_path), 'id_set.json')
    build_id_set_blob = build_bucket.blob(build_id_set_path)

    if not build_id_set_blob.exists():
        logging.error(f"id_set.json file does not exists in build bucket in path: {build_id_set_path}")
        sys.exit(1)

    prod_id_set_path = os.path.join(os.path.dirname(storage_base_path), 'id_set.json')
    try:
        copied_blob = build_bucket.copy_blob(
            blob=build_id_set_blob, destination_bucket=production_bucket, new_name=prod_id_set_path
        )
        if not copied_blob.exists():
            logging.error(f"Failed to upload id_set.json to {prod_id_set_path}")
            sys.exit(1)
        else:
            logging.success("Finished uploading id_set.json to storage.")
    except Exception as e:
        logging.exception(f"Failed copying ID Set. Additional Info: {str(e)}")
        sys.exit(1)
Example 9
def copy_id_set(production_bucket: Bucket, build_bucket: Bucket):
    """ Copies the id_set.json artifact from the build bucket to the production bucket.

    Args:
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where id_set is copied from.
    """

    build_id_set_path = os.path.join(
        os.path.dirname(GCPConfig.BUILD_BASE_PATH), 'id_set.json')
    build_id_set_blob = build_bucket.blob(build_id_set_path)

    if not build_id_set_blob.exists():
        logging.error(
            f"id_set.json file does not exists in build bucket in path: {build_id_set_path}"
        )

    prod_id_set_path = os.path.join(
        os.path.dirname(GCPConfig.STORAGE_BASE_PATH), 'id_set.json')
    copied_blob = build_bucket.copy_blob(blob=build_id_set_blob,
                                         destination_bucket=production_bucket,
                                         new_name=prod_id_set_path)

    if not copied_blob.exists():
        logging.error(f"Failed to upload id_set.json to {prod_id_set_path}")
    else:
        logging.success("Finished uploading id_set.json to storage.")
Example 10
def start_backfill_subscriber_if_not_running(
        gcs_client: Optional[storage.Client], bkt: storage.Bucket,
        table_prefix: str) -> Optional[storage.Blob]:
    """start the backfill subscriber if it is not already runnning for this
    table prefix.

    created a backfill file for the table prefix if not exists.
    """
    if gcs_client is None:
        gcs_client = storage.Client(client_info=constants.CLIENT_INFO)
    start_backfill = True

    # Do not start subscriber if a START_BACKFILL_FILENAME has been defined
    # in an environment variable and the file has not yet been dropped
    # at the table prefix.
    if constants.START_BACKFILL_FILENAME:
        start_backfill_blob = bkt.blob(
            f"{table_prefix}/{constants.START_BACKFILL_FILENAME}")
        start_backfill = start_backfill_blob.exists(client=gcs_client)
        if not start_backfill:
            print("Not triggering backfill because"
                  f"gs://{start_backfill_blob.bucket.name}/"
                  f"{start_backfill_blob.name} was not found.")

    if start_backfill:
        # Create a _BACKFILL file for this table if not exists
        backfill_blob = bkt.blob(
            f"{table_prefix}/{constants.BACKFILL_FILENAME}")
        try:
            backfill_blob.upload_from_string(
                "",
                # Setting if_generation_match below to 0 makes the operation
                # succeed only if there are no live versions of the blob.
                if_generation_match=0,
                client=gcs_client)
            print("triggered backfill with "
                  f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
                  f"created at {backfill_blob.time_created}.")
            return backfill_blob
        except google.api_core.exceptions.PreconditionFailed:
            backfill_blob.reload(client=gcs_client)
            print("backfill already in progress due to: "
                  f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
                  f"created at {backfill_blob.time_created}. exiting.")
            return backfill_blob
    else:
        return None
Example 11
def upload_core_packs_config(production_bucket: Bucket, build_number: str, extract_destination_path: str,
                             build_bucket: Bucket):
    """Uploads corepacks.json file configuration to bucket. Corepacks file includes core packs for server installation.

     Args:
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where core packs config is uploaded.
        build_number (str): CircleCI build number.
        extract_destination_path (str): Full path of folder to extract the corepacks file
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where core packs config is downloaded from.

    """
    # download the corepacks.json stored in the build bucket to temp dir
    build_corepacks_file_path = os.path.join(GCPConfig.BUILD_BASE_PATH, GCPConfig.CORE_PACK_FILE_NAME)
    build_corepacks_blob = build_bucket.blob(build_corepacks_file_path)

    if not build_corepacks_blob.exists():
        logging.critical(f"{GCPConfig.CORE_PACK_FILE_NAME} is missing in {build_bucket.name} bucket, exiting...")
        sys.exit(1)

    temp_corepacks_file_path = os.path.join(extract_destination_path, GCPConfig.CORE_PACK_FILE_NAME)
    build_corepacks_blob.download_to_filename(temp_corepacks_file_path)
    corepacks_file = load_json(temp_corepacks_file_path)

    # change the storage paths to the prod bucket
    corepacks_list = corepacks_file.get('corePacks', [])
    try:
        corepacks_list = [os.path.join(GCPConfig.GCS_PUBLIC_URL, production_bucket.name, GCPConfig.STORAGE_BASE_PATH,
                                       LATEST_ZIP_REGEX.findall(corepack_path)[0]) for corepack_path in corepacks_list]
    except IndexError:
        corepacks_list_str = '\n'.join(corepacks_list)
        logging.exception(f"GCS paths in build bucket corepacks.json file are not of format: "
                          f"{GCPConfig.GCS_PUBLIC_URL}/<BUCKET_NAME>/.../content/packs/...\n"
                          f"List of build bucket corepacks paths:\n{corepacks_list_str}")
        sys.exit(1)

    # construct core pack data with public gcs urls
    core_packs_data = {
        'corePacks': corepacks_list,
        'buildNumber': build_number
    }

    # upload core pack json file to gcs
    prod_corepacks_file_path = os.path.join(GCPConfig.STORAGE_BASE_PATH, GCPConfig.CORE_PACK_FILE_NAME)
    prod_corepacks_blob = production_bucket.blob(prod_corepacks_file_path)
    prod_corepacks_blob.upload_from_string(json.dumps(core_packs_data, indent=4))

    logging.success(f"Finished uploading {GCPConfig.CORE_PACK_FILE_NAME} to storage.")
Example 12
def download_blob_to_temp_file(media_bucket: Bucket, file_key) -> Optional[str]:
    """
    Returns the filename where the image was downloaded to (if any).

    If a file name was returned, it MUST be freed using `os.remove(_)`.
    """
    _, temp_local_filename = tempfile.mkstemp()

    try:
        media_bucket.blob(file_key).download_to_filename(temp_local_filename)
        # print(f"Image {file_key} was downloaded to {temp_local_filename}")
        return temp_local_filename

    except exceptions.NotFound:
        print(f"Image {file_key} was not found in media bucket!")
        os.remove(temp_local_filename)
        return None
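
# Usage sketch (hypothetical bucket and file names): the caller owns the temp
# file returned by download_blob_to_temp_file, so remove it in a finally block
# as the docstring requires.
import os

from google.cloud import storage

media_bucket = storage.Client().bucket("my-media-bucket")
temp_path = download_blob_to_temp_file(media_bucket, "images/example.png")
if temp_path is not None:
    try:
        with open(temp_path, "rb") as image_file:
            image_bytes = image_file.read()  # ... process the image ...
    finally:
        os.remove(temp_path)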
Example 13
class GCPStorageOutput(BlobOutput):
    def __init__(self, ctx, config=None):
        super().__init__(ctx, config)
        self.bucket = Bucket(StorageClient(), self.bucket)

    def upload_file(self, path, key):
        blob = self.bucket.blob(key)
        blob.upload_from_filename(path)
Example 14
def upload_from_file(bkt: storage.Bucket,
                     blob: str,
                     file: str,
                     delete_file: bool = True) -> str:
    b = bkt.blob(blob_name=blob)
    b.upload_from_filename(file)
    if delete_file:
        os.remove(file)
    return b.public_url
Example 15
def _UploadDirectory(local_dir: str, gcs_bucket: storage.Bucket, gcs_dir: str):
    """Upload the contents of a local directory to a GCS Bucket."""
    for file_name in os.listdir(local_dir):
        path = os.path.join(local_dir, file_name)
        if not os.path.isfile(path):
            logging.info("Skipping %s as it's not a file.", path)
            continue
        logging.info("Uploading: %s", path)
        gcs_blob = gcs_bucket.blob(f"{gcs_dir}/{file_name}")
        gcs_blob.upload_from_filename(path)
Example 16
def read_weather_for_state_for_date(bucket: Bucket, bucket_raw_base_path: str,
                                    selected_state: str, date: datetime.date):
    yyyymmdd: str = date.strftime("%Y%m%d")
    blob = bucket.blob(
        f"{bucket_raw_base_path.format(date=yyyymmdd)}/{selected_state}.json.gz"
    )
    try:
        return json.loads(gunzip_bytes(blob.download_as_string()))
    except NotFound:
        return None
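
# gunzip_bytes is referenced above but not shown; a minimal sketch of what it
# presumably does (decompress gzip-compressed bytes to text) could look like this:
import gzip


def gunzip_bytes(data: bytes, encoding: str = "utf-8") -> str:
    return gzip.decompress(data).decode(encoding)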
Example 17
def process_sqs_message(sqs_client, sqs_queue_url: str,
                        gcs_output_bucket: storage.Bucket,
                        gcs_output_prefix: str) -> None:
    """
    Process SQS Message

    :param sqs_client: AWS SQS client
    :param sqs_queue_url: AWS SQS Queue URL
    :param gcs_output_bucket: GCP GCS bucket
    :param gcs_output_prefix: GCP GCS object prefix
    """

    # Receive one message
    response = sqs_client.receive_message(QueueUrl=sqs_queue_url,
                                          MaxNumberOfMessages=1,
                                          VisibilityTimeout=0,
                                          WaitTimeSeconds=0)

    # Process message
    if 'Messages' in response and len(response['Messages']) > 0:
        message = response['Messages'][0]
        receipt_handle = message['ReceiptHandle']
        message_body = json.loads(message['Body'])
        message_id = message['MessageId']
        text = message_body['text']
        timestamp = message_body['timestamp']
        parsed_text = process_text(text)
        logger.info('Message received successfully!')

        # Send result to GCS
        result = {
            'id': message_id,
            'timestamp': timestamp,
            'text': text,
            'parsed_text': parsed_text
        }
        result_string = json.dumps(result,
                                   ensure_ascii=False,
                                   indent=2)
        object_key = (
            f'{gcs_output_prefix.rstrip("/")}'
            f'/result_{message_id}_{datetime.fromisoformat(timestamp).strftime("%Y%m%dT%H%M%S")}.json'
        )
        blob = gcs_output_bucket.blob(object_key)
        blob.upload_from_string(result_string)
        logger.info(
            f'Sent result to `{object_key}` in `{gcs_output_bucket.name}`')

        # Delete message
        sqs_client.delete_message(QueueUrl=sqs_queue_url,
                                  ReceiptHandle=receipt_handle)
        logger.info('Message deleted successfully!')
    else:
        logger.info('No messages in queue')
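
# Wiring sketch (hypothetical values): how the AWS SQS client and GCS bucket
# might be constructed before calling process_sqs_message. The queue URL,
# bucket name and prefix are placeholders.
import boto3
from google.cloud import storage

sqs_client = boto3.client("sqs", region_name="us-east-1")
gcs_output_bucket = storage.Client().bucket("my-output-bucket")

process_sqs_message(
    sqs_client=sqs_client,
    sqs_queue_url="https://sqs.us-east-1.amazonaws.com/123456789012/my-queue",
    gcs_output_bucket=gcs_output_bucket,
    gcs_output_prefix="results",
)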
Example 18
def copy_index(index_folder_path: str, build_index_blob: Blob,
               build_index_generation: str, production_bucket: Bucket,
               build_bucket: Bucket):
    """ Copies the build bucket index to the production bucket index path.

    Args:
        index_folder_path (str): index folder full path.
        build_index_blob (Blob): google cloud storage object that represents build index.zip blob.
        build_index_generation (str): downloaded build index generation.
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied from.

    """
    try:
        build_index_blob.reload()
        build_current_index_generation = build_index_blob.generation

        # disabling caching for prod index blob
        prod_index_storage_path = os.path.join(GCPConfig.STORAGE_BASE_PATH,
                                               f"{GCPConfig.INDEX_NAME}.zip")
        prod_index_blob = production_bucket.blob(prod_index_storage_path)
        prod_index_blob.cache_control = "no-cache,max-age=0"

        if build_current_index_generation == build_index_generation:
            copied_index = build_bucket.copy_blob(
                blob=build_index_blob,
                destination_bucket=production_bucket,
                new_name=prod_index_storage_path)
            if copied_index.exists():
                logging.success(
                    f"Finished uploading {GCPConfig.INDEX_NAME}.zip to storage."
                )
            else:
                logging.error(
                    "Failed copying index from, build index blob does not exists."
                )
                sys.exit(1)
        else:
            logging.error(
                f"Failed in uploading {GCPConfig.INDEX_NAME}, mismatch in index file generation"
            )
            logging.error(
                f"Downloaded build index generation: {build_index_generation}")
            logging.error(
                f"Current build index generation: {build_current_index_generation}"
            )
            sys.exit(1)
    except Exception as e:
        logging.exception(
            f"Failed copying {GCPConfig.INDEX_NAME}. Additional Info: {str(e)}"
        )
        sys.exit(1)
    finally:
        shutil.rmtree(index_folder_path)
Example 19
    def __delete_blob(bucket: Bucket, blob_name: str):
        """Deletes a blob from the bucket."""
        # bucket_name = "your-bucket-name"
        # blob_name = "your-object-name"

        blob = bucket.blob(blob_name)
        try:
            blob.delete()
            print("Blob {} deleted from bucket: {}.".format(blob_name, bucket))
        except NotFound:
            print("File:", blob_name, "doesn't exists in bucket:", bucket)
Example 20
def blob_path(bucket_object: storage.Bucket) -> str:
    """ Path of a file placed in the GCS Bucket for tests """
    filename = "sample_file.json"
    local_path = path.join(path.dirname(__file__), f"fixtures/{filename}")
    # remote_path = f"gs://{TEST_BUCKET}/{filename}"

    blob: storage.Blob = bucket_object.blob(filename)
    blob.upload_from_filename(local_path)
    assert blob.exists()
    # print("Created blob?", blob.exists())
    yield filename
    # print("Removing blob...")
    blob.delete()
Example 21
def save_arrays(arrays: Dict[str, np.ndarray], filename: str,
                bucket: storage.Bucket):
    """
    Saves .npz arrays (compressed in groups of 10) to cloud
    :param arrays: dict mapping IDs --> arrays to be saved
    :param filename: new filename
    :param bucket: bucket to be saved within
    :return:
    """
    out_stream = io.BytesIO()
    np.savez_compressed(out_stream, **arrays)
    out_stream.seek(0)
    out_blob = bucket.blob(filename)
    out_blob.upload_from_file(out_stream)
Example 22
def download_from_bucket(
    bucket: storage.Bucket,
    bucket_file_path: str,
    local_file_path: Optional[Union[Path, str]] = None,
    force: bool = False,
) -> None:
    """
    Download the file from the bucket to the local machine.

    If local_file_path is specified, the file is downloaded to that path;
    otherwise the structure of the bucket file path is preserved under the cache directory.

    Args:
        bucket: bucket from which to download the file
        bucket_file_path: file path in the bucket
        local_file_path: path to which we save locally
        force: whether to force the download or not

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    gs_blob = bucket.blob(bucket_file_path)

    if not gs_blob.exists():
        raise FileNotFoundError(
            f"The file {bucket_file_path} does not exist in Google Bucket '{bucket.name}'"
        )

    if local_file_path is None:
        local_file_path = CACHE_DIRECTORY / bucket_file_path
    else:
        local_file_path = _convert_file_path(local_file_path).resolve()

    should_download = force or not local_file_path.exists()

    if should_download:
        local_file_path.parent.mkdir(exist_ok=True, parents=True)
        url = gs_blob.public_url
        filename = url.split("/")[-1]
        # gs_blob.download_to_filename(local_file_path) no progress bar

        with TqdmUpTo(unit="B",
                      unit_scale=True,
                      unit_divisor=1024,
                      miniters=1,
                      desc=filename) as t:
            urlretrieve(url, filename=local_file_path, reporthook=t.update_to)

        log.info(f"File {bucket_file_path} downloaded from Google Bucket "
                 f"'{bucket.name}' at {local_file_path}")
Example 23
def upload(bucket: storage.Bucket, thumb: Thumbnail) -> bool:
    blob = bucket.blob(str(thumb.path))
    blob.upload_from_string(thumb.content, thumb.mimetype)
    logger.info('Uploaded {}.', thumb.path)
    # TODO: Copy ACL from original image
    try:
        blob.make_public()
    except ServiceUnavailable as e:
        logger.error('Failed to make {} public.\nError: {}', blob.path, e)
    meta = {'Generator': f'Thunagen v{__version__}'}
    blob.metadata = meta
    try:
        blob.patch()
        logger.debug('Made {} public and set metadata {}', thumb.path, meta)
    except NotFound:
        logger.error('{} was deleted by someone.', blob.path)
    return True
Example 24
def _UploadBuildResults(gcs_bucket: storage.Bucket,
                        gcs_build_results_dir: str):
    """Uploads all build results to Google Cloud Storage."""
    logging.info("Will upload build results to gs://%s/%s.",
                 os.environ[_GCS_BUCKET], gcs_build_results_dir)

    for build_result in os.listdir(flags.FLAGS.build_results_dir):
        path = os.path.join(flags.FLAGS.build_results_dir, build_result)
        if not os.path.isfile(path):
            logging.info("Skipping %s as it's not a file.", path)
            continue
        logging.info("Uploading: %s", path)
        gcs_blob = gcs_bucket.blob("{}/{}".format(gcs_build_results_dir,
                                                  build_result))
        gcs_blob.upload_from_filename(path)

    logging.info("GCS upload done.")
Example 25
    def read_chunk(report: dict,
                   chunk: int = 4096,
                   credentials: Credentials = None,
                   start: int = 0) -> str:
        client = storage.Client(
            credentials=(credentials.credentials if credentials else None))

        path_segments = report['current_path'].split('/')
        report_bucket = path_segments[-2]
        report_blob_name = path_segments[-1].split('?')[0]

        source_bucket = Bucket(client, report_bucket)
        blob = source_bucket.blob(report_blob_name)

        data = blob.download_as_string(start=start,
                                       end=chunk,
                                       raw_download=True).decode('utf-8')
        return data
Example 26
def download_job_manifest(bucket: Bucket, job_id: str) -> JobManifest:
    """
    Download the JobManifest associated with job_id in given bucket.

    Parameters
    ----------
    bucket : google.cloud.storage.Bucket
        The GCS bucket where job data is stored.
    job_id : str
        The ID of the job.

    Returns
    -------
    JobManifest
    """
    path = f"thor_jobs/v1/job-{job_id}/manifest.json"
    as_str = bucket.blob(path).download_as_string()
    return JobManifest.from_str(as_str)
Example 27
def to_public_png(npy_blob: storage.Blob, public_bucket: storage.Bucket):
    """
    Converts a .npy blob into a png file and uploads it to the public
    bucket.

    :param npy_blob:
    :param public_bucket:
    :return:
    """
    npy_filepath = f'/tmp/{npy_blob.name.split("/")[-1]}'
    npy_blob.download_to_filename(npy_filepath)
    arr = np.load(npy_filepath)

    png_filepath = npy_filepath.replace('.npy', '.png')
    plt.imsave(png_filepath, arr)

    png_blob_name = npy_blob.name.replace('.npy', '.png')
    png_blob = public_bucket.blob(png_blob_name)
    png_blob.upload_from_filename(png_filepath)
    os.remove(npy_filepath)
    os.remove(png_filepath)
Example 28
def handle_backlog(
    gcs_client: storage.Client,
    bq_client: bigquery.Client,
    bkt: storage.Bucket,
    lock_blob: storage.Blob,
    backfill_blob: storage.Blob,
):
    """submit the next item in the _backlog if it is non-empty or clean up the
    _BACKFILL and _bqlock files.
    Args:
        gcs_client: storage.Client
        bq_client: bigquery.Client
        bkt: storage.Bucket
        lock_blob: storage.Blob _bqlock blob
        backfill_blob: storage.Blob _BACKFILL blob
    Returns:
        bool: should this backlog subscriber exit
    """
    table_prefix = utils.get_table_prefix(gcs_client, backfill_blob)
    check_backlog_time = time.monotonic()
    next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt,
                                                    table_prefix)
    if next_backlog_file:
        next_success_file: storage.Blob = bkt.blob(
            next_backlog_file.name.replace("/_backlog/", "/"))
        if not next_success_file.exists(client=gcs_client):
            raise exceptions.BacklogException(
                "backlog contains "
                f"gs://{next_backlog_file.bucket}/{next_backlog_file.name} "
                "but the corresponding success file does not exist at: "
                f"gs://{next_success_file.bucket}/{next_success_file.name}")
        print("applying next batch for:"
              f"gs://{next_success_file.bucket}/{next_success_file.name}")
        next_job_id = utils.create_job_id(next_success_file.name)
        utils.apply(gcs_client, bq_client, next_success_file, lock_blob,
                    next_job_id)
        return False  # BQ job running
    print("no more files found in the backlog deleteing backfill blob")
    backfill_blob.delete(if_generation_match=backfill_blob.generation,
                         client=gcs_client)
    if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS <
            time.monotonic()):
        print("checking if the backlog is still empty for "
              f"gs://${bkt.name}/{table_prefix}/_backlog/"
              f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}"
              " seconds between listing items on the backlog and "
              f"deleting the {constants.BACKFILL_FILENAME}. "
              "This should not happen often but is meant to alleviate a "
              "race condition in the event that something caused the "
              "delete operation was delayed or had to be retried for a "
              "long time.")
        next_backlog_file = utils.get_next_backlog_item(
            gcs_client, bkt, table_prefix)
        if next_backlog_file:
            # The backfill file was deleted but the backlog is
            # not empty. Re-trigger the backfill subscriber loop by
            # dropping a new backfill file.
            start_backfill_subscriber_if_not_running(gcs_client, bkt,
                                                     table_prefix)
            return True  # we are re-triggering a new backlog subscriber
    table = None
    # Get table from lock blob
    lock_contents_str = utils.read_gcs_file_if_exists(
        gcs_client, f"gs://{bkt.name}/{lock_blob.name}")
    if lock_contents_str:
        lock_contents: Dict = json.loads(lock_contents_str)
        if lock_contents:
            print(
                json.dumps(
                    dict(message=f"View lock contents in jsonPayload for"
                         f" gs://{bkt.name}/{lock_blob.name}",
                         lock_contents=lock_contents)))
            table = bigquery.TableReference.from_api_repr(
                lock_contents.get('table'))
    utils.handle_bq_lock(gcs_client, lock_blob, None, table)
    print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. "
          "backlog subscriber exiting.")
    return True  # the backlog is empty
Example 29
def test_write_gcs(gcs_bucket: Bucket, gcs_dest: Dict[str, Any]):
    """Test writing a GCS blob in destination config."""
    blob = gcs_bucket.blob(gcs_dest["prefix"] + "test.txt")
    blob.upload_from_string("This is a test.")
    blob.delete()
Example 30
def upload_file_to_blob(media_bucket: Bucket, bucket_file_key, local_filename):
    media_bucket.blob(bucket_file_key).upload_from_filename(filename=local_filename, content_type='image/png')
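
# Usage sketch (placeholder bucket and file names) for upload_file_to_blob above.
from google.cloud import storage

media_bucket = storage.Client().bucket("my-media-bucket")
upload_file_to_blob(media_bucket, "thumbnails/example.png", "/tmp/example.png")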