Code Example #1
File: main.py Project: saliyamwd/bigquery-utils
def handle_duplicate_notification(bkt: storage.Bucket,
                                  success_blob: storage.Blob, gsurl: str):
    """
    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success file
    with the same creation time.
    """
    success_blob.reload()
    success_created_unix_timestamp = success_blob.time_created.timestamp()

    claim_blob: storage.Blob = bkt.blob(
        success_blob.name.replace(
            SUCCESS_FILENAME, f"_claimed_{success_created_unix_timestamp}"))
    try:
        claim_blob.upload_from_string("", if_generation_match=0)
    except google.api_core.exceptions.PreconditionFailed as err:
        raise RuntimeError(
            f"The prefix {gsurl} appears to already have been claimed for "
            f"{gsurl}{SUCCESS_FILENAME} with created timestamp"
            f"{success_created_unix_timestamp}."
            "This means that another invocation of this cloud function has"
            "claimed the ingestion of this batch."
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err
Code Example #2
def handle_duplicate_notification(
    gcs_client: storage.Client,
    blob_to_claim: storage.Blob,
):
    """
    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success file
    with the same creation time.
    """
    blob_to_claim.reload(client=gcs_client)
    created_unix_timestamp = blob_to_claim.time_created.timestamp()

    basename = os.path.basename(blob_to_claim.name)
    claim_blob: storage.Blob = blob_to_claim.bucket.blob(
        blob_to_claim.name.replace(
            basename, f"_claimed_{basename}_created_at_"
            f"{created_unix_timestamp}"))
    try:
        claim_blob.upload_from_string("",
                                      if_generation_match=0,
                                      client=gcs_client)
    except google.api_core.exceptions.PreconditionFailed as err:
        blob_to_claim.reload(client=gcs_client)
        raise exceptions.DuplicateNotificationException(
            f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears "
            "to already have been claimed for created timestamp: "
            f"{created_unix_timestamp}."
            "This means that another invocation of this cloud function has "
            "claimed the work to be one for this file. "
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err
Code Example #3
def copy_index(index_folder_path: str, build_index_blob: Blob, build_index_generation: str, production_bucket: Bucket,
               build_bucket: Bucket, storage_base_path: str, build_bucket_base_path: str):
    """ Copies the build bucket index to the production bucket index path.

    Args:
        index_folder_path (str): index folder full path.
        build_index_blob (Blob): google cloud storage object that represents build index.zip blob.
        build_index_generation (str): downloaded build index generation.
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied from.
        storage_base_path (str): the path to upload the index to.
        build_bucket_base_path (str): the path in the build bucket of the index.
    """
    try:
        build_index_blob.reload()
        build_current_index_generation = build_index_blob.generation

        # disabling caching for prod index blob
        prod_index_storage_path = os.path.join(storage_base_path, f"{GCPConfig.INDEX_NAME}.zip")
        prod_index_blob = production_bucket.blob(prod_index_storage_path)
        prod_index_blob.cache_control = "no-cache,max-age=0"
        prod_index_json_storage_path = os.path.join(storage_base_path, f"{GCPConfig.INDEX_NAME}.json")
        prod_index_json_blob = production_bucket.blob(prod_index_json_storage_path)
        prod_index_json_blob.cache_control = "no-cache,max-age=0"

        if build_current_index_generation == build_index_generation:
            copied_index = build_bucket.copy_blob(
                blob=build_index_blob, destination_bucket=production_bucket, new_name=prod_index_storage_path
            )
            if copied_index.exists():
                logging.success(f"Finished uploading {GCPConfig.INDEX_NAME}.zip to storage.")
            else:
                logging.error("Failed copying index.zip from build index - blob does not exist.")
                sys.exit(1)
            copied_index_json_blob = build_bucket.blob(
                os.path.join(build_bucket_base_path, f"{GCPConfig.INDEX_NAME}.json")
            )
            copied_index_json = build_bucket.copy_blob(
                blob=copied_index_json_blob, destination_bucket=production_bucket, new_name=prod_index_json_storage_path
            )
            if copied_index_json.exists():
                logging.success(f"Finished uploading {GCPConfig.INDEX_NAME}.json to storage.")
            else:
                logging.error("Failed copying index.json from build index - blob does not exist.")
                sys.exit(1)
        else:
            logging.error(f"Failed in uploading {GCPConfig.INDEX_NAME}, mismatch in index file generation")
            logging.error(f"Downloaded build index generation: {build_index_generation}")
            logging.error(f"Current build index generation: {build_current_index_generation}")
            sys.exit(1)
    except Exception as e:
        logging.exception(f"Failed copying {GCPConfig.INDEX_NAME}. Additional Info: {str(e)}")
        sys.exit(1)
    finally:
        shutil.rmtree(index_folder_path)
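
The generation comparison in `copy_index` only works if the caller snapshots the blob's generation at download time; the `reload()` at the top of the function then detects whether `index.zip` was rewritten in the build bucket while the local copy was being processed. A rough sketch of how that snapshot could be taken (bucket name, base path, and the `/tmp` destination are placeholders; the project derives the file name from `GCPConfig.INDEX_NAME`):

import os
from google.cloud import storage

build_bucket = storage.Client().bucket("example-build-bucket")  # placeholder name
build_bucket_base_path = "content/builds/1234"                  # placeholder path
index_storage_path = os.path.join(build_bucket_base_path, "index.zip")

build_index_blob = build_bucket.blob(index_storage_path)
build_index_blob.reload()                                 # fetch current metadata
build_index_generation = build_index_blob.generation      # snapshot at download time
build_index_blob.download_to_filename("/tmp/index.zip")   # local copy to work on
# copy_index() later reloads the blob and refuses to publish if the live
# generation no longer matches this snapshot.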
Code Example #4
def up_to_date(input_blob: storage.Blob, output_blob: storage.Blob):
    """
    Checks if the blob is up-to-date.
    :param input_blob:
    :param output_blob:
    :return: true if the output blob is up-to-date. If the blob doesn't
    exist or is outdated, returns false.
    """
    if not output_blob.exists():
        return False

    input_blob.reload()
    output_blob.reload()
    assert input_blob.updated is not None, 'input blob should exist'
    if input_blob.updated > output_blob.updated:
        return False

    return True
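
A typical use of `up_to_date` is to make a pipeline step idempotent: regenerate a derived object only when its source has changed. A small usage sketch, with the bucket and object names and the `convert_and_upload` step as assumptions:

from google.cloud import storage

client = storage.Client()
bucket = client.bucket("example-bucket")            # placeholder bucket
raw = bucket.blob("raw/report.csv")                 # source object
converted = bucket.blob("derived/report.parquet")   # derived object

if not up_to_date(raw, converted):
    convert_and_upload(raw, converted)  # hypothetical regeneration step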
Code Example #5
    def _assert_file_uploaded(self, blob: storage.Blob, sleep_time: float,
                              max_sleep_time: float):
        if sleep_time > max_sleep_time:
            raise UploadPollingException(
                f'Could not verify completed upload for blob {blob.name} within maximum '
                f'wait time of {str(max_sleep_time)} seconds')
        else:
            sleep(sleep_time)
            blob.reload()

            export_completed = blob.metadata is not None and blob.metadata.get(
                "export_completed")
            if export_completed:
                return
            else:
                new_sleep_time = sleep_time * 2
                self.logger.info(
                    f'Verifying upload of blob {blob.name}. Waiting for {str(new_sleep_time)} seconds...'
                )
                return self._assert_file_uploaded(blob, new_sleep_time,
                                                  max_sleep_time)
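
The method polls with exponential backoff: each recursive call doubles the wait (1 s, 2 s, 4 s, ...) until either the blob's `export_completed` metadata flag appears or the wait time grows past `max_sleep_time`, at which point `UploadPollingException` is raised. A hypothetical call site, assuming `uploader` is an instance of the surrounding class and `blob` the freshly written object:

try:
    # Waits roughly 1, 2, 4, 8, ... seconds between checks; gives up once the
    # wait time exceeds the 60-second cap.
    uploader._assert_file_uploaded(blob, sleep_time=1, max_sleep_time=60)
except UploadPollingException:
    print(f"upload of {blob.name} was not confirmed in time")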