Esempio n. 1
0
    def prepare(self):
        """
        Queue download and summary work for every polling account.

        Walks ``self._polling_accounts`` and, for each account, the months
        returned by ``self.get_reports``. A new ``ProviderStatus`` is built for
        every month; when it reports valid and not backing off, a
        ``get_report_files | summarize_reports`` chain is dispatched with the
        account dict as keyword arguments and ``AccountLabel.get_label_details``
        is called. Otherwise the skip is logged.

        Args:
            None

        Returns:
            (celery.result.AsyncResult) Result of the last chain that was
            queued, or None when nothing was queued.

        """
        last_result = None
        for account in self._polling_accounts:
            provider_uuid = account.get("provider_uuid")
            for month in self.get_reports(provider_uuid):
                status = ProviderStatus(provider_uuid)

                # Guard clause: anything invalid or backing off is only logged.
                if not status.is_valid() or status.is_backing_off():
                    LOG.info(
                        "Provider skipped: %s Valid: %s Backing off: %s",
                        account.get("provider_uuid"),
                        status.is_valid(),
                        status.is_backing_off(),
                    )
                    continue

                LOG.info(
                    "Getting %s report files for account (provider uuid): %s",
                    month.strftime("%B %Y"),
                    provider_uuid,
                )

                # The month is stored on the account dict itself because the
                # whole dict is expanded into the task's keyword arguments.
                account["report_month"] = month
                download_step = get_report_files.s(**account)
                pipeline = download_step | summarize_reports.s()
                last_result = pipeline.apply_async()

                LOG.info(
                    "Download queued - schema_name: %s, Task ID: %s",
                    account.get("schema_name"),
                    str(last_result),
                )

                # update labels
                label_details = AccountLabel(
                    auth=account.get("authentication"),
                    schema=account.get("schema_name"),
                    provider_type=account.get("provider_type"),
                ).get_label_details()
                account_number, label = label_details
                if account_number:
                    LOG.info("Account: %s Label: %s updated.", account_number, label)
        return last_result
Esempio n. 2
0
    def prepare(self):
        """
        Queue download and summary work for every polling account.

        For each account in ``self._polling_accounts`` and each month returned
        by ``self.get_reports``, a new ``ProviderStatus`` is checked. Valid
        providers that are not backing off get a
        ``get_report_files | summarize_reports`` chain dispatched and
        ``AccountLabel.get_label_details`` called; all others are logged as
        skipped.

        Args:
            None

        Returns:
            (celery.result.AsyncResult) Result of the last chain that was
            queued, or None when nothing was queued.

        """
        last_result = None
        for account in self._polling_accounts:
            provider_uuid = account.get('provider_uuid')
            for month in self.get_reports(provider_uuid):
                status = ProviderStatus(provider_uuid)

                # Guard clause: anything invalid or backing off is only logged.
                if not status.is_valid() or status.is_backing_off():
                    LOG.info('Provider skipped: %s Valid: %s Backing off: %s',
                             account.get('provider_uuid'),
                             status.is_valid(),
                             status.is_backing_off())
                    continue

                LOG.info(
                    'Getting %s report files for account (provider uuid): %s',
                    month.strftime('%B %Y'), provider_uuid)

                # The month is stored on the account dict itself because the
                # whole dict is expanded into the task's keyword arguments.
                account['report_month'] = month
                pipeline = get_report_files.s(**account) | summarize_reports.s()
                last_result = pipeline.apply_async()

                LOG.info('Download queued - schema_name: %s, Task ID: %s',
                         account.get('schema_name'), str(last_result))

                # update labels
                account_number, label = AccountLabel(
                    auth=account.get('authentication'),
                    schema=account.get('schema_name'),
                    provider_type=account.get('provider_type'),
                ).get_label_details()
                if account_number:
                    LOG.info('Account: %s Label: %s updated.',
                             account_number, label)
        return last_result
Esempio n. 3
0
def summarize_manifest(report_meta):
    """
    Queue a summary task once the manifest reports ready for summary.

    Args:
        report_meta (Dict) - keys read here:
                        schema_name,
                        manifest_id,
                        provider_uuid,
                        provider_type,
                        start (optional),
                        end (optional)

    Returns:
        The value returned by ``summarize_reports.s(...).apply_async`` when the
        manifest is ready for summary, otherwise None.

    """
    manifest_id = report_meta.get("manifest_id")
    provider_uuid = report_meta.get("provider_uuid")
    start_date = report_meta.get("start")
    end_date = report_meta.get("end")

    # Only the keys below are forwarded to the summary task; the incoming
    # dict is left untouched.
    summary_meta = {
        "schema_name": report_meta.get("schema_name"),
        "provider_type": report_meta.get("provider_type"),
        "provider_uuid": provider_uuid,
        "manifest_id": manifest_id,
    }

    async_id = None
    with ReportManifestDBAccessor() as accessor:
        ready = accessor.manifest_ready_for_summary(manifest_id)
        if ready:
            # The date window is passed along only when both ends are present.
            if start_date and end_date:
                LOG.info(
                    f"Summarizing OCP reports from {str(start_date)}-{str(end_date)} for provider: {provider_uuid}"
                )
                summary_meta["start"] = start_date
                summary_meta["end"] = end_date
            signature = summarize_reports.s([summary_meta], OCP_QUEUE)
            async_id = signature.apply_async(queue=OCP_QUEUE)
    return async_id
Esempio n. 4
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue its pending files.

        Every file listed in the manifest gets a ``get_report_files`` task
        unless ``record_report_status`` returns a truthy value for it or the
        worker cache reports a task already running for it. The queued tasks
        are run as a chord that finishes with ``summarize_reports``.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            (String) provider_type - provider type handed to the downloader and tasks
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) Dictionary containing the following keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
        """
        manifest = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        ).download_manifest(report_month)

        if manifest:
            LOG.info("Saving all manifest file names.")
            manifest_id = manifest["manifest_id"]
            record_all_manifest_files(
                manifest_id,
                [entry.get("local_file") for entry in manifest.get("files", [])],
            )

        LOG.info(f"Found Manifests: {str(manifest)}")
        report_tasks = []
        for entry in manifest.get("files", []):
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Check if report file is complete or in progress.
            already_processed = record_report_status(
                manifest["manifest_id"], local_file, "no_request")
            if already_processed:
                LOG.info(f"{local_file} was already processed")
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(f"{local_file} process is in progress")
                continue

            # Each task gets its own copy of the manifest, extended with the
            # identity of the single file it is responsible for.
            report_context = manifest.copy()
            report_context.update(
                current_file=report_file,
                local_file=local_file,
                key=report_file,
            )

            download_task = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            )
            report_tasks.append(download_task)
            LOG.info("Download queued - schema_name: %s.", schema_name)

        if report_tasks:
            async_id = chord(report_tasks, summarize_reports.s())()
            LOG.info(f"Manifest Processing Async ID: {async_id}")
        return manifest
Esempio n. 5
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue its pending files.

        Every file listed in the manifest gets a ``get_report_files`` task
        unless ``record_report_status`` returns a truthy value for it or the
        worker cache reports a task already running for it. The queued tasks
        are run as a chord that finishes with ``summarize_reports``. Log lines
        are tagged with the manifest's ``assembly_id``, falling back to its
        ``request_id`` and then to ``"no-request-id"``.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            (String) provider_type - provider type handed to the downloader and tasks
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) Dictionary containing the following keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
            (Boolean) - Whether we are processing this manifest
        """
        # Switching initial ingest to use priority queue for QE tests based on QE_SCHEMA flag
        if self.queue_name is not None and self.provider_uuid is not None:
            summary_queue = report_queue = self.queue_name
        else:
            summary_queue = SUMMARIZE_REPORTS_QUEUE
            report_queue = GET_REPORT_FILES_QUEUE

        manifest = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        ).download_manifest(report_month)

        fallback_id = manifest.get("request_id", "no-request-id")
        tracing_id = manifest.get("assembly_id", fallback_id)
        report_files = manifest.get("files", [])
        filenames = [entry.get("local_file") for entry in report_files]
        LOG.info(
            log_json(
                tracing_id,
                f"Report with manifest {tracing_id} contains the files: {filenames}"
            ))

        if manifest:
            LOG.debug("Saving all manifest file names.")
            record_all_manifest_files(manifest["manifest_id"], filenames, tracing_id)

        LOG.info(log_json(tracing_id, f"Found Manifests: {str(manifest)}"))
        report_tasks = []
        last_report_index = len(report_files) - 1
        for i, entry in enumerate(report_files):
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Check if report file is complete or in progress.
            already_processed = record_report_status(
                manifest["manifest_id"], local_file, "no_request")
            if already_processed:
                LOG.info(
                    log_json(tracing_id,
                             f"{local_file} was already processed"))
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(
                    log_json(tracing_id,
                             f"{local_file} process is in progress"))
                continue

            # Each task gets its own copy of the manifest, extended with the
            # identity of the single file it handles and the tracing id.
            report_context = manifest.copy()
            report_context.update(
                current_file=report_file,
                local_file=local_file,
                key=report_file,
                request_id=tracing_id,
            )

            if provider_type in (Provider.PROVIDER_OCP, Provider.PROVIDER_GCP) or i == last_report_index:
                # This create_table flag is used by the ParquetReportProcessor
                # to create a Hive/Trino table.
                # To reduce the number of times we check Trino/Hive tables, we just do this
                # on the final file of the set.
                report_context["create_table"] = True

            download_task = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            ).set(queue=report_queue)
            report_tasks.append(download_task)
            LOG.info(
                log_json(tracing_id,
                         f"Download queued - schema_name: {schema_name}."))

        reports_tasks_queued = bool(report_tasks)
        if reports_tasks_queued:
            summary_step = summarize_reports.s().set(queue=summary_queue)
            async_id = chord(report_tasks, summary_step)()
            LOG.debug(
                log_json(tracing_id,
                         f"Manifest Processing Async ID: {async_id}"))
        return manifest, reports_tasks_queued
Esempio n. 6
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue its pending files.

        Every file listed in the manifest gets a ``get_report_files`` task on
        ``GET_REPORT_FILES_QUEUE`` unless ``record_report_status`` returns a
        truthy value for it or the worker cache reports a task already running
        for it. The queued tasks are run as a chord that finishes with
        ``summarize_reports`` on ``REFRESH_MATERIALIZED_VIEWS_QUEUE``.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            (String) provider_type - provider type handed to the downloader and tasks
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) Dictionary containing the following keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
            (Boolean) - Whether we are processing this manifest
        """
        manifest = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        ).download_manifest(report_month)

        if manifest:
            LOG.info("Saving all manifest file names.")
            manifest_id = manifest["manifest_id"]
            record_all_manifest_files(
                manifest_id,
                [entry.get("local_file") for entry in manifest.get("files", [])],
            )

        LOG.info(f"Found Manifests: {str(manifest)}")
        report_files = manifest.get("files", [])
        report_tasks = []
        last_report_index = len(report_files) - 1
        for i, entry in enumerate(report_files):
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Check if report file is complete or in progress.
            already_processed = record_report_status(
                manifest["manifest_id"], local_file, "no_request")
            if already_processed:
                LOG.info(f"{local_file} was already processed")
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(f"{local_file} process is in progress")
                continue

            # Each task gets its own copy of the manifest, extended with the
            # identity of the single file it is responsible for.
            report_context = manifest.copy()
            report_context.update(
                current_file=report_file,
                local_file=local_file,
                key=report_file,
            )

            if provider_type == Provider.PROVIDER_OCP or i == last_report_index:
                # To reduce the number of times we check Trino/Hive tables, we just do this
                # on the final file of the set.
                report_context["create_table"] = True

            download_task = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            ).set(queue=GET_REPORT_FILES_QUEUE)
            report_tasks.append(download_task)
            LOG.info("Download queued - schema_name: %s.", schema_name)

        reports_tasks_queued = bool(report_tasks)
        if reports_tasks_queued:
            summary_step = summarize_reports.s().set(
                queue=REFRESH_MATERIALIZED_VIEWS_QUEUE)
            async_id = chord(report_tasks, summary_step)()
            LOG.info(f"Manifest Processing Async ID: {async_id}")
        return manifest, reports_tasks_queued
Esempio n. 7
0
def summarize_manifest(report_meta, manifest_uuid):
    """
    Queue a summary task once the manifest reports ready for summary.

    When both ``start`` and ``end`` are present and either one renders as
    ``0001-01-01 00:00:00+00:00``, the payload is treated as invalid: the CR
    status is logged (with any data collection message scrubbed of ``{...}``
    sections) and no summary is queued.

    Args:
        report_meta (Dict) - keys read here:
                        schema_name,
                        manifest_id,
                        provider_uuid,
                        provider_type,
                        start (optional),
                        end (optional),
                        cr_status (optional)
        manifest_uuid (string) - The id associated with the payload manifest

    Returns:
        The value returned by ``summarize_reports.s(...).apply_async`` when a
        summary was queued, otherwise None.

    """
    invalid_timestamp = "0001-01-01 00:00:00+00:00"

    schema_name = report_meta.get("schema_name")
    manifest_id = report_meta.get("manifest_id")
    provider_uuid = report_meta.get("provider_uuid")
    provider_type = report_meta.get("provider_type")
    start_date = report_meta.get("start")
    end_date = report_meta.get("end")

    context = {"account": schema_name, "provider_uuid": str(provider_uuid)}

    async_id = None
    with ReportManifestDBAccessor() as accessor:
        ready = accessor.manifest_ready_for_summary(manifest_id)
        if ready:
            new_report_meta = {
                "schema_name": schema_name,
                "provider_type": provider_type,
                "provider_uuid": provider_uuid,
                "manifest_id": manifest_id,
            }
            if start_date and end_date:
                if invalid_timestamp in (str(start_date), str(end_date)):
                    cr_status = report_meta.get("cr_status", {})
                    context["cluster_id"] = cr_status.get("clusterID", "no-cluster-id")
                    reports_status = cr_status.get("reports", {})
                    data_collection_message = reports_status.get("data_collection_message", "")
                    if data_collection_message:
                        # remove potentially sensitive info from the error message
                        scrubbed = re.sub("{[^}]+}", "{***}", data_collection_message)
                        msg = f"data collection error [operator]: {scrubbed}"
                        # The scrubbed text replaces the raw message inside
                        # cr_status before the full status is dumped below.
                        cr_status["reports"]["data_collection_message"] = msg
                        # The full CR status is logged below, but we should limit our alert to just the query.
                        # We can check the full manifest to get the full error.
                        LOG.error(msg)
                        LOG.info(log_json(manifest_uuid, msg, context))
                    status_dump = json.dumps(cr_status)
                    LOG.info(
                        log_json(manifest_uuid, f"CR Status for invalid manifest: {status_dump}", context)
                    )
                    # an invalid payload will fail to summarize, so return before we try
                    return None
                LOG.info(
                    log_json(
                        manifest_uuid,
                        f"Summarizing OCP reports from {str(start_date)}-{str(end_date)} for provider: {provider_uuid}",
                        context,
                    )
                )
                new_report_meta["start"] = start_date
                new_report_meta["end"] = end_date
                new_report_meta["manifest_uuid"] = manifest_uuid
            signature = summarize_reports.s([new_report_meta], OCP_QUEUE)
            async_id = signature.apply_async(queue=OCP_QUEUE)
    return async_id