Beispiel #1
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue a download task per file.

        Every file listed in the manifest that is neither already recorded as
        processed nor currently running in the worker cache gets a
        get_report_files task. All queued tasks are grouped in a chord whose
        callback is summarize_reports.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            provider_type - provider type (compared against Provider constants)
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) The downloaded manifest, a dictionary containing the keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
            (Boolean) - True when at least one report task was queued
        """
        # When both a queue name and a provider uuid are set on this instance,
        # the download and summary tasks are routed to that queue; otherwise
        # the module-level default queues are used.
        use_instance_queue = (self.queue_name is not None
                              and self.provider_uuid is not None)
        summary_queue = (self.queue_name
                         if use_instance_queue else SUMMARIZE_REPORTS_QUEUE)
        report_queue = (self.queue_name
                        if use_instance_queue else GET_REPORT_FILES_QUEUE)

        downloader = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        )
        manifest = downloader.download_manifest(report_month)

        # Prefer the assembly id for tracing, then the request id, then a placeholder.
        fallback_id = manifest.get("request_id", "no-request-id")
        tracing_id = manifest.get("assembly_id", fallback_id)

        report_files = manifest.get("files", [])
        filenames = [entry.get("local_file") for entry in report_files]
        LOG.info(
            log_json(
                tracing_id,
                f"Report with manifest {tracing_id} contains the files: {filenames}"
            ))

        if manifest:
            LOG.debug("Saving all manifest file names.")
            record_all_manifest_files(manifest["manifest_id"], filenames,
                                      tracing_id)

        LOG.info(log_json(tracing_id, f"Found Manifests: {str(manifest)}"))

        report_tasks = []
        final_position = len(report_files) - 1
        for position, entry in enumerate(report_files):
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Skip files whose status record says they were already handled.
            already_processed = record_report_status(manifest["manifest_id"],
                                                     local_file, "no_request")
            if already_processed:
                LOG.info(
                    log_json(tracing_id,
                             f"{local_file} was already processed"))
                continue

            # Skip files that another worker is handling right now.
            if self.worker_cache.task_is_running(
                    f"{provider_uuid}:{report_file}"):
                LOG.info(
                    log_json(tracing_id,
                             f"{local_file} process is in progress"))
                continue

            # Per-file context: the manifest plus the file being handled and
            # the tracing id.
            report_context = manifest.copy()
            report_context.update({
                "current_file": report_file,
                "local_file": local_file,
                "key": report_file,
                "request_id": tracing_id,
            })

            # The create_table flag is used by the ParquetReportProcessor to
            # create a Hive/Trino table. To limit how often those tables are
            # checked it is set only on the final file of the set, except for
            # OCP and GCP providers where it is set on every file.
            always_create = provider_type in (Provider.PROVIDER_OCP,
                                              Provider.PROVIDER_GCP)
            if always_create or position == final_position:
                report_context["create_table"] = True

            download_signature = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            )
            report_tasks.append(download_signature.set(queue=report_queue))
            LOG.info(
                log_json(tracing_id,
                         f"Download queued - schema_name: {schema_name}."))

        if not report_tasks:
            return manifest, False

        # Run all downloads, then summarize once they have all finished.
        summary_signature = summarize_reports.s().set(queue=summary_queue)
        async_id = chord(report_tasks, summary_signature)()
        LOG.debug(
            log_json(tracing_id,
                     f"Manifest Processing Async ID: {async_id}"))
        return manifest, True
Beispiel #2
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue a download task per file.

        Every file listed in the manifest that is neither already recorded as
        processed nor currently running in the worker cache gets a
        get_report_files task. All queued tasks are grouped in a chord whose
        callback is summarize_reports.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            provider_type - provider type, passed through to the downloader and tasks
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) The downloaded manifest, a dictionary containing the keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
        """
        downloader = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        )
        manifest = downloader.download_manifest(report_month)
        report_files = manifest.get("files", [])

        if manifest:
            LOG.info("Saving all manifest file names.")
            local_names = [entry.get("local_file") for entry in report_files]
            record_all_manifest_files(manifest["manifest_id"], local_names)

        LOG.info(f"Found Manifests: {str(manifest)}")

        report_tasks = []
        for entry in report_files:
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Skip files whose status record says they were already handled.
            already_processed = record_report_status(manifest["manifest_id"],
                                                     local_file, "no_request")
            if already_processed:
                LOG.info(f"{local_file} was already processed")
                continue

            # Skip files that another worker is handling right now.
            if self.worker_cache.task_is_running(
                    f"{provider_uuid}:{report_file}"):
                LOG.info(f"{local_file} process is in progress")
                continue

            # Per-file context: the manifest plus the file being handled.
            report_context = manifest.copy()
            report_context.update({
                "current_file": report_file,
                "local_file": local_file,
                "key": report_file,
            })

            download_signature = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            )
            report_tasks.append(download_signature)
            LOG.info("Download queued - schema_name: %s.", schema_name)

        if not report_tasks:
            return manifest

        # Run all downloads, then summarize once they have all finished.
        async_id = chord(report_tasks, summarize_reports.s())()
        LOG.info(f"Manifest Processing Async ID: {async_id}")
        return manifest
Beispiel #3
0
    def start_manifest_processing(self, customer_name, credentials,
                                  data_source, provider_type, schema_name,
                                  provider_uuid, report_month):
        """
        Download the manifest for report_month and queue a download task per file.

        Every file listed in the manifest that is neither already recorded as
        processed nor currently running in the worker cache gets a
        get_report_files task on GET_REPORT_FILES_QUEUE. All queued tasks are
        grouped in a chord whose callback is summarize_reports, routed to
        REFRESH_MATERIALIZED_VIEWS_QUEUE.

        Args:
            (String) customer_name - customer name
            (String) credentials - credentials object
            (String) data_source - report storage location
            provider_type - provider type (compared against Provider.PROVIDER_OCP)
            (String) schema_name - db tenant
            (String) provider_uuid - provider unique identifier
            (Date)   report_month - month to get latest manifest

        Returns:
            ({}) The downloaded manifest, a dictionary containing the keys:
                manifest_id - (String): Manifest ID for ReportManifestDBAccessor
                assembly_id - (String): UUID identifying report file
                compression - (String): Report compression format
                files       - ([{"key": full_file_path "local_file": "local file name"}]): List of report files.
            (Boolean) - True when at least one report task was queued
        """
        downloader = ReportDownloader(
            customer_name=customer_name,
            credentials=credentials,
            data_source=data_source,
            provider_type=provider_type,
            provider_uuid=provider_uuid,
            report_name=None,
        )
        manifest = downloader.download_manifest(report_month)
        report_files = manifest.get("files", [])

        if manifest:
            LOG.info("Saving all manifest file names.")
            local_names = [entry.get("local_file") for entry in report_files]
            record_all_manifest_files(manifest["manifest_id"], local_names)

        LOG.info(f"Found Manifests: {str(manifest)}")

        report_tasks = []
        final_position = len(report_files) - 1
        for position, entry in enumerate(report_files):
            local_file = entry.get("local_file")
            report_file = entry.get("key")

            # Skip files whose status record says they were already handled.
            already_processed = record_report_status(manifest["manifest_id"],
                                                     local_file, "no_request")
            if already_processed:
                LOG.info(f"{local_file} was already processed")
                continue

            # Skip files that another worker is handling right now.
            if self.worker_cache.task_is_running(
                    f"{provider_uuid}:{report_file}"):
                LOG.info(f"{local_file} process is in progress")
                continue

            # Per-file context: the manifest plus the file being handled.
            report_context = manifest.copy()
            report_context.update({
                "current_file": report_file,
                "local_file": local_file,
                "key": report_file,
            })

            # To limit how often Trino/Hive tables are checked, create_table
            # is set only on the final file of the set, except for OCP
            # providers where it is set on every file.
            is_ocp = provider_type == Provider.PROVIDER_OCP
            if is_ocp or position == final_position:
                report_context["create_table"] = True

            download_signature = get_report_files.s(
                customer_name,
                credentials,
                data_source,
                provider_type,
                schema_name,
                provider_uuid,
                report_month,
                report_context,
            )
            report_tasks.append(
                download_signature.set(queue=GET_REPORT_FILES_QUEUE))
            LOG.info("Download queued - schema_name: %s.", schema_name)

        if not report_tasks:
            return manifest, False

        # Run all downloads, then summarize once they have all finished.
        summary_signature = summarize_reports.s().set(
            queue=REFRESH_MATERIALIZED_VIEWS_QUEUE)
        async_id = chord(report_tasks, summary_signature)()
        LOG.info(f"Manifest Processing Async ID: {async_id}")
        return manifest, True