def start_manifest_processing(
    self, customer_name, credentials, data_source, provider_type, schema_name, provider_uuid, report_month
):
    """
    Start processing an account's manifest for the specified report_month.

    Args:
        (String) customer_name - customer name
        (String) credentials - credentials object
        (String) data_source - report storage location
        (String) schema_name - db tenant
        (String) provider_uuid - provider unique identifier
        (Date) report_month - month to get latest manifest

    Returns:
        ({}) Dictionary containing the following keys:
            manifest_id - (String): Manifest ID for ReportManifestDBAccessor
            assembly_id - (String): UUID identifying the report file
            compression - (String): Report compression format
            files - ([{"key": full_file_path, "local_file": "local file name"}]): List of report files.
        (Boolean) - Whether we are processing this manifest

    """
    # Switch initial ingest to the priority queue for QE tests, based on the QE_SCHEMA flag.
    if self.queue_name is not None and self.provider_uuid is not None:
        SUMMARY_QUEUE = self.queue_name
        REPORT_QUEUE = self.queue_name
    else:
        SUMMARY_QUEUE = SUMMARIZE_REPORTS_QUEUE
        REPORT_QUEUE = GET_REPORT_FILES_QUEUE

    reports_tasks_queued = False
    downloader = ReportDownloader(
        customer_name=customer_name,
        credentials=credentials,
        data_source=data_source,
        provider_type=provider_type,
        provider_uuid=provider_uuid,
        report_name=None,
    )
    manifest = downloader.download_manifest(report_month)
    tracing_id = manifest.get("assembly_id", manifest.get("request_id", "no-request-id"))
    filenames = [file.get("local_file") for file in manifest.get("files", [])]
    LOG.info(log_json(tracing_id, f"Report with manifest {tracing_id} contains the files: {filenames}"))

    if manifest:
        LOG.debug("Saving all manifest file names.")
        record_all_manifest_files(
            manifest["manifest_id"],
            [report.get("local_file") for report in manifest.get("files", [])],
            tracing_id,
        )
        LOG.info(log_json(tracing_id, f"Found Manifests: {str(manifest)}"))

        report_files = manifest.get("files", [])
        report_tasks = []
        last_report_index = len(report_files) - 1
        for i, report_file_dict in enumerate(report_files):
            local_file = report_file_dict.get("local_file")
            report_file = report_file_dict.get("key")

            # Check if the report file is complete or in progress.
            if record_report_status(manifest["manifest_id"], local_file, "no_request"):
                LOG.info(log_json(tracing_id, f"{local_file} was already processed"))
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(log_json(tracing_id, f"{local_file} process is in progress"))
                continue

            report_context = manifest.copy()
            report_context["current_file"] = report_file
            report_context["local_file"] = local_file
            report_context["key"] = report_file
            # Add the tracing id to the report context.
            report_context["request_id"] = tracing_id

            if provider_type in [Provider.PROVIDER_OCP, Provider.PROVIDER_GCP] or i == last_report_index:
                # The create_table flag is used by the ParquetReportProcessor to
                # create a Hive/Trino table. To reduce the number of times we check
                # Trino/Hive tables, we only set it on the final file of the set.
                report_context["create_table"] = True

            # This defaults to the celery queue unless overridden by .set().
            report_tasks.append(
                get_report_files.s(
                    customer_name,
                    credentials,
                    data_source,
                    provider_type,
                    schema_name,
                    provider_uuid,
                    report_month,
                    report_context,
                ).set(queue=REPORT_QUEUE)
            )
            LOG.info(log_json(tracing_id, f"Download queued - schema_name: {schema_name}."))

        if report_tasks:
            reports_tasks_queued = True
            async_id = chord(report_tasks, summarize_reports.s().set(queue=SUMMARY_QUEUE))()
            LOG.debug(log_json(tracing_id, f"Manifest Processing Async ID: {async_id}"))

    return manifest, reports_tasks_queued
def start_manifest_processing(
    self, customer_name, credentials, data_source, provider_type, schema_name, provider_uuid, report_month
):
    """
    Start processing an account's manifest for the specified report_month.

    Args:
        (String) customer_name - customer name
        (String) credentials - credentials object
        (String) data_source - report storage location
        (String) schema_name - db tenant
        (String) provider_uuid - provider unique identifier
        (Date) report_month - month to get latest manifest

    Returns:
        ({}) Dictionary containing the following keys:
            manifest_id - (String): Manifest ID for ReportManifestDBAccessor
            assembly_id - (String): UUID identifying the report file
            compression - (String): Report compression format
            files - ([{"key": full_file_path, "local_file": "local file name"}]): List of report files.

    """
    downloader = ReportDownloader(
        customer_name=customer_name,
        credentials=credentials,
        data_source=data_source,
        provider_type=provider_type,
        provider_uuid=provider_uuid,
        report_name=None,
    )
    manifest = downloader.download_manifest(report_month)

    if manifest:
        LOG.info("Saving all manifest file names.")
        record_all_manifest_files(
            manifest["manifest_id"],
            [report.get("local_file") for report in manifest.get("files", [])],
        )
        LOG.info(f"Found Manifests: {str(manifest)}")

        report_files = manifest.get("files", [])
        report_tasks = []
        for report_file_dict in report_files:
            local_file = report_file_dict.get("local_file")
            report_file = report_file_dict.get("key")

            # Check if the report file is complete or in progress.
            if record_report_status(manifest["manifest_id"], local_file, "no_request"):
                LOG.info(f"{local_file} was already processed")
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(f"{local_file} process is in progress")
                continue

            report_context = manifest.copy()
            report_context["current_file"] = report_file
            report_context["local_file"] = local_file
            report_context["key"] = report_file

            report_tasks.append(
                get_report_files.s(
                    customer_name,
                    credentials,
                    data_source,
                    provider_type,
                    schema_name,
                    provider_uuid,
                    report_month,
                    report_context,
                )
            )
            LOG.info("Download queued - schema_name: %s.", schema_name)

        if report_tasks:
            async_id = chord(report_tasks, summarize_reports.s())()
            LOG.info(f"Manifest Processing Async ID: {async_id}")

    return manifest
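
# ---------------------------------------------------------------------------
# Illustration only (not part of the original module): the method above calls
# self.worker_cache.task_is_running(cache_key) to skip files another worker is
# already downloading. A minimal single-process stand-in for that interface
# might look like the class below; the real cache is assumed to be shared
# across workers (e.g. Redis-backed), and these method names are assumptions
# for illustration.
# ---------------------------------------------------------------------------
class InMemoryWorkerCache:
    """Toy single-process substitute for the shared worker cache."""

    def __init__(self):
        self._running = set()

    def task_is_running(self, cache_key):
        # cache_key is f"{provider_uuid}:{report_file}" in the caller above.
        return cache_key in self._running

    def add_task_to_cache(self, cache_key):
        self._running.add(cache_key)

    def remove_task_from_cache(self, cache_key):
        self._running.discard(cache_key)
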
def start_manifest_processing(
    self, customer_name, credentials, data_source, provider_type, schema_name, provider_uuid, report_month
):
    """
    Start processing an account's manifest for the specified report_month.

    Args:
        (String) customer_name - customer name
        (String) credentials - credentials object
        (String) data_source - report storage location
        (String) schema_name - db tenant
        (String) provider_uuid - provider unique identifier
        (Date) report_month - month to get latest manifest

    Returns:
        ({}) Dictionary containing the following keys:
            manifest_id - (String): Manifest ID for ReportManifestDBAccessor
            assembly_id - (String): UUID identifying the report file
            compression - (String): Report compression format
            files - ([{"key": full_file_path, "local_file": "local file name"}]): List of report files.
        (Boolean) - Whether we are processing this manifest

    """
    reports_tasks_queued = False
    downloader = ReportDownloader(
        customer_name=customer_name,
        credentials=credentials,
        data_source=data_source,
        provider_type=provider_type,
        provider_uuid=provider_uuid,
        report_name=None,
    )
    manifest = downloader.download_manifest(report_month)

    if manifest:
        LOG.info("Saving all manifest file names.")
        record_all_manifest_files(
            manifest["manifest_id"],
            [report.get("local_file") for report in manifest.get("files", [])],
        )
        LOG.info(f"Found Manifests: {str(manifest)}")

        report_files = manifest.get("files", [])
        report_tasks = []
        last_report_index = len(report_files) - 1
        for i, report_file_dict in enumerate(report_files):
            local_file = report_file_dict.get("local_file")
            report_file = report_file_dict.get("key")

            # Check if the report file is complete or in progress.
            if record_report_status(manifest["manifest_id"], local_file, "no_request"):
                LOG.info(f"{local_file} was already processed")
                continue

            cache_key = f"{provider_uuid}:{report_file}"
            if self.worker_cache.task_is_running(cache_key):
                LOG.info(f"{local_file} process is in progress")
                continue

            report_context = manifest.copy()
            report_context["current_file"] = report_file
            report_context["local_file"] = local_file
            report_context["key"] = report_file

            if provider_type == Provider.PROVIDER_OCP or i == last_report_index:
                # To reduce the number of times we check Trino/Hive tables,
                # we only set create_table on the final file of the set.
                report_context["create_table"] = True

            # This defaults to the celery queue unless overridden by .set().
            report_tasks.append(
                get_report_files.s(
                    customer_name,
                    credentials,
                    data_source,
                    provider_type,
                    schema_name,
                    provider_uuid,
                    report_month,
                    report_context,
                ).set(queue=GET_REPORT_FILES_QUEUE)
            )
            LOG.info("Download queued - schema_name: %s.", schema_name)

        if report_tasks:
            reports_tasks_queued = True
            async_id = chord(
                report_tasks, summarize_reports.s().set(queue=REFRESH_MATERIALIZED_VIEWS_QUEUE)
            )()
            LOG.info(f"Manifest Processing Async ID: {async_id}")

    return manifest, reports_tasks_queued
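
# ---------------------------------------------------------------------------
# Illustration only: a hypothetical call site for the method above, assuming
# an Orchestrator-like owner object. Every concrete value below (account name,
# role ARN, bucket, schema, UUID, month) is an assumption made up for
# illustration, not data from the original source.
# ---------------------------------------------------------------------------
# from datetime import date
#
# manifest, tasks_queued = orchestrator.start_manifest_processing(
#     customer_name="acct10001",
#     credentials={"role_arn": "arn:aws:iam::111111111111:role/CostManagement"},
#     data_source={"bucket": "cost-usage-bucket"},
#     provider_type=Provider.PROVIDER_AWS,
#     schema_name="acct10001",
#     provider_uuid="3c6e687e-1a09-4a05-970c-2ccf44b0952e",
#     report_month=date(2021, 7, 1),
# )
# if tasks_queued:
#     LOG.info("Report downloads and summary chord queued.")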