def process(self):
    """Convert to parquet."""
    msg = (
        f"Converting CSV files to Parquet.\n\tStart date: {str(self.start_date)}\n\tFile: {str(self.report_file)}"
    )
    LOG.info(msg)
    parquet_base_filename, daily_data_frames = self.convert_to_parquet()

    # Clean up the original downloaded file
    if (
        self.provider_type != Provider.PROVIDER_OCP
        and not enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name)
    ) or enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name):
        for f in self.file_list:
            if os.path.exists(f):
                os.remove(f)
        for f in self.files_to_remove:
            if os.path.exists(f):
                os.remove(f)
        if os.path.exists(self.report_file):
            os.remove(self.report_file)

    return parquet_base_filename, daily_data_frames
def _set_updater(self):
    """
    Create the report summary updater object.

    Object is specific to the report provider.

    Args:
        None

    Returns:
        (Object) : Provider-specific report summary updater

    """
    if self._provider.type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
        report_summary_updater = (
            AWSReportParquetSummaryUpdater
            if enable_trino_processing(
                self._provider_uuid, self._provider.type, self._provider.customer.schema_name
            )
            else AWSReportSummaryUpdater
        )
    elif self._provider.type in (Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL):
        report_summary_updater = (
            AzureReportParquetSummaryUpdater
            if enable_trino_processing(
                self._provider_uuid, self._provider.type, self._provider.customer.schema_name
            )
            else AzureReportSummaryUpdater
        )
    elif self._provider.type in (Provider.PROVIDER_OCP,):
        report_summary_updater = (
            OCPReportParquetSummaryUpdater
            if enable_trino_processing(
                self._provider_uuid, self._provider.type, self._provider.customer.schema_name
            )
            else OCPReportSummaryUpdater
        )
    elif self._provider.type in (Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
        report_summary_updater = (
            GCPReportParquetSummaryUpdater
            if enable_trino_processing(
                self._provider_uuid, self._provider.type, self._provider.customer.schema_name
            )
            else GCPReportSummaryUpdater
        )
    else:
        return (None, None)

    ocp_cloud_updater = (
        OCPCloudParquetReportSummaryUpdater
        if enable_trino_processing(
            self._provider_uuid, self._provider.type, self._provider.customer.schema_name
        )
        else OCPCloudReportSummaryUpdater
    )

    LOG.info(f"Set report_summary_updater = {report_summary_updater.__name__}")

    return (
        report_summary_updater(self._schema, self._provider, self._manifest),
        ocp_cloud_updater(self._schema, self._provider, self._manifest),
    )
def remove_files_not_in_set_from_s3_bucket(request_id, s3_path, manifest_id, context={}):
    """
    Removes all files in a given prefix if they are not within the given set.
    """
    if not (
        settings.ENABLE_S3_ARCHIVING
        or enable_trino_processing(
            context.get("provider_uuid"), context.get("provider_type"), context.get("account")
        )
    ):
        return []

    removed = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest != manifest_id_str:
                    s3_resource.Object(settings.S3_BUCKET_NAME, key).delete()
                    removed.append(key)
            if removed:
                msg = f"Removed files from s3 bucket {settings.S3_BUCKET_NAME}: {','.join(removed)}."
                LOG.info(log_json(request_id, msg, context))
        except (EndpointConnectionError, ClientError) as err:
            msg = f"Unable to remove data in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return removed
def get_file_keys_from_s3_with_manifest_id(self, request_id, s3_path, manifest_id, context={}):
    """
    Get all files in a given prefix that match the given manifest_id.
    """
    if not enable_trino_processing(context.get("provider_uuid")):
        return []

    keys = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest == manifest_id_str:
                    keys.append(key)
        except (EndpointConnectionError, ClientError) as err:
            msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return keys
def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid):
        daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_OCP, provider_uuid, start_date, Config.CSV_DATA_TYPE
            )
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
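# The divide_csv_daily helper is not shown in this listing. As a hedged illustration
# only (not the actual helper), the sketch below shows one way a report CSV could be
# split into per-day files keyed on a timestamp column; the "interval_start" column
# name and the returned dict shape are assumptions.
import os

import pandas as pd


def split_csv_by_day(filepath, date_column="interval_start"):
    """Split a CSV into one file per calendar day and return their names and paths."""
    directory = os.path.dirname(filepath)
    base_name = os.path.splitext(os.path.basename(filepath))[0]
    data_frame = pd.read_csv(filepath)
    # Reduce the timestamp column to a date so rows group by calendar day.
    days = pd.to_datetime(data_frame[date_column]).dt.date
    daily_files = []
    for day, day_frame in data_frame.groupby(days):
        day_filename = f"{base_name}_{day}.csv"
        day_filepath = os.path.join(directory, day_filename)
        day_frame.to_csv(day_filepath, index=False, header=True)
        daily_files.append({"filename": day_filename, "filepath": day_filepath})
    return daily_files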
def copy_data_to_s3_bucket(request_id, path, filename, data, manifest_id=None, context={}):
    """
    Copies data to s3 bucket file
    """
    if not (
        settings.ENABLE_S3_ARCHIVING
        or enable_trino_processing(
            context.get("provider_uuid"), context.get("provider_type"), context.get("account")
        )
    ):
        return None

    upload = None
    upload_key = f"{path}/{filename}"
    extra_args = {}
    if manifest_id:
        extra_args = {"Metadata": {"ManifestId": str(manifest_id)}}
    try:
        s3_resource = get_s3_resource()
        s3_obj = {"bucket_name": settings.S3_BUCKET_NAME, "key": upload_key}
        upload = s3_resource.Object(**s3_obj)
        upload.upload_fileobj(data, ExtraArgs=extra_args)
    except (EndpointConnectionError, ClientError) as err:
        msg = f"Unable to copy data to {upload_key} in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
        LOG.info(log_json(request_id, msg, context))
    return upload
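# Hypothetical usage sketch: upload_fileobj expects a binary file-like object, so an
# in-memory payload can be wrapped in io.BytesIO before calling copy_data_to_s3_bucket.
# The request id, path, filename, and context values below are illustrative only.
import io

payload = io.BytesIO(b"col_a,col_b\n1,2\n")
copy_data_to_s3_bucket(
    request_id="local-test",
    path="data/csv/10001/AWS/source=abc123",
    filename="sample.csv",
    data=payload,
    manifest_id=42,
    context={"provider_uuid": "abc123", "provider_type": "AWS", "account": "10001"},
)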
def _set_processor(self):
    """
    Create the report processor object.

    Processor is specific to the provider's cloud service.

    Args:
        None

    Returns:
        (Object) : Provider-specific report processor

    """
    if enable_trino_processing(self.provider_uuid):
        return ParquetReportProcessor(
            schema_name=self.schema_name,
            report_path=self.report_path,
            compression=self.compression,
            provider_uuid=self.provider_uuid,
            provider_type=self.provider_type,
            manifest_id=self.manifest_id,
            context=self.context,
        )
    if self.provider_type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
        return AWSReportProcessor(
            schema_name=self.schema_name,
            report_path=self.report_path,
            compression=self.compression,
            provider_uuid=self.provider_uuid,
            manifest_id=self.manifest_id,
        )
    if self.provider_type in (Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL):
        return AzureReportProcessor(
            schema_name=self.schema_name,
            report_path=self.report_path,
            compression=self.compression,
            provider_uuid=self.provider_uuid,
            manifest_id=self.manifest_id,
        )
    if self.provider_type in (Provider.PROVIDER_OCP,):
        return OCPReportProcessor(
            schema_name=self.schema_name,
            report_path=self.report_path,
            compression=self.compression,
            provider_uuid=self.provider_uuid,
        )
    if self.provider_type in (Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
        return GCPReportProcessor(
            schema_name=self.schema_name,
            report_path=self.report_path,
            compression=self.compression,
            provider_uuid=self.provider_uuid,
            manifest_id=self.manifest_id,
        )
    return None
def provider_post_delete_callback(*args, **kwargs):
    """
    Asynchronously delete this Provider's archived data.

    Note: Signal receivers must accept keyword arguments (**kwargs).
    """
    provider = kwargs["instance"]

    if provider.authentication_id:
        provider_auth_query = Provider.objects.exclude(uuid=provider.uuid).filter(
            authentication_id=provider.authentication_id
        )
        auth_count = provider_auth_query.count()
        if auth_count == 0:
            LOG.info("Deleting unreferenced ProviderAuthentication")
            auth_query = ProviderAuthentication.objects.filter(pk=provider.authentication_id)
            execute_delete_sql(auth_query)

    if provider.billing_source_id:
        provider_billing_query = Provider.objects.exclude(uuid=provider.uuid).filter(
            billing_source_id=provider.billing_source_id
        )
        billing_count = provider_billing_query.count()
        if billing_count == 0:
            LOG.info("Deleting unreferenced ProviderBillingSource")
            billing_source_query = ProviderBillingSource.objects.filter(pk=provider.billing_source_id)
            execute_delete_sql(billing_source_query)

    if not provider.customer:
        LOG.warning("Provider %s has no Customer; we cannot call delete_archived_data.", provider.uuid)
        return

    customer = provider.customer
    customer.date_updated = DateHelper().now_utc
    customer.save()

    LOG.info("Deleting any related CostModelMap records")
    execute_delete_sql(CostModelMap.objects.filter(provider_uuid=provider.uuid))

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
        provider.uuid, provider.type, provider.customer.schema_name
    ):
        # Local import of task function to avoid potential import cycle.
        from masu.celery.tasks import delete_archived_data

        LOG.info("Deleting any archived data")
        delete_func = partial(delete_archived_data.delay, provider.customer.schema_name, provider.type, provider.uuid)
        transaction.on_commit(delete_func)

    LOG.info("Refreshing materialized views post-provider-delete uuid=%s.", provider.uuid)
    refresh_materialized_views(
        provider.customer.schema_name, provider.type, provider_uuid=provider.uuid, synchronous=True
    )
def delete_archived_data(schema_name, provider_type, provider_uuid):
    """
    Delete archived data from our S3 bucket for a given provider.

    This function chiefly follows the deletion of a provider.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to delete this archived data.

    Args:
        schema_name (str): Koku user account (schema) name.
        provider_type (str): Koku backend provider type identifier.
        provider_uuid (UUID): Koku backend provider UUID.

    """
    if not schema_name or not provider_type or not provider_uuid:
        # Sanity-check all of these inputs in case somehow any receives an
        # empty value such as None or '' because we need to minimize the risk
        # of deleting unrelated files from our S3 bucket.
        messages = []
        if not schema_name:
            message = "missing required argument: schema_name"
            LOG.error(message)
            messages.append(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
            messages.append(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
            messages.append(message)
        raise TypeError(f"delete_archived_data() {', '.join(messages)}")

    if not (settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid)):
        LOG.info("Skipping delete_archived_data. Upload feature is disabled.")
        return
    else:
        message = f"Deleting S3 data for {provider_type} provider {provider_uuid} in account {schema_name}."
        LOG.info(message)

    # We need to normalize capitalization and "-local" dev providers.
    account = schema_name[4:]
    # Data in object storage does not use the local designation
    source_type = provider_type.replace("-local", "")

    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.CSV_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{source_type}/source={provider_uuid}/"
    LOG.info("Attempting to delete our archived data in S3 under %s", prefix)
    deleted_archived_with_prefix(settings.S3_BUCKET_NAME, prefix)

    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{source_type}/source={provider_uuid}/"
    LOG.info("Attempting to delete our archived data in S3 under %s", prefix)
    deleted_archived_with_prefix(settings.S3_BUCKET_NAME, prefix)
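# A minimal sketch of how the deletion prefixes above are composed. The config values
# and inputs below are placeholders, assuming the schema name carries an "acct" prefix.
WAREHOUSE_PATH = "data"  # placeholder for Config.WAREHOUSE_PATH
CSV_DATA_TYPE = "csv"  # placeholder for Config.CSV_DATA_TYPE

schema_name = "acct10001"
provider_type = "AWS-local"
provider_uuid = "abc123"

account = schema_name[4:]  # "10001": strip the "acct" schema prefix
source_type = provider_type.replace("-local", "")  # "AWS": object storage ignores "-local"
prefix = f"{WAREHOUSE_PATH}/{CSV_DATA_TYPE}/{account}/{source_type}/source={provider_uuid}/"
# prefix == "data/csv/10001/AWS/source=abc123/"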
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context={}
):
    """Copy the monthly IBM report file to S3 under a day-specific path."""
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
def copy_local_report_file_to_s3_bucket(
    request_id, s3_path, full_file_path, local_filename, manifest_id, start_date, context={}
):
    """
    Copies local report file to s3 bucket
    """
    if s3_path and (
        settings.ENABLE_S3_ARCHIVING
        or enable_trino_processing(
            context.get("provider_uuid"), context.get("provider_type"), context.get("account")
        )
    ):
        LOG.info(f"copy_local_report_file_to_s3_bucket: {s3_path} {full_file_path}")
        with open(full_file_path, "rb") as fin:
            copy_data_to_s3_bucket(request_id, s3_path, local_filename, fin, manifest_id, context)
def provider_post_delete_callback(*args, **kwargs):
    """
    Asynchronously delete this Provider's archived data.

    Note: Signal receivers must accept keyword arguments (**kwargs).
    """
    provider = kwargs["instance"]

    if provider.authentication:
        auth_count = (
            Provider.objects.exclude(uuid=provider.uuid).filter(authentication=provider.authentication).count()
        )
        if auth_count == 0:
            provider.authentication.delete()

    if provider.billing_source:
        billing_count = (
            Provider.objects.exclude(uuid=provider.uuid).filter(billing_source=provider.billing_source).count()
        )
        if billing_count == 0:
            provider.billing_source.delete()

    provider_rate_objs = CostModelMap.objects.filter(provider_uuid=provider.uuid)
    if provider_rate_objs:
        provider_rate_objs.delete()

    if not provider.customer:
        LOG.warning("Provider %s has no Customer; we cannot call delete_archived_data.", provider.uuid)
        return

    customer = provider.customer
    customer.date_updated = DateHelper().now_utc
    customer.save()

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider.uuid):
        # Local import of task function to avoid potential import cycle.
        from masu.celery.tasks import delete_archived_data

        delete_func = partial(delete_archived_data.delay, provider.customer.schema_name, provider.type, provider.uuid)
        transaction.on_commit(delete_func)

    refresh_materialized_views(
        provider.customer.schema_name, provider.type, provider_uuid=provider.uuid, synchronous=True
    )
def create_daily_archives(
    tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, last_export_time, context={}
):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The GCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time (str): The last export time of the report; used to hash daily file names
        context (Dict): Logging context dictionary
    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_GCP, provider_uuid, start_of_invoice, Config.CSV_DATA_TYPE
            )
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(
                    tracing_id, s3_csv_path, day_filepath, day_file, manifest_id, start_date, context
                )
                daily_file_names.append(day_filepath)
    return daily_file_names
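# A minimal, self-contained illustration of the split above: rows are grouped by
# "invoice.month", then by the date portion of "partition_date". The column names
# match the function; the sample values are made up.
import pandas as pd

sample = pd.DataFrame(
    {
        "invoice.month": ["202104", "202104", "202105"],
        "partition_date": ["2021-04-01 00:00:00", "2021-04-02 00:00:00", "2021-05-01 00:00:00"],
        "cost": [1.0, 2.0, 3.0],
    }
)

for invoice_month in sample["invoice.month"].unique():
    invoice_data = sample[sample["invoice.month"] == invoice_month]
    days = {cur_dt[:10] for cur_dt in invoice_data.partition_date.unique()}
    for cur_day in sorted(days):
        daily_frame = invoice_data[invoice_data.partition_date.str.contains(cur_day)]
        # Each daily_frame would be written to its own CSV and copied to S3.
        print(invoice_month, cur_day, len(daily_frame))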
def update_summary_tables(  # noqa: C901
    schema_name,
    provider,
    provider_uuid,
    start_date,
    end_date=None,
    manifest_id=None,
    queue_name=None,
    synchronous=False,
    tracing_id=None,
):
    """Populate the summary tables for reporting.

    Args:
        schema_name (str) The DB schema name.
        provider (str) The provider type.
        provider_uuid (str) The provider uuid.
        report_dict (dict) The report data dict from previous task.
        start_date (str) The date to start populating the table.
        end_date (str) The date to end on.

    Returns
        None

    """
    worker_stats.REPORT_SUMMARY_ATTEMPTS_COUNTER.labels(provider_type=provider).inc()
    task_name = "masu.processor.tasks.update_summary_tables"
    cache_args = [schema_name, provider, provider_uuid]
    ocp_on_cloud_infra_map = {}

    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(log_json(tracing_id, msg))
            update_summary_tables.s(
                schema_name,
                provider,
                provider_uuid,
                start_date,
                end_date=end_date,
                manifest_id=manifest_id,
                queue_name=queue_name,
                tracing_id=tracing_id,
            ).apply_async(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE)
            return
        worker_cache.lock_single_task(task_name, cache_args, timeout=settings.WORKER_CACHE_TIMEOUT)

    stmt = (
        f"update_summary_tables called with args: "
        f" schema_name: {schema_name}, "
        f" provider: {provider}, "
        f" start_date: {start_date}, "
        f" end_date: {end_date}, "
        f" manifest_id: {manifest_id}, "
        f" tracing_id: {tracing_id}"
    )
    LOG.info(log_json(tracing_id, stmt))

    try:
        updater = ReportSummaryUpdater(schema_name, provider_uuid, manifest_id, tracing_id)
        start_date, end_date = updater.update_daily_tables(start_date, end_date)
        updater.update_summary_tables(start_date, end_date, tracing_id)
        ocp_on_cloud_infra_map = updater.get_openshift_on_cloud_infra_map(start_date, end_date, tracing_id)
    except ReportSummaryUpdaterCloudError as ex:
        LOG.info(
            log_json(tracing_id, f"Failed to correlate OpenShift metrics for provider: {provider_uuid}. Error: {ex}")
        )
    except ReportSummaryUpdaterProviderNotFoundError as pnf_ex:
        LOG.warning(
            log_json(
                tracing_id,
                (
                    f"{pnf_ex} Possible source/provider delete during processing. "
                    + "Processing for this provider will halt."
                ),
            )
        )
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        return
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex

    if not provider_uuid:
        refresh_materialized_views.s(
            schema_name, provider, manifest_id=manifest_id, queue_name=queue_name, tracing_id=tracing_id
        ).apply_async(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
        return

    if enable_trino_processing(provider_uuid, provider, schema_name) and provider in (
        Provider.PROVIDER_AWS,
        Provider.PROVIDER_AWS_LOCAL,
        Provider.PROVIDER_AZURE,
        Provider.PROVIDER_AZURE_LOCAL,
    ):
        cost_model = None
        stmt = (
            f"Markup for {provider} is calculated during summarization. No need to run update_cost_model_costs"
            f" schema_name: {schema_name}, "
            f" provider_uuid: {provider_uuid}"
        )
        LOG.info(log_json(tracing_id, stmt))
    else:
        with CostModelDBAccessor(schema_name, provider_uuid) as cost_model_accessor:
            cost_model = cost_model_accessor.cost_model

    # Create queued tasks for each OpenShift on Cloud cluster
    signature_list = []
    for openshift_provider_uuid, infrastructure_tuple in ocp_on_cloud_infra_map.items():
        infra_provider_uuid = infrastructure_tuple[0]
        infra_provider_type = infrastructure_tuple[1]
        signature_list.append(
            update_openshift_on_cloud.s(
                schema_name,
                openshift_provider_uuid,
                infra_provider_uuid,
                infra_provider_type,
                str(start_date),
                str(end_date),
                manifest_id=manifest_id,
                queue_name=queue_name,
                synchronous=synchronous,
                tracing_id=tracing_id,
            ).set(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE)
        )

    # Apply OCP on Cloud tasks
    if signature_list:
        if synchronous:
            group(signature_list).apply()
        else:
            group(signature_list).apply_async()

    if cost_model is not None:
        linked_tasks = update_cost_model_costs.s(
            schema_name, provider_uuid, start_date, end_date, tracing_id=tracing_id
        ).set(queue=queue_name or UPDATE_COST_MODEL_COSTS_QUEUE) | refresh_materialized_views.si(
            schema_name, provider, provider_uuid=provider_uuid, manifest_id=manifest_id, tracing_id=tracing_id
        ).set(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
    else:
        stmt = f"update_cost_model_costs skipped. schema_name: {schema_name}, provider_uuid: {provider_uuid}"
        LOG.info(log_json(tracing_id, stmt))
        linked_tasks = refresh_materialized_views.s(
            schema_name, provider, provider_uuid=provider_uuid, manifest_id=manifest_id, tracing_id=tracing_id
        ).set(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)

    chain(linked_tasks).apply_async()

    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
def update_summary_tables(  # noqa: C901
    schema_name,
    provider,
    provider_uuid,
    start_date,
    end_date=None,
    manifest_id=None,
    queue_name=None,
    synchronous=False,
):
    """Populate the summary tables for reporting.

    Args:
        schema_name (str) The DB schema name.
        provider (str) The provider type.
        provider_uuid (str) The provider uuid.
        report_dict (dict) The report data dict from previous task.
        start_date (str) The date to start populating the table.
        end_date (str) The date to end on.

    Returns
        None

    """
    worker_stats.REPORT_SUMMARY_ATTEMPTS_COUNTER.labels(provider_type=provider).inc()
    task_name = "masu.processor.tasks.update_summary_tables"
    cache_args = [schema_name]

    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(msg)
            update_summary_tables.s(
                schema_name,
                provider,
                provider_uuid,
                start_date,
                end_date=end_date,
                manifest_id=manifest_id,
                queue_name=queue_name,
            ).apply_async(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE)
            return
        worker_cache.lock_single_task(task_name, cache_args, timeout=3600)

    stmt = (
        f"update_summary_tables called with args:\n"
        f" schema_name: {schema_name},\n"
        f" provider: {provider},\n"
        f" start_date: {start_date},\n"
        f" end_date: {end_date},\n"
        f" manifest_id: {manifest_id}"
    )
    LOG.info(stmt)

    try:
        updater = ReportSummaryUpdater(schema_name, provider_uuid, manifest_id)
        start_date, end_date = updater.update_daily_tables(start_date, end_date)
        updater.update_summary_tables(start_date, end_date)
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex

    if not provider_uuid:
        refresh_materialized_views.s(
            schema_name, provider, manifest_id=manifest_id, queue_name=queue_name
        ).apply_async(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
        return

    if enable_trino_processing(provider_uuid) and provider in (
        Provider.PROVIDER_AWS,
        Provider.PROVIDER_AWS_LOCAL,
        Provider.PROVIDER_AZURE,
        Provider.PROVIDER_AZURE_LOCAL,
    ):
        cost_model = None
        stmt = (
            f"\n Markup for {provider} is calculated during summarization. No need to run update_cost_model_costs\n"
            f" schema_name: {schema_name},\n"
            f" provider_uuid: {provider_uuid}"
        )
        LOG.info(stmt)
    else:
        with CostModelDBAccessor(schema_name, provider_uuid) as cost_model_accessor:
            cost_model = cost_model_accessor.cost_model

    if cost_model is not None:
        linked_tasks = update_cost_model_costs.s(schema_name, provider_uuid, start_date, end_date).set(
            queue=queue_name or UPDATE_COST_MODEL_COSTS_QUEUE
        ) | refresh_materialized_views.si(
            schema_name, provider, provider_uuid=provider_uuid, manifest_id=manifest_id
        ).set(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
    else:
        stmt = (
            f"\n update_cost_model_costs skipped.\n"
            f" schema_name: {schema_name},\n"
            f" provider_uuid: {provider_uuid}"
        )
        LOG.info(stmt)
        linked_tasks = refresh_materialized_views.s(
            schema_name, provider, provider_uuid=provider_uuid, manifest_id=manifest_id
        ).set(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)

    dh = DateHelper(utc=True)
    prev_month_start_day = dh.last_month_start.replace(tzinfo=None).date()
    if isinstance(start_date, str):
        start_date = ciso8601.parse_datetime(start_date).date()
    if manifest_id and (start_date <= prev_month_start_day):
        # We want to make sure that the manifest_id is not None, because
        # we only want to call the line item delete after the summarize_reports
        # task above
        simulate = False
        line_items_only = True
        linked_tasks |= remove_expired_data.si(
            schema_name, provider, simulate, provider_uuid, line_items_only, queue_name
        ).set(queue=queue_name or REMOVE_EXPIRED_DATA_QUEUE)

    chain(linked_tasks).apply_async()

    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
def trino_enabled(self):
    """Return whether the source is enabled for Trino processing."""
    return enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name)
def convert_to_parquet(self):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.
    """
    parquet_base_filename = ""
    daily_data_frame = pd.DataFrame()

    if not enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name):
        msg = "Skipping convert_to_parquet. Parquet processing is disabled."
        LOG.info(log_json(self.request_id, msg, self.error_context))
        return "", pd.DataFrame()

    if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
        )
        LOG.error(log_json(self.request_id, msg, self.error_context))
        return "", pd.DataFrame()

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated.
    if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
        Provider.PROVIDER_OCP,
        Provider.PROVIDER_GCP,
        Provider.PROVIDER_GCP_LOCAL,
    ):
        remove_files_not_in_set_from_s3_bucket(
            self.request_id, self.parquet_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.request_id, self.parquet_daily_path_s3, self.manifest_id, self.error_context
        )
        manifest_accessor.mark_s3_parquet_cleared(manifest)

    failed_conversion = []
    for csv_filename in self.file_list:
        if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
            msg = f"Could not establish report type for {csv_filename}."
            LOG.warn(log_json(self.request_id, msg, self.error_context))
            failed_conversion.append(csv_filename)
            continue
        parquet_base_filename, daily_data_frame, success = self.convert_csv_to_parquet(csv_filename)
        if self.provider_type not in (Provider.PROVIDER_AZURE, Provider.PROVIDER_GCP):
            self.create_daily_parquet(parquet_base_filename, daily_data_frame)
        if not success:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet: {','.join(failed_conversion)}."
        LOG.warn(log_json(self.request_id, msg, self.error_context))

    return parquet_base_filename, daily_data_frame
def convert_to_parquet(  # noqa: C901
    self, request_id, account, provider_uuid, provider_type, start_date, manifest_id, files=[], context={}
):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not enable_trino_processing(provider_uuid):
        msg = "Skipping convert_to_parquet. Parquet processing is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "Parquet processing is enabled, but no start_date was given for processing."
        LOG.warn(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "Parquet processing is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_type, provider_uuid, cost_date, Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_type, provider_uuid, cost_date, Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = self.get_file_keys_from_s3_with_manifest_id(request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "Parquet processing is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated.
    if provider_type not in (Provider.PROVIDER_OCP, Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path, manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor
    elif provider_type in [Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL]:
        post_processor = azure_post_processor
    elif provider_type in [Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL]:
        post_processor = gcp_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        parquet_report_type = None
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = get_path_prefix(
                        account,
                        provider_type,
                        provider_uuid,
                        cost_date,
                        Config.PARQUET_DATA_TYPE,
                        report_type=report_type,
                    )
                    kwargs["report_type"] = report_type
                    parquet_report_type = report_type
                    break
            if parquet_report_type is None:
                msg = f"Could not establish report type for {csv_filename}."
                LOG.warn(log_json(request_id, msg, context))
                continue

        converters = get_column_converters(provider_type, **kwargs)
        result = self.convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
            parquet_report_type,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet: {','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
    return