def test_disk_status_logging(self, fake_downloader):
    """Test task for logging when temp directory exists."""
    logging.disable(logging.NOTSET)
    os.makedirs(Config.TMP_DIR, exist_ok=True)
    account = fake_arn(service="iam", generate_account_id=True)
    expected = "INFO:masu.processor._tasks.download:Available disk space"
    with self.assertLogs("masu.processor._tasks.download", level="INFO") as logger:
        _get_report_files(
            Mock(),
            customer_name=self.fake.word(),
            authentication=account,
            provider_type=Provider.PROVIDER_AWS,
            report_month=DateHelper().today,
            provider_uuid=self.aws_provider_uuid,
            billing_source=self.fake.word(),
            cache_key=self.fake.word(),
        )
    statement_found = False
    for log in logger.output:
        if expected in log:
            statement_found = True
    self.assertTrue(statement_found)
    shutil.rmtree(Config.TMP_DIR, ignore_errors=True)
def test_get_report_exception(self, fake_downloader):
    """Test task."""
    account = fake_arn(service='iam', generate_account_id=True)
    with self.assertRaises(Exception):
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            billing_source=self.fake.word(),
        )
def test_get_report_update_status(self, fake_downloader, fake_status):
    """Test that status is updated when downloading is complete."""
    account = fake_arn(service='iam', generate_account_id=True)
    _get_report_files(
        customer_name=self.fake.word(),
        authentication=account,
        provider_type='AWS',
        report_name=self.fake.word(),
        provider_uuid=self.aws_test_provider_uuid,
        billing_source=self.fake.word(),
    )
    fake_status.assert_called_with(ProviderStatusCode.READY)
def test_get_report_exception(self, fake_downloader):
    """Test task."""
    account = fake_arn(service="iam", generate_account_id=True)
    with self.assertRaises(Exception):
        _get_report_files(
            Mock(),
            customer_name=self.fake.word(),
            authentication=account,
            provider_type=Provider.PROVIDER_AWS,
            report_month=DateAccessor().today(),
            provider_uuid=self.aws_provider_uuid,
            billing_source=self.fake.word(),
        )
def test_get_report_update_status(self, fake_downloader, fake_status):
    """Test that status is updated when downloading is complete."""
    account = fake_arn(service="iam", generate_account_id=True)
    _get_report_files(
        Mock(),
        customer_name=self.fake.word(),
        authentication=account,
        provider_type=Provider.PROVIDER_AWS,
        report_month=DateAccessor().today(),
        provider_uuid=self.aws_provider_uuid,
        billing_source=self.fake.word(),
    )
    fake_status.assert_called_with(ProviderStatusCode.READY)
def test_disk_status_logging_no_dir(self, fake_downloader):
    """Test task for logging when temp directory does not exist."""
    logging.disable(logging.NOTSET)
    shutil.rmtree(Config.TMP_DIR, ignore_errors=True)
    account = fake_arn(service='iam', generate_account_id=True)
    # The expected string mirrors the log message verbatim, including its
    # spelling of "avaiable" at this revision.
    expected = 'INFO:masu.processor._tasks.download:Unable to find avaiable disk space. {} does not exist'.format(Config.TMP_DIR)
    with self.assertLogs('masu.processor._tasks.download', level='INFO') as logger:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            billing_source=self.fake.word(),
        )
    self.assertIn(expected, logger.output)
def test_get_report_exception_update_status(self, fake_downloader, fake_status):
    """Test that status is updated when an exception is raised."""
    account = fake_arn(service='iam', generate_account_id=True)
    try:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            provider_uuid=self.aws_test_provider_uuid,
            billing_source=self.fake.word(),
        )
    except ReportDownloaderError:
        pass
    fake_status.assert_called()
def test_get_report_without_override(self, fake_accessor, fake_report_files):
    """Test _get_report_files for two months."""
    initial_month_qty = 2
    account = fake_arn(service='iam', generate_account_id=True)
    with patch.object(ReportDownloader, 'get_reports') as download_call:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            provider_uuid=self.aws_test_provider_uuid,
            billing_source=self.fake.word(),
        )
        download_call.assert_called_with(initial_month_qty)
def test_disk_status_logging_no_dir(self, fake_downloader):
    """Test task for logging when temp directory does not exist."""
    logging.disable(logging.NOTSET)
    Config.TMP_DIR = '/this/path/does/not/exist'
    account = fake_arn(service='iam', generate_account_id=True)
    expected = ('INFO:masu.processor._tasks.download:Unable to find'
                f' available disk space. {Config.TMP_DIR} does not exist')
    with self.assertLogs('masu.processor._tasks.download', level='INFO') as logger:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            provider_uuid=self.aws_test_provider_uuid,
            billing_source=self.fake.word(),
        )
    self.assertIn(expected, logger.output)
def test_get_report_exception_update_status(self, fake_downloader, fake_status):
    """Test that status is updated when an exception is raised."""
    account = fake_arn(service="iam", generate_account_id=True)
    try:
        _get_report_files(
            Mock(),
            customer_name=self.fake.word(),
            authentication=account,
            provider_type=Provider.PROVIDER_AWS,
            report_month=DateAccessor().today(),
            provider_uuid=self.aws_provider_uuid,
            billing_source=self.fake.word(),
        )
    except ReportDownloaderError:
        pass
    fake_status.assert_called()
def test_get_report_with_override(self, fake_accessor, fake_report_files):
    """Test _get_report_files on non-initial load with override set."""
    Config.INGEST_OVERRIDE = True
    Config.INITIAL_INGEST_NUM_MONTHS = 5
    initial_month_qty = Config.INITIAL_INGEST_NUM_MONTHS
    account = fake_arn(service='iam', generate_account_id=True)
    with patch.object(ReportDownloader, 'get_reports') as download_call:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            provider_uuid=self.aws_test_provider_uuid,
            billing_source=self.fake.word(),
        )
        download_call.assert_called_with(initial_month_qty)
    Config.INGEST_OVERRIDE = False
    Config.INITIAL_INGEST_NUM_MONTHS = 2
def test_get_report(self, fake_downloader):
    """Test task."""
    account = fake_arn(service='iam', generate_account_id=True)
    report = _get_report_files(
        customer_name=self.fake.word(),
        authentication=account,
        provider_type='AWS',
        report_name=self.fake.word(),
        billing_source=self.fake.word(),
    )
    self.assertIsInstance(report, list)
    self.assertGreater(len(report), 0)
def test_disk_status_logging(self, fake_downloader):
    """Test task for logging when temp directory exists."""
    logging.disable(logging.NOTSET)
    os.makedirs(Config.TMP_DIR, exist_ok=True)
    account = fake_arn(service='iam', generate_account_id=True)
    # The expected string mirrors the log message verbatim, including its
    # spelling of "Avaiable" at this revision.
    expected = 'INFO:masu.processor._tasks.download:Avaiable disk space'
    with self.assertLogs('masu.processor._tasks.download', level='INFO') as logger:
        _get_report_files(
            customer_name=self.fake.word(),
            authentication=account,
            provider_type='AWS',
            report_name=self.fake.word(),
            billing_source=self.fake.word(),
        )
    statement_found = False
    for log in logger.output:
        if expected in log:
            statement_found = True
    self.assertTrue(statement_found)
    shutil.rmtree(Config.TMP_DIR, ignore_errors=True)
def test_disk_status_logging_no_dir(self, fake_downloader):
    """Test task for logging when temp directory does not exist."""
    logging.disable(logging.NOTSET)
    Config.PVC_DIR = "/this/path/does/not/exist"
    account = fake_arn(service="iam", generate_account_id=True)
    expected = (
        "INFO:masu.processor._tasks.download:Unable to find"
        + f" available disk space. {Config.PVC_DIR} does not exist"
    )
    with self.assertLogs("masu.processor._tasks.download", level="INFO") as logger:
        _get_report_files(
            Mock(),
            customer_name=self.fake.word(),
            authentication=account,
            provider_type=Provider.PROVIDER_AWS,
            report_month=DateAccessor().today(),
            provider_uuid=self.aws_provider_uuid,
            billing_source=self.fake.word(),
        )
    self.assertIn(expected, logger.output)
def test_get_report(self, fake_downloader):
    """Test task."""
    account = fake_arn(service="iam", generate_account_id=True)
    report = _get_report_files(
        Mock(),
        customer_name=self.fake.word(),
        authentication=account,
        provider_type=Provider.PROVIDER_AWS,
        report_month=DateAccessor().today(),
        provider_uuid=self.aws_provider_uuid,
        billing_source=self.fake.word(),
    )
    self.assertIsInstance(report, list)
    self.assertGreater(len(report), 0)
def get_report_files(self, customer_name, authentication, billing_source,
                     provider_type, schema_name, provider_uuid, report_month):
    """
    Task to download a Report and process the report.

    FIXME: A 2 hour timeout is arbitrarily set for in progress processing
    requests. Once we know a realistic processing time for the largest CUR file
    in production this value can be adjusted or made configurable.

    Args:
        customer_name  (String): Name of the customer owning the cost usage report.
        authentication (String): Credential needed to access cost usage report
                                 in the backend provider.
        billing_source (String): Location of the cost usage report in the backend provider.
        provider_type  (String): Koku defined provider type string. Example: Amazon = 'AWS'
        schema_name    (String): Name of the DB schema

    Returns:
        None

    """
    worker_stats.GET_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
    month = parser.parse(report_month)
    reports = _get_report_files(self, customer_name, authentication, billing_source,
                                provider_type, provider_uuid, month)

    try:
        stmt = (
            f"Reports to be processed:\n"
            f" schema_name: {customer_name}\n"
            f" provider: {provider_type}\n"
            f" provider_uuid: {provider_uuid}\n"
        )
        for report in reports:
            stmt += " file: " + str(report["file"]) + "\n"
        LOG.info(stmt[:-1])

        reports_to_summarize = []
        for report_dict in reports:
            manifest_id = report_dict.get("manifest_id")
            file_name = os.path.basename(report_dict.get("file"))
            with ReportStatsDBAccessor(file_name, manifest_id) as stats:
                started_date = stats.get_last_started_datetime()
                completed_date = stats.get_last_completed_datetime()

            # Skip processing if already in progress.
            if started_date and not completed_date:
                expired_start_date = started_date + datetime.timedelta(hours=2)
                if DateAccessor().today_with_timezone("UTC") < expired_start_date:
                    LOG.info(
                        "Skipping processing task for %s since it was started at: %s.",
                        file_name,
                        str(started_date),
                    )
                    continue

            # Skip processing if complete.
            if started_date and completed_date:
                LOG.info(
                    "Skipping processing task for %s. Started on: %s and completed on: %s.",
                    file_name,
                    str(started_date),
                    str(completed_date),
                )
                continue

            stmt = (
                f"Processing starting:\n"
                f" schema_name: {customer_name}\n"
                f" provider: {provider_type}\n"
                f" provider_uuid: {provider_uuid}\n"
                f' file: {report_dict.get("file")}'
            )
            LOG.info(stmt)
            worker_stats.PROCESS_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
            _process_report_file(schema_name, provider_type, provider_uuid, report_dict)

            report_meta = {}
            known_manifest_ids = [report.get("manifest_id") for report in reports_to_summarize]
            if report_dict.get("manifest_id") not in known_manifest_ids:
                report_meta["schema_name"] = schema_name
                report_meta["provider_type"] = provider_type
                report_meta["provider_uuid"] = provider_uuid
                report_meta["manifest_id"] = report_dict.get("manifest_id")
                reports_to_summarize.append(report_meta)
    except ReportProcessorError as processing_error:
        worker_stats.PROCESS_REPORT_ERROR_COUNTER.labels(provider_type=provider_type).inc()
        LOG.error(str(processing_error))
        raise processing_error

    return reports_to_summarize
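# --- Sketch: the skip-if-in-progress rule used above, isolated. ---
# A minimal sketch assuming naive UTC datetimes; PROCESSING_TIMEOUT and
# should_skip are illustrative names, not part of masu.
import datetime

PROCESSING_TIMEOUT = datetime.timedelta(hours=2)


def should_skip(started_date, completed_date, now):
    """Mirror the two skip branches: completed files and in-flight files."""
    if started_date and completed_date:
        return True  # already processed to completion
    if started_date and now < started_date + PROCESSING_TIMEOUT:
        return True  # presumed still in progress on another worker
    return False


# A file started 1 hour ago with no completion timestamp is skipped; one
# started 3 hours ago has exceeded the timeout and is reprocessed.
now = datetime.datetime(2020, 1, 1, 12, 0)
assert should_skip(now - datetime.timedelta(hours=1), None, now) is True
assert should_skip(now - datetime.timedelta(hours=3), None, now) is False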
def get_report_files(self, customer_name, authentication, billing_source,
                     provider_type, schema_name, provider_uuid, report_month):
    """
    Task to download a Report and process the report.

    FIXME: A 2 hour timeout is arbitrarily set for in progress processing
    requests. Once we know a realistic processing time for the largest CUR file
    in production this value can be adjusted or made configurable.

    Args:
        customer_name  (String): Name of the customer owning the cost usage report.
        authentication (String): Credential needed to access cost usage report
                                 in the backend provider.
        billing_source (String): Location of the cost usage report in the backend provider.
        provider_type  (String): Koku defined provider type string. Example: Amazon = 'AWS'
        schema_name    (String): Name of the DB schema

    Returns:
        None

    """
    worker_stats.GET_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
    month = report_month
    if isinstance(report_month, str):
        month = parser.parse(report_month)
    cache_key = f"{provider_uuid}:{month}"
    reports = _get_report_files(self, customer_name, authentication, billing_source,
                                provider_type, provider_uuid, month, cache_key)

    stmt = (
        f"Reports to be processed:\n"
        f" schema_name: {customer_name}\n"
        f" provider: {provider_type}\n"
        f" provider_uuid: {provider_uuid}\n"
    )
    for report in reports:
        stmt += " file: " + str(report["file"]) + "\n"
    LOG.info(stmt[:-1])

    reports_to_summarize = []
    start_date = None
    for report_dict in reports:
        with transaction.atomic():
            try:
                manifest_id = report_dict.get("manifest_id")
                file_name = os.path.basename(report_dict.get("file"))
                with ReportStatsDBAccessor(file_name, manifest_id) as stats:
                    started_date = stats.get_last_started_datetime()
                    completed_date = stats.get_last_completed_datetime()

                # Skip processing if already in progress.
                if started_date and not completed_date:
                    expired_start_date = started_date + datetime.timedelta(
                        hours=Config.REPORT_PROCESSING_TIMEOUT_HOURS
                    )
                    if DateAccessor().today_with_timezone("UTC") < expired_start_date:
                        LOG.info(
                            "Skipping processing task for %s since it was started at: %s.",
                            file_name,
                            str(started_date),
                        )
                        continue

                stmt = (
                    f"Processing starting:\n"
                    f" schema_name: {customer_name}\n"
                    f" provider: {provider_type}\n"
                    f" provider_uuid: {provider_uuid}\n"
                    f' file: {report_dict.get("file")}'
                )
                LOG.info(stmt)
                if not start_date:
                    start_date = report_dict.get("start_date")
                worker_stats.PROCESS_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
                _process_report_file(schema_name, provider_type, provider_uuid, report_dict)

                known_manifest_ids = [report.get("manifest_id") for report in reports_to_summarize]
                if report_dict.get("manifest_id") not in known_manifest_ids:
                    report_meta = {
                        "schema_name": schema_name,
                        "provider_type": provider_type,
                        "provider_uuid": provider_uuid,
                        "manifest_id": report_dict.get("manifest_id"),
                    }
                    reports_to_summarize.append(report_meta)
            except (ReportProcessorError, ReportProcessorDBError) as processing_error:
                worker_stats.PROCESS_REPORT_ERROR_COUNTER.labels(provider_type=provider_type).inc()
                LOG.error(str(processing_error))
                WorkerCache().remove_task_from_cache(cache_key)
                raise processing_error

    WorkerCache().remove_task_from_cache(cache_key)

    if start_date:
        start_date_str = start_date.strftime("%Y-%m-%d")
        convert_to_parquet.delay(self.request.id, schema_name[4:], provider_uuid,
                                 provider_type, start_date_str, manifest_id)

    return reports_to_summarize
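# --- Sketch: the schema_name[4:] trim used when queuing convert_to_parquet. ---
# The slice assumes schema names carry a 4-character "acct" prefix ahead of the
# account id; the literal value below is a hypothetical example, not real data.
schema_name_example = "acct10001"
account_id = schema_name_example[4:]  # strips the assumed "acct" prefix
assert account_id == "10001"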
def get_report_files(customer_name, authentication, billing_source,
                     provider_type, schema_name, report_name=None):
    """
    Task to download a Report.

    Note that report_name will not be optional once Koku can specify
    what report we should download.

    FIXME: A 2 hour timeout is arbitrarily set for in progress processing
    requests. Once we know a realistic processing time for the largest CUR file
    in production this value can be adjusted or made configurable.

    Args:
        customer_name  (String): Name of the customer owning the cost usage report.
        authentication (String): Credential needed to access cost usage report
                                 in the backend provider.
        billing_source (String): Location of the cost usage report in the backend provider.
        provider_type  (String): Koku defined provider type string. Example: Amazon = 'AWS'
        schema_name    (String): Name of the DB schema
        report_name    (String): Name of the cost usage report to download.

    Returns:
        files (List) List of filenames with full local path.
               Example: ['/var/tmp/masu/my-report-name/aws/my-report-file.csv',
                         '/var/tmp/masu/other-report-name/aws/other-report-file.csv']

    """
    reports = _get_report_files(customer_name, authentication, billing_source,
                                provider_type, report_name)

    # initiate chained async task
    LOG.info('Reports to be processed: %s', str(reports))
    for report_dict in reports:
        file_name = os.path.basename(report_dict.get('file'))
        stats = ReportStatsDBAccessor(file_name)
        started_date = stats.get_last_started_datetime()
        completed_date = stats.get_last_completed_datetime()
        stats.close_session()

        # Skip processing if already in progress.
        if started_date and not completed_date:
            expired_start_date = (started_date + datetime.timedelta(hours=2))\
                .replace(tzinfo=pytz.UTC)
            if DateAccessor().today().replace(tzinfo=pytz.UTC) < expired_start_date:
                LOG.info('Skipping processing task for %s since it was started at: %s.',
                         file_name, str(started_date))
                continue

        # Skip processing if complete.
        if started_date and completed_date:
            LOG.info('Skipping processing task for %s. Started on: %s and completed on: %s.',
                     file_name, str(started_date), str(completed_date))
            continue

        request = {'schema_name': schema_name,
                   'report_path': report_dict.get('file'),
                   'compression': report_dict.get('compression')}
        result = process_report_file.delay(**request)
        LOG.info('Processing task queued - File: %s, Task ID: %s',
                 report_dict.get('file'), str(result))
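# --- Sketch: queuing this task. ---
# The process_report_file.delay() call above suggests these functions run under
# Celery, so a caller would presumably queue get_report_files the same way.
# Every argument value here is a placeholder (the ARN is fake), not real data.
async_result = get_report_files.delay(
    customer_name="acct10001",
    authentication="arn:aws:iam::111111111111:role/CostManagement",
    billing_source="example-cur-bucket",
    provider_type="AWS",
    schema_name="acct10001",
)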
def get_report_files(  # noqa: C901
    self,
    customer_name,
    authentication,
    billing_source,
    provider_type,
    schema_name,
    provider_uuid,
    report_month,
    report_context,
    tracing_id=None,
):
    """
    Task to download a Report and process the report.

    FIXME: A 2 hour timeout is arbitrarily set for in progress processing
    requests. Once we know a realistic processing time for the largest CUR file
    in production this value can be adjusted or made configurable.

    Args:
        customer_name  (String): Name of the customer owning the cost usage report.
        authentication (String): Credential needed to access cost usage report
                                 in the backend provider.
        billing_source (String): Location of the cost usage report in the backend provider.
        provider_type  (String): Koku defined provider type string. Example: Amazon = 'AWS'
        schema_name    (String): Name of the DB schema

    Returns:
        None

    """
    try:
        worker_stats.GET_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
        month = report_month
        if isinstance(report_month, str):
            month = parser.parse(report_month)
        report_file = report_context.get("key")
        cache_key = f"{provider_uuid}:{report_file}"
        tracing_id = report_context.get("assembly_id", "no-tracing-id")
        WorkerCache().add_task_to_cache(cache_key)
        context = {"account": customer_name[4:], "provider_uuid": provider_uuid}
        try:
            report_dict = _get_report_files(
                tracing_id,
                customer_name,
                authentication,
                billing_source,
                provider_type,
                provider_uuid,
                month,
                report_context,
            )
        except (MasuProcessingError, MasuProviderError, ReportDownloaderError) as err:
            worker_stats.REPORT_FILE_DOWNLOAD_ERROR_COUNTER.labels(provider_type=provider_type).inc()
            WorkerCache().remove_task_from_cache(cache_key)
            LOG.warning(log_json(tracing_id, str(err), context))
            return

        stmt = (
            f"Reports to be processed: "
            f" schema_name: {customer_name} "
            f" provider: {provider_type} "
            f" provider_uuid: {provider_uuid}"
        )
        if report_dict:
            stmt += f" file: {report_dict['file']}"
            LOG.info(log_json(tracing_id, stmt, context))
        else:
            WorkerCache().remove_task_from_cache(cache_key)
            return None

        report_meta = {
            "schema_name": schema_name,
            "provider_type": provider_type,
            "provider_uuid": provider_uuid,
            "manifest_id": report_dict.get("manifest_id"),
            "tracing_id": tracing_id,
        }

        try:
            stmt = (
                f"Processing starting: "
                f" schema_name: {customer_name} "
                f" provider: {provider_type} "
                f" provider_uuid: {provider_uuid} "
                f' file: {report_dict.get("file")}'
            )
            LOG.info(log_json(tracing_id, stmt))
            worker_stats.PROCESS_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
            report_dict["tracing_id"] = tracing_id
            report_dict["provider_type"] = provider_type
            _process_report_file(schema_name, provider_type, report_dict)
        except (ReportProcessorError, ReportProcessorDBError) as processing_error:
            worker_stats.PROCESS_REPORT_ERROR_COUNTER.labels(provider_type=provider_type).inc()
            LOG.error(log_json(tracing_id, str(processing_error), context))
            WorkerCache().remove_task_from_cache(cache_key)
            raise processing_error
        except NotImplementedError as err:
            LOG.info(log_json(tracing_id, str(err), context))
            WorkerCache().remove_task_from_cache(cache_key)

        WorkerCache().remove_task_from_cache(cache_key)
        return report_meta
    except ReportDownloaderWarning as err:
        LOG.warning(log_json(tracing_id, str(err), context))
        WorkerCache().remove_task_from_cache(cache_key)
    except Exception as err:
        worker_stats.PROCESS_REPORT_ERROR_COUNTER.labels(provider_type=provider_type).inc()
        LOG.error(log_json(tracing_id, str(err), context))
        WorkerCache().remove_task_from_cache(cache_key)
def get_report_files(
    self,
    customer_name,
    authentication,
    billing_source,
    provider_type,
    schema_name,
    provider_uuid,
    report_month,
    report_context,
):
    """
    Task to download a Report and process the report.

    FIXME: A 2 hour timeout is arbitrarily set for in progress processing
    requests. Once we know a realistic processing time for the largest CUR file
    in production this value can be adjusted or made configurable.

    Args:
        customer_name  (String): Name of the customer owning the cost usage report.
        authentication (String): Credential needed to access cost usage report
                                 in the backend provider.
        billing_source (String): Location of the cost usage report in the backend provider.
        provider_type  (String): Koku defined provider type string. Example: Amazon = 'AWS'
        schema_name    (String): Name of the DB schema

    Returns:
        None

    """
    worker_stats.GET_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
    month = report_month
    if isinstance(report_month, str):
        month = parser.parse(report_month)
    cache_key = f"{provider_uuid}:{month.date()}"
    WorkerCache().add_task_to_cache(cache_key)
    report_dict = _get_report_files(
        self,
        customer_name,
        authentication,
        billing_source,
        provider_type,
        provider_uuid,
        month,
        cache_key,
        report_context,
    )

    stmt = (
        f"Reports to be processed:\n"
        f" schema_name: {customer_name}\n"
        f" provider: {provider_type}\n"
        f" provider_uuid: {provider_uuid}\n"
    )
    if report_dict:
        stmt += f" file: {report_dict['file']}"
        LOG.info(stmt)
    else:
        return None

    try:
        stmt = (
            f"Processing starting:\n"
            f" schema_name: {customer_name}\n"
            f" provider: {provider_type}\n"
            f" provider_uuid: {provider_uuid}\n"
            f' file: {report_dict.get("file")}'
        )
        LOG.info(stmt)
        worker_stats.PROCESS_REPORT_ATTEMPTS_COUNTER.labels(provider_type=provider_type).inc()
        _process_report_file(schema_name, provider_type, report_dict)
        report_meta = {
            "schema_name": schema_name,
            "provider_type": provider_type,
            "provider_uuid": provider_uuid,
            "manifest_id": report_dict.get("manifest_id"),
        }
    except (ReportProcessorError, ReportProcessorDBError) as processing_error:
        worker_stats.PROCESS_REPORT_ERROR_COUNTER.labels(provider_type=provider_type).inc()
        LOG.error(str(processing_error))
        WorkerCache().remove_task_from_cache(cache_key)
        raise processing_error

    WorkerCache().remove_task_from_cache(cache_key)

    start_date = report_dict.get("start_date")
    manifest_id = report_dict.get("manifest_id")
    if start_date:
        start_date_str = start_date.strftime("%Y-%m-%d")
        convert_to_parquet.delay(
            self.request.id,
            schema_name[4:],
            provider_uuid,
            provider_type,
            start_date_str,
            manifest_id,
            [report_context.get("local_file")],
        )

    return report_meta
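# --- Sketch: the report_month normalization used by the newer task versions. ---
# Strings are parsed with dateutil and datetimes pass through, so the
# f"{provider_uuid}:{month.date()}" cache key is stable for either input type.
# normalize_month is an illustrative name, not part of masu.
import datetime
from dateutil import parser


def normalize_month(report_month):
    """Accept either a date string or a datetime and return a datetime."""
    if isinstance(report_month, str):
        return parser.parse(report_month)
    return report_month


assert normalize_month("2021-06-01") == datetime.datetime(2021, 6, 1)
assert str(normalize_month("2021-06-01").date()) == "2021-06-01"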