def check_if_manifest_should_be_downloaded(self, assembly_id):
    """Decide whether the manifest for ``assembly_id`` must be downloaded.

    A manifest without a database record has never been handled, so it is
    downloaded. A recorded manifest is downloaded again only when it still
    has unprocessed files and its last completed report is more than one
    hour old; in that case the manifest is reset before returning.

    Args:
        assembly_id: Assembly identifier used, together with this
            downloader's provider id, to look up the manifest record.

    Returns:
        bool: True if the manifest should be downloaded and processed.
    """
    now_utc = DateAccessor().today_with_timezone('UTC')
    stale_before = now_utc - datetime.timedelta(hours=1)

    with ReportManifestDBAccessor() as manifest_accessor:
        manifest = manifest_accessor.get_manifest(assembly_id, self._provider_id)
        if not manifest:
            # No record yet: this is the first time the manifest is seen.
            return True

        record_id = manifest.id
        files_done = manifest.num_processed_files
        files_expected = manifest.num_total_files

        if files_done < files_expected:
            last_completed = manifest_accessor.get_last_report_completed_datetime(record_id)
            if last_completed and last_completed < stale_before:
                # Processing stalled for over an hour without finishing,
                # so start over with a fresh download.
                manifest_accessor.reset_manifest(record_id)
                return True

        # Either everything was processed or processing is still recent.
        return False
def summarize_reports(reports_to_summarize):
    """
    Summarize reports returned from line summary task.

    Duplicate report dicts are collapsed first. For every remaining report
    whose manifest is ready for summary, an ``update_summary_tables`` task is
    queued covering the window from two days ago through today.

    Args:
        reports_to_summarize (list): report dicts; the keys read here are
            ``schema_name``, ``provider_type``, ``provider_uuid`` and
            ``manifest_id``. Values must be hashable because the
            de-duplication goes through a set of item tuples.

    Returns:
        None

    """
    # A set of item tuples drops exact duplicate dicts; ordering of the
    # resulting list is not defined.
    reports_deduplicated = [dict(t) for t in {tuple(d.items()) for d in reports_to_summarize}]

    for report in reports_deduplicated:
        manifest_id = report.get("manifest_id")
        # For day-to-day summarization we choose a small window to
        # cover new data from a window of days.
        # This saves us from re-summarizing unchanged data and cuts down
        # on processing time. There are override mechanisms in the
        # Updater classes for when full-month summarization is
        # required.
        with ReportManifestDBAccessor() as manifest_accesor:
            if not manifest_accesor.manifest_ready_for_summary(manifest_id):
                continue

            # Read "today" once so both ends of the window come from the
            # same instant, even when the call happens around midnight.
            today = DateAccessor().today()
            start_date = (today - datetime.timedelta(days=2)).strftime("%Y-%m-%d")
            end_date = today.strftime("%Y-%m-%d")

            LOG.info("report to summarize: %s", str(report))
            update_summary_tables.delay(
                report.get("schema_name"),
                report.get("provider_type"),
                report.get("provider_uuid"),
                start_date=start_date,
                end_date=end_date,
                manifest_id=manifest_id,
            )
def setUpClass(cls):
    """Prepare class-level fixtures shared by every test in the class.

    Stores the sample Azure report path, a date accessor, a manifest
    accessor and the column map, builds the list of report tables with the
    summary tables left out, and caches the first CSV row of the sample
    report on ``cls.row``.
    """
    super().setUpClass()
    cls.test_report_path = './koku/masu/test/data/azure/costreport_a243c6f2-199f-4074-9a2c-40e671cf1584.csv'

    cls.date_accessor = DateAccessor()
    cls.manifest_accessor = ReportManifestDBAccessor()

    with ReportingCommonDBAccessor() as report_common_db:
        cls.column_map = report_common_db.column_map

    # Work on a copy so the module-level table map is never mutated.
    table_map = copy.deepcopy(AZURE_REPORT_TABLE_MAP)
    excluded_tables = (
        'line_item_daily_summary',
        'tags_summary',
        'ocp_on_azure_daily_summary',
        'ocp_on_azure_project_daily_summary',
    )
    for table_key in excluded_tables:
        table_map.pop(table_key, None)
    cls.report_tables = list(table_map.values())

    # Grab a single row of test data to work with
    with open(cls.test_report_path, 'r', encoding='utf-8-sig') as report_file:
        cls.row = next(csv.DictReader(report_file))
def _determine_if_full_summary_update_needed(self, bill):
    """Tell whether summary tables need a full billing-period update.

    A full update is requested only when the manifest is ready for summary
    and, in addition, either the bill was finalized today (UTC date) or the
    bill has no summary creation timestamp yet.

    Args:
        bill: object exposing ``summary_data_creation_datetime`` and
            ``finalized_datetime``.

    Returns:
        bool: True when a full-period summary update should run.
    """
    current_utc = self._date_accessor.today_with_timezone("UTC")
    first_summarized_at = bill.summary_data_creation_datetime
    finalized_at = bill.finalized_datetime

    with ReportManifestDBAccessor() as manifest_accesor:
        processing_done = manifest_accesor.manifest_ready_for_summary(self._manifest.id)

    # Only the calendar date is compared, not the time of day.
    finalized_today = finalized_at is not None and finalized_at.date() == current_utc.date()
    never_summarized = first_summarized_at is None

    # Do a full month update if we just finished processing a finalized
    # bill or we just finished processing a bill for the first time
    return bool(processing_done and (finalized_today or never_summarized))
def setUpClass(cls):
    """Prepare class-level fixtures shared by every test in the class.

    Creates the common and AWS report accessors, an object creator, the
    lists of all tables and of foreign-key tables, a manifest template for
    the current month, and a manifest accessor.
    """
    common_accessor = ReportingCommonDBAccessor()
    cls.common_accessor = common_accessor
    cls.column_map = common_accessor.column_map

    aws_accessor = AWSReportDBAccessor(schema='acct10001', column_map=cls.column_map)
    cls.accessor = aws_accessor
    cls.report_schema = aws_accessor.report_schema
    cls.creator = ReportObjectCreator(
        aws_accessor,
        cls.column_map,
        cls.report_schema.column_types,
    )

    cls.all_tables = list(AWS_CUR_TABLE_MAP.values())
    cls.foreign_key_tables = [
        AWS_CUR_TABLE_MAP[table_key]
        for table_key in ('bill', 'product', 'pricing', 'reservation')
    ]

    # First day of the current month; the time-of-day fields are kept.
    first_of_month = datetime.datetime.utcnow().replace(day=1)
    cls.manifest_dict = dict(
        assembly_id='1234',
        billing_period_start_datetime=first_of_month,
        num_total_files=2,
        provider_id=1,
    )
    cls.manifest_accessor = ReportManifestDBAccessor()
def setUpClass(cls):
    """Prepare class-level fixtures shared by every test in the class.

    Records the sample report paths, a fixed June 2018 billing start, the
    accessors used by the tests and the list of OCP report tables, then
    caches the first CSV row of the usage and storage reports.
    """
    super().setUpClass()

    def first_csv_row(report_path):
        # Only the first record of the report is needed.
        with open(report_path, "r") as report_file:
            return next(csv.DictReader(report_file))

    # These test reports should be replaced with OCP reports once processor is implemented.
    cls.test_report_path = (
        "./koku/masu/test/data/ocp/e6b3701e-1e91-433b-b238-a31e49937558_February-2019-my-ocp-cluster-1.csv"
    )
    cls.storage_report_path = "./koku/masu/test/data/ocp/e6b3701e-1e91-433b-b238-a31e49937558_storage.csv"
    cls.node_report_path = "./koku/masu/test/data/ocp/e6b3701e-1e91-433b-b238-a31e49937558_node_labels.csv"
    cls.unknown_report = "./koku/masu/test/data/test_cur.csv"
    cls.test_report_gzip_path = "./koku/masu/test/data/test_cur.csv.gz"

    cls.date_accessor = DateAccessor()
    today_utc = cls.date_accessor.today_with_timezone("UTC")
    cls.billing_start = today_utc.replace(year=2018, month=6, day=1, hour=0, minute=0, second=0)
    cls.assembly_id = "1234"

    cls.manifest_accessor = ReportManifestDBAccessor()
    cls.accessor = OCPReportDBAccessor(cls.schema)
    cls.report_schema = cls.accessor.report_schema

    cls.report_tables = list(copy.deepcopy(OCP_REPORT_TABLE_MAP).values())

    # Grab a single row of test data to work with
    cls.row = first_csv_row(cls.test_report_path)
    cls.storage_row = first_csv_row(cls.storage_report_path)
def test_get_report_context_for_date_should_download(self, mock_session, mock_manifest, mock_delete, mock_check):
    """Check the report context returned when a download is required.

    ``mock_manifest`` is primed with a two-file manifest for the current
    month and ``mock_check`` is forced to True. The returned context must
    match the assembly id, compression, file list and the id of the
    manifest record looked up afterwards.
    """
    assembly_id = "1234"
    report_keys = ["file1", "file2"]
    current_month = DateAccessor().today().replace(day=1, second=1, microsecond=1)

    downloader = AWSReportDownloader(
        self.mock_task,
        self.fake_customer_name,
        fake_arn(service="iam", generate_account_id=True),
        self.fake_bucket_name,
        provider_uuid=self.aws_provider_uuid,
    )
    billing_period = {"start": current_month.strftime(downloader.manifest_date_format)}
    compression = downloader.report.get("Compression")

    manifest_payload = {
        "assemblyId": assembly_id,
        "Compression": compression,
        "reportKeys": report_keys,
        "billingPeriod": billing_period,
    }
    mock_manifest.return_value = ("", manifest_payload)
    mock_check.return_value = True

    result = downloader.get_report_context_for_date(current_month)

    # The manifest record is fetched after the call; its id is the expected one.
    with ReportManifestDBAccessor() as manifest_accessor:
        manifest_entry = manifest_accessor.get_manifest(assembly_id, self.aws_provider_uuid)
    expected = {
        "manifest_id": manifest_entry.id,
        "assembly_id": assembly_id,
        "compression": compression,
        "files": report_keys,
    }

    self.assertIsInstance(result, dict)
    for field, actual in result.items():
        self.assertIn(field, expected)
        self.assertEqual(actual, expected.get(field))
def _process_manifest_db_record(self, assembly_id, billing_start, num_of_files):
    """Create the manifest record if missing, mark it updated, return its id.

    Args:
        assembly_id: assembly identifier of the manifest.
        billing_start: billing period start stored on a new record.
        num_of_files: total file count stored on a new record.

    Returns:
        The id of the existing or newly added manifest record.
    """
    LOG.info("Inserting/updating manifest in database for assembly_id: %s", assembly_id)

    with ReportManifestDBAccessor() as manifest_accessor:
        record = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)

        if not record:
            msg = f"No manifest entry found in database. Adding for bill period start: {billing_start}"
            LOG.info(log_json(self.request_id, msg, self.context))
            record = manifest_accessor.add(
                assembly_id=assembly_id,
                billing_period_start_datetime=billing_start,
                num_total_files=num_of_files,
                provider_uuid=self._provider_uuid,
                task=self._task.request.id,
            )

        # Existing and freshly added records are both stamped as updated.
        manifest_accessor.mark_manifest_as_updated(record)
        return record.id
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a cost export from the Azure container and copy it to S3.

    The blob metadata is fetched first to obtain its etag and last-modified
    time, then the export is downloaded to the local exports directory and
    copied to the S3 CSV path. If the manifest's S3 CSV area has not been
    cleared yet, files not belonging to this manifest are removed and the
    manifest is marked as cleared.

    Args:
        key (str): The object key identified.
        stored_etag (str): Accepted for interface compatibility; not read
            by this method.
        manifest_id: Id of the manifest the file belongs to.
        start_date: Start date used to build the S3 path prefix.

    Returns:
        (tuple): ``(full_file_path, etag, file_creation_date, [])`` where
            ``full_file_path`` is the local path of the saved file.

    Raises:
        AzureReportDownloaderError: when no cost export exists for ``key``.

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
        etag = blob.etag
        file_creation_date = blob.last_modified
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.request_id, msg, self.context))
        # Chain explicitly so the original lookup failure stays the cause.
        raise AzureReportDownloaderError(msg) from ex

    msg = f"Downloading {key} to {full_file_path}"
    LOG.info(log_json(self.request_id, msg, self.context))
    # The return value is not needed; the file is written to `destination`.
    self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)

    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
    )
    copy_local_report_file_to_s3_bucket(
        self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
    )

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(manifest_id)
    if not manifest_accessor.get_s3_csv_cleared(manifest):
        # Clear stale files once per manifest, then record that it was done.
        remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)
        manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def setUpClass(cls):
    """Prepare class-level fixtures shared by every test in the class.

    Stores the sample Azure report paths, the date and manifest accessors
    and the list of report tables with summary and tag tables left out, and
    caches the first CSV row of the main report with lower-cased keys.
    """
    super().setUpClass()
    cls.test_report_path = "./koku/masu/test/data/azure/costreport_a243c6f2-199f-4074-9a2c-40e671cf1584.csv"
    cls.test_v2_report_path = "./koku/masu/test/data/azure/azure_version_2.csv"
    cls.camelCase_report_path = "./koku/masu/test/data/azure/azure_camelCased.csv"

    cls.date_accessor = DateAccessor()
    cls.manifest_accessor = ReportManifestDBAccessor()

    # Work on a copy so the module-level table map is never mutated.
    table_map = copy.deepcopy(AZURE_REPORT_TABLE_MAP)
    excluded_tables = (
        "line_item_daily_summary",
        "tags_summary",
        "enabled_tag_keys",
        "ocp_on_azure_daily_summary",
        "ocp_on_azure_project_daily_summary",
        "ocp_on_azure_tags_summary",
    )
    for table_key in excluded_tables:
        table_map.pop(table_key, None)
    cls.report_tables = list(table_map.values())

    # Grab a single row of test data to work with
    with open(cls.test_report_path, "r", encoding="utf-8-sig") as report_file:
        first_row = next(csv.DictReader(report_file))
    cls.row = {column.lower(): cell for column, cell in first_row.items()}
def refresh_materialized_views(schema_name, provider_type, manifest_id=None):
    """Refresh the database's materialized views for reporting.

    Args:
        schema_name: schema in which the views are refreshed.
        provider_type: provider type used to pick the set of views; an
            unrecognized type refreshes nothing.
        manifest_id: optional manifest id; when truthy the manifest is
            marked as completed after the refresh.

    Returns:
        None
    """
    materialized_views = ()
    if provider_type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
        materialized_views = AWS_MATERIALIZED_VIEWS
    # Compare by equality: `in (X)` without a trailing comma is not a
    # tuple membership test, it is a substring test against X.
    elif provider_type == Provider.PROVIDER_OCP:
        materialized_views = OCP_MATERIALIZED_VIEWS
    elif provider_type in (Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL):
        materialized_views = AZURE_MATERIALIZED_VIEWS

    with schema_context(schema_name):
        for view in materialized_views:
            table_name = view._meta.db_table
            with connection.cursor() as cursor:
                cursor.execute(f"REFRESH MATERIALIZED VIEW CONCURRENTLY {table_name}")
                LOG.info("Refreshed %s.", table_name)

    if manifest_id:
        # Processing for this manifest should be complete after this step
        with ReportManifestDBAccessor() as manifest_accessor:
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)
            manifest_accessor.mark_manifest_as_completed(manifest)
def test_check_if_manifest_should_be_downloaded_error_processing_manifest(self):
    """Test that a manifest that did not successfully process should be reprocessed."""
    # Leave the manifest half processed: one of two files done.
    with ReportManifestDBAccessor() as manifest_accessor:
        manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)
        manifest.num_processed_files = 1
        manifest.num_total_files = 2
        manifest_accessor.commit()

    with ReportStatsDBAccessor(self.report_name, self.manifest_id) as file_accessor:
        file_accessor.log_last_started_datetime()
        file_accessor.commit()
        file_accessor.log_last_completed_datetime()
        file_accessor.commit()
        # The downloader re-downloads only when the last completion is
        # strictly older than one hour. Use two hours so the result does not
        # depend on the clock advancing between this line and the check.
        completed_datetime = self.date_accessor.today_with_timezone('UTC') - datetime.timedelta(hours=2)
        file_accessor.update(last_completed_datetime=completed_datetime)
        file_accessor.commit()

    result = self.downloader.check_if_manifest_should_be_downloaded(self.assembly_id)
    self.assertTrue(result)
def summarize_manifest(report_meta):
    """
    Kick off manifest summary when all report files have completed line item processing.

    Args:
        report_meta (Dict) - keys: value
                        schema_name: String,
                        manifest_id: Integer,
                        provider_uuid: String,
                        provider_type: String,
                        start: optional, forwarded only together with end,
                        end: optional, forwarded only together with start,

    Returns:
        Celery Async UUID, or None when the manifest is not ready for summary.

    """
    async_id = None
    schema_name = report_meta.get("schema_name")
    manifest_id = report_meta.get("manifest_id")
    provider_uuid = report_meta.get("provider_uuid")
    provider_type = report_meta.get("provider_type")
    start_date = report_meta.get("start")
    end_date = report_meta.get("end")

    with ReportManifestDBAccessor() as manifest_accesor:
        if manifest_accesor.manifest_ready_for_summary(manifest_id):
            # Build a fresh dict so only the known keys are passed on and
            # the caller's argument is left untouched.
            summary_meta = {
                "schema_name": schema_name,
                "provider_type": provider_type,
                "provider_uuid": provider_uuid,
                "manifest_id": manifest_id,
            }
            if start_date and end_date:
                summary_meta["start"] = start_date
                summary_meta["end"] = end_date
            async_id = summarize_reports.delay([summary_meta])
    return async_id
def setUp(self):
    """Create a downloader, a stored manifest and two report status rows.

    The manifest is added for the current month with two total files, and
    one ``CostUsageReportStatus`` row with empty start/completion times is
    made for each of those files.
    """
    super().setUp()
    self.cache_key = self.fake.word()
    self.downloader = ReportDownloaderBase(
        provider_uuid=self.aws_provider_uuid,
        cache_key=self.cache_key,
    )

    today_utc = self.date_accessor.today_with_timezone("UTC")
    self.billing_start = today_utc.replace(day=1)
    self.manifest_dict = dict(
        assembly_id=self.assembly_id,
        billing_period_start_datetime=self.billing_start,
        num_total_files=2,
        provider_uuid=self.aws_provider_uuid,
    )

    with ReportManifestDBAccessor() as manifest_accessor:
        self.manifest = manifest_accessor.add(**self.manifest_dict)
        self.manifest.save()
        self.manifest_id = self.manifest.id

    # One status row per file declared in the manifest.
    for file_number in range(1, 3):
        baker.make(
            CostUsageReportStatus,
            report_name=f"{self.assembly_id}_file_{file_number}.csv.gz",
            last_completed_datetime=None,
            last_started_datetime=None,
            manifest_id=self.manifest_id,
        )
def _delete_line_items_in_range(self, bill_id, scan_start):
    """Delete stale line items for a bill from ``scan_start`` onwards.

    Nothing is deleted when no manifest id is set or when the manifest
    already has processed files.

    Args:
        bill_id: id of the bill whose line items are considered.
        scan_start: lower bound (inclusive) applied to ``usage_start``.

    Returns:
        bool: False when the deletion was skipped, True otherwise.
    """
    gcp_date_filter = {"usage_start__gte": scan_start}

    if not self._manifest_id:
        return False

    with ReportManifestDBAccessor() as manifest_accessor:
        num_processed_files = manifest_accessor.number_of_files_processed(self._manifest_id)
        if num_processed_files != 0:
            # Files of this manifest were already processed; keep their data.
            return False

    with GCPReportDBAccessor(self._schema) as accessor:
        line_item_query = accessor.get_lineitem_query_for_billid(bill_id)
        line_item_query = line_item_query.filter(**gcp_date_filter)
        delete_result = line_item_query.delete()
        # A Django queryset delete() returns (total, per_model_counts). That
        # tuple is always truthy, so test the total instead of the tuple.
        # NOTE(review): assumes the query is a Django queryset - confirm.
        delete_count = delete_result[0] if isinstance(delete_result, tuple) else delete_result
        if delete_count:
            log_statement = (
                f"Deleting data for:\n"
                f" schema_name: {self._schema}\n"
                f" provider_uuid: {self._provider_uuid}\n"
                f" bill ID: {bill_id}\n"
                f" on or after {scan_start}"
            )
            LOG.info(log_statement)
    return True
def setUpClass(cls):
    """Prepare class-level fixtures shared by every test in the class.

    Records the sample report paths, a fixed June 2018 billing start, the
    column map, the accessors and the list of OCP report tables, then
    caches the first CSV row of the usage report on ``cls.row``.
    """
    super().setUpClass()

    # These test reports should be replaced with OCP reports once processor is implemented.
    cls.test_report = './koku/masu/test/data/ocp/e6b3701e-1e91-433b-b238-a31e49937558_February-2019-my-ocp-cluster-1.csv'
    cls.storage_report = './koku/masu/test/data/ocp/e6b3701e-1e91-433b-b238-a31e49937558_storage.csv'
    cls.unknown_report = './koku/masu/test/data/test_cur.csv'
    cls.test_report_gzip = './koku/masu/test/data/test_cur.csv.gz'

    cls.date_accessor = DateAccessor()
    today_utc = cls.date_accessor.today_with_timezone('UTC')
    cls.billing_start = today_utc.replace(year=2018, month=6, day=1, hour=0, minute=0, second=0)
    cls.assembly_id = '1234'

    cls.manifest_accessor = ReportManifestDBAccessor()
    with ReportingCommonDBAccessor() as report_common_db:
        cls.column_map = report_common_db.column_map

    cls.accessor = OCPReportDBAccessor(cls.schema, cls.column_map)
    cls.report_schema = cls.accessor.report_schema

    cls.report_tables = list(copy.deepcopy(OCP_REPORT_TABLE_MAP).values())

    # Grab a single row of test data to work with
    with open(cls.test_report, 'r') as report_file:
        cls.row = next(csv.DictReader(report_file))
def test_get_manifest_context_new_info(self, mock_manifest, mock_delete):
    """Check that manifest details end up in the context and the DB record.

    ``mock_manifest`` returns a manifest carrying operator and cluster
    status details. The test verifies the returned context, the fields
    stored on the manifest record, and that the downloader context gains
    the version only after the call.
    """
    assembly_id = "1234"
    compression = "PLAIN"
    report_keys = ["file1", "file2"]
    version = "5678"
    cluster_id = "4e009161-4f40-42c8-877c-3e59f6baea3d"
    current_month = DateAccessor().today().replace(day=1, second=1, microsecond=1)

    cr_status = {
        "clusterID": "4e009161-4f40-42c8-877c-3e59f6baea3d",
        "clusterVersion": "stable-4.6",
        "api_url": "https://cloud.redhat.com",
        "authentication": {"type": "token"},
        "packaging": {"max_reports_to_store": 30, "max_size_MB": 100},
        "upload": {"ingress_path": "/api/ingress/v1/upload", "upload": False},
        "operator_commit": "a09a5b21e55ce4a07fe31aa560650b538ec6de7c",
        "prometheus": {"error": "fake error"},
        "reports": {
            "report_month": "07",
            "last_hour_queried": "2021-07-28 11:00:00 - 2021-07-28 11:59:59",
        },
        "source": {"sources_path": "/api/sources/v1.0/", "name": "INSERT-SOURCE-NAME"},
    }
    mock_manifest.return_value = {
        "uuid": assembly_id,
        "Compression": compression,
        "reportKeys": report_keys,
        "date": current_month,
        "files": report_keys,
        "version": version,
        "certified": False,
        "cluster_id": cluster_id,
        "cr_status": cr_status,
    }

    # The version must not be in the context before the manifest is read.
    self.assertIsNone(self.ocp_report_downloader.context.get("version"))

    result = self.ocp_report_downloader.get_manifest_context_for_date(current_month)

    self.assertEqual(result.get("assembly_id"), assembly_id)
    self.assertEqual(result.get("compression"), compression)
    self.assertIsNotNone(result.get("files"))

    manifest = ReportManifestDBAccessor().get_manifest_by_id(result.get("manifest_id"))
    expected_errors = {"prometheus_error": "fake error"}

    self.assertEqual(manifest.operator_version, version)
    self.assertEqual(manifest.operator_certified, False)
    self.assertEqual(manifest.operator_airgapped, True)
    self.assertEqual(manifest.cluster_channel, "stable-4.6")
    self.assertEqual(manifest.cluster_id, "4e009161-4f40-42c8-877c-3e59f6baea3d")
    self.assertEqual(manifest.operator_errors, expected_errors)

    self.assertEqual(self.ocp_report_downloader.context.get("version"), version)
class ReportManifestDBAccessorTest(IamTestCase):
    """Exercise the ReportManifestDBAccessor against manifest records."""

    def setUp(self):
        """Prepare a manifest template and a shared accessor for each test."""
        super().setUp()
        self.schema = self.schema_name
        self.billing_start = DateAccessor().today_with_timezone("UTC").replace(day=1)
        self.manifest_dict = dict(
            assembly_id="1234",
            billing_period_start_datetime=self.billing_start,
            num_total_files=2,
            provider_uuid=self.provider_uuid,
        )
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Remove every manifest left behind by the test."""
        super().tearDown()
        with schema_context(self.schema):
            for leftover in self.manifest_accessor._get_db_obj_query().all():
                self.manifest_accessor.delete(leftover)

    def test_initializer(self):
        """A new accessor exposes a table."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """Looking up by assembly id and provider returns the added manifest."""
        wanted_assembly_id = self.manifest_dict.get("assembly_id")
        wanted_provider_uuid = self.manifest_dict.get("provider_uuid")

        with schema_context(self.schema):
            added_manifest = self.manifest_accessor.add(**self.manifest_dict)
            manifest = self.manifest_accessor.get_manifest(wanted_assembly_id, wanted_provider_uuid)

        self.assertIsNotNone(manifest)
        self.assertEqual(added_manifest, manifest)
        self.assertEqual(manifest.assembly_id, wanted_assembly_id)
        self.assertEqual(manifest.provider_id, wanted_provider_uuid)
        self.assertEqual(manifest.num_total_files, self.manifest_dict.get("num_total_files"))

    def test_get_manifest_by_id(self):
        """Looking up by id returns the added manifest."""
        with schema_context(self.schema):
            added_manifest = self.manifest_accessor.add(**self.manifest_dict)
            fetched = self.manifest_accessor.get_manifest_by_id(added_manifest.id)

        self.assertIsNotNone(fetched)
        self.assertEqual(added_manifest, fetched)

    def test_mark_manifest_as_updated(self):
        """Marking a manifest updated stamps it with a later time."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            before_update = DateAccessor().today_with_timezone("UTC")
            self.manifest_accessor.mark_manifest_as_updated(manifest)
            self.assertGreater(manifest.manifest_updated_datetime, before_update)

    def test_mark_manifest_as_updated_none_manifest(self):
        """Marking a None manifest as updated must not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_updated(None)
        except Exception as err:
            self.fail(f"Test failed with error: {err}")

    def test_mark_manifest_as_completed_none_manifest(self):
        """Marking a None manifest as completed must not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_completed(None)
        except Exception as err:
            self.fail(f"Test failed with error: {err}")

    def test_get_manifest_list_for_provider_and_bill_date(self):
        """Every manifest added for a provider and bill date is listed."""
        bill_date = self.manifest_dict["billing_period_start_datetime"].date()
        manifest_dict = copy.deepcopy(self.manifest_dict)

        # Add one manifest at a time; the listing grows by one each round.
        # The first assembly id equals the one already in the template.
        for expected_total, assembly_id in enumerate(("1234", "2345", "3456"), start=1):
            manifest_dict["assembly_id"] = assembly_id
            self.manifest_accessor.add(**manifest_dict)
            result = self.manifest_accessor.get_manifest_list_for_provider_and_bill_date(
                self.provider_uuid, bill_date
            )
            self.assertEqual(len(result), expected_total)

    def test_get_last_seen_manifest_ids(self):
        """Test that get_last_seen_manifest_ids returns the appropriate assembly_ids."""
        # test that the most recently seen manifests that haven't been processed are returned
        manifest_dict2 = {
            "assembly_id": "5678",
            "billing_period_start_datetime": self.billing_start,
            "num_total_files": 1,
            "provider_uuid": "00000000-0000-0000-0000-000000000002",
        }
        manifest = self.manifest_accessor.add(**self.manifest_dict)
        manifest2 = self.manifest_accessor.add(**manifest_dict2)
        seen_ids = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen_ids, [manifest.assembly_id, manifest2.assembly_id])

        # test that when the manifest's files have been processed - it is no longer returned
        manifest2_helper = ManifestCreationHelper(
            manifest2.id,
            manifest_dict2.get("num_total_files"),
            manifest_dict2.get("assembly_id"),
        )
        manifest2_helper.generate_test_report_files()
        manifest2_helper.process_all_files()
        seen_ids = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen_ids, [manifest.assembly_id])

        # test that of two manifests with the same provider_ids - that only the most recently
        # seen is returned
        manifest_dict3 = {
            "assembly_id": "91011",
            "billing_period_start_datetime": self.billing_start,
            "num_total_files": 1,
            "provider_uuid": self.provider_uuid,
        }
        manifest3 = self.manifest_accessor.add(**manifest_dict3)
        seen_ids = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen_ids, [manifest3.assembly_id])

        # test that manifests for a different billing month are not returned
        two_months_back = self.billing_start + relativedelta(months=-2)
        manifest3.billing_period_start_datetime = two_months_back
        manifest3.save()
        seen_ids = self.manifest_accessor.get_last_seen_manifest_ids(self.billing_start)
        self.assertEqual(seen_ids, [manifest.assembly_id])

    def test_is_last_completed_datetime_null(self):
        """Test is last completed datetime is null."""
        manifest_id = 123456789

        # No manifest and no status rows exist yet for this id.
        self.assertTrue(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))

        # A status row without a completion time still counts as null.
        baker.make(CostUsageReportManifest, id=manifest_id)
        baker.make(CostUsageReportStatus, manifest_id=manifest_id, last_completed_datetime=None)
        self.assertTrue(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))

        # Once a completion time is stored the answer flips.
        status_rows = CostUsageReportStatus.objects.filter(manifest_id=manifest_id)
        status_rows.update(last_completed_datetime=FAKE.date())
        self.assertFalse(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))
def _process_report_file(schema_name, provider, report_dict):
    """
    Process a single report file and record its progress.

    The start time is logged on the report's stats record, the report is
    processed, and the completion time is logged. On a processor error the
    start time is cleared and the error is re-raised. Afterwards the
    manifest is marked as updated and the provider is marked as set up;
    processed temporary files are removed only if setup was already
    complete.

    Args:
        schema_name (String) db schema name
        provider    (String) provider type
        report_dict (dict) The report data dict from previous task; keys
            read here are start_date, file, compression, manifest_id and
            provider_uuid

    Returns:
        True

    """
    report_path = report_dict.get("file")
    compression = report_dict.get("compression")
    manifest_id = report_dict.get("manifest_id")
    provider_uuid = report_dict.get("provider_uuid")
    start_date = report_dict.get("start_date")

    LOG.info(
        f"Processing Report:\n"
        f" schema_name: {schema_name}\n"
        f" provider: {provider}\n"
        f" provider_uuid: {provider_uuid}\n"
        f" file: {report_path}\n"
        f" compression: {compression}\n"
        f" start_date: {start_date}"
    )

    mem = psutil.virtual_memory()
    LOG.info(f"Avaiable memory: {mem.free} bytes ({mem.percent}%)")

    # The stats record is keyed by the bare file name, not the full path.
    file_name = report_path.split("/")[-1]

    with ReportStatsDBAccessor(file_name, manifest_id) as stats_recorder:
        stats_recorder.log_last_started_datetime()

    try:
        processor = ReportProcessor(
            schema_name=schema_name,
            report_path=report_path,
            compression=compression,
            provider=provider,
            provider_uuid=provider_uuid,
            manifest_id=manifest_id,
        )
        processor.process()
    except (ReportProcessorError, ReportProcessorDBError) as processing_error:
        # Undo the start marker so the file no longer looks in progress.
        with ReportStatsDBAccessor(file_name, manifest_id) as stats_recorder:
            stats_recorder.clear_last_started_datetime()
        raise processing_error

    with ReportStatsDBAccessor(file_name, manifest_id) as stats_recorder:
        stats_recorder.log_last_completed_datetime()

    with ReportManifestDBAccessor() as manifest_accesor:
        manifest = manifest_accesor.get_manifest_by_id(manifest_id)
        if not manifest:
            LOG.error("Unable to find manifest for ID: %s, file %s", manifest_id, file_name)
        else:
            manifest_accesor.mark_manifest_as_updated(manifest)

    with ProviderDBAccessor(provider_uuid=provider_uuid) as provider_accessor:
        if provider_accessor.get_setup_complete():
            files = processor.remove_processed_files(path.dirname(report_path))
            LOG.info("Temporary files removed: %s", str(files))
        provider_accessor.setup_complete()

    return True
class ReportStatsDBAccessorTest(MasuTestCase):
    """Test Cases for the ReportStatsDBAccessor object."""

    def setUp(self):
        """Set up the test class."""
        super().setUp()
        billing_start = datetime.utcnow().replace(day=1)
        manifest_dict = {
            "assembly_id": "1234",
            "billing_period_start_datetime": billing_start,
            "num_total_files": 2,
            "provider_uuid": self.aws_provider_uuid,
        }
        self.manifest_accessor = ReportManifestDBAccessor()
        manifest = self.manifest_accessor.add(**manifest_dict)
        self.manifest_id = manifest.id

    def _assert_datetime_parts(self, value, year, month, day, hour, minute, second):
        """Assert each date and time component of ``value`` individually."""
        self.assertEqual(value.year, year)
        self.assertEqual(value.month, month)
        self.assertEqual(value.day, day)
        self.assertEqual(value.hour, hour)
        self.assertEqual(value.minute, minute)
        self.assertEqual(value.second, second)

    def test_initializer(self):
        """Test Initializer."""
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        self.assertIsNotNone(saver._obj)

    def test_initializer_preexisting_report(self):
        """Test getting a new accessor stats on a preexisting report."""
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        saver.update(
            cursor_position=33,
            last_completed_datetime="2011-1-1 11:11:11",
            last_started_datetime="2022-2-2 22:22:22",
            etag="myetag",
        )
        self.assertIsNotNone(saver._obj)

        # Get another accessor for the same report and verify we get back the right information.
        saver2 = ReportStatsDBAccessor("myreport", self.manifest_id)
        last_completed = saver2.get_last_completed_datetime()
        self._assert_datetime_parts(last_completed, 2011, 1, 1, 11, 11, 11)
        # Read the etag through the second accessor as well; checking the
        # first one would not verify what the new accessor sees.
        self.assertEqual(saver2.get_etag(), "myetag")

    def test_add_remove(self):
        """Test basic add/remove logic."""
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        self.assertTrue(saver.does_db_entry_exist())
        returned_obj = saver._get_db_obj_query()
        self.assertEqual(returned_obj.first().report_name, "myreport")

        saver.delete()
        returned_obj = saver._get_db_obj_query()
        self.assertIsNone(returned_obj.first())

    def test_update(self):
        """Test updating an existing row."""
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        returned_obj = saver._get_db_obj_query()
        self.assertEqual(returned_obj.first().report_name, "myreport")

        saver.update(
            cursor_position=33,
            last_completed_datetime=parser.parse("2011-1-1 11:11:11"),
            last_started_datetime=parser.parse("2022-2-2 22:22:22"),
            etag="myetag",
        )

        last_completed = saver.get_last_completed_datetime()
        self._assert_datetime_parts(last_completed, 2011, 1, 1, 11, 11, 11)

        last_started = saver.get_last_started_datetime()
        self._assert_datetime_parts(last_started, 2022, 2, 2, 22, 22, 22)

        self.assertEqual(saver.get_etag(), "myetag")

        saver.delete()
        returned_obj = saver._get_db_obj_query()
        self.assertIsNone(returned_obj.first())

    def test_log_last_started_datetime(self):
        """Test convenience function for last started processing time."""
        initial_count = CostUsageReportStatus.objects.count()
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        saver.log_last_started_datetime()
        self.assertIsNotNone(saver.get_last_started_datetime())
        saver.delete()
        # Deleting the record must bring the row count back to the start.
        self.assertEqual(CostUsageReportStatus.objects.count(), initial_count)

    def test_log_last_completed_datetime(self):
        """Test convenience function for last completed processing time."""
        initial_count = CostUsageReportStatus.objects.count()
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        saver.log_last_completed_datetime()
        self.assertIsNotNone(saver.get_last_completed_datetime())
        saver.delete()
        # Deleting the record must bring the row count back to the start.
        self.assertEqual(CostUsageReportStatus.objects.count(), initial_count)

    def test_clear_last_started_date(self):
        """Test convenience function for clear last started date."""
        saver = ReportStatsDBAccessor("myreport", self.manifest_id)
        saver.log_last_started_datetime()
        self.assertIsNotNone(saver.get_last_started_datetime())
        saver.clear_last_started_datetime()
        self.assertIsNone(saver.get_last_started_datetime())
def convert_to_parquet(self):  # noqa: C901
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a providers data.

    NOTE(review): the original docstring states that the surrounding task
    retries up to 10 times with exponential backoff starting at 10 seconds,
    to ride out temporary AWS S3 connectivity issues. That retry policy is
    not configured in this method -- confirm where it is defined.

    Before converting, Parquet output left in S3 by other manifests is
    removed once per manifest for providers other than OCP, GCP and
    GCP-local, so that reports are not duplicated.

    Returns:
        (str, list) The base Parquet filename of the last converted file and
        the accumulated daily data frames. When any required path is
        missing, ``("", pd.DataFrame())`` is returned instead.

    """
    parquet_base_filename = ""

    if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet."
            f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
        )
        LOG.error(log_json(self.tracing_id, msg, self.error_context))
        return "", pd.DataFrame()

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
        Provider.PROVIDER_OCP,
        Provider.PROVIDER_GCP,
        Provider.PROVIDER_GCP_LOCAL,
    ):
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_daily_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_ocp_on_cloud_path_s3, self.manifest_id, self.error_context
        )
        # Record the cleanup so it is not repeated for this manifest.
        manifest_accessor.mark_s3_parquet_cleared(manifest)

    failed_conversion = []
    daily_data_frames = []
    for csv_filename in self.file_list:
        if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
            msg = f"Could not establish report type for {csv_filename}."
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
            failed_conversion.append(csv_filename)
            continue

        parquet_base_filename, daily_frame, success = self.convert_csv_to_parquet(csv_filename)
        daily_data_frames.extend(daily_frame)
        # Compare by equality: ``x not in (CONST)`` is a membership test
        # against CONST itself (the parentheses do not make a tuple).
        if self.provider_type != Provider.PROVIDER_AZURE:
            self.create_daily_parquet(parquet_base_filename, daily_frame)
        if not success:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warning(log_json(self.tracing_id, msg, self.error_context))

    return parquet_base_filename, daily_data_frames
def summarize_manifest(report_meta, manifest_uuid):
    """
    Kick off manifest summary when all report files have completed line item processing.

    Args:
        report_meta (Dict) - keys read here:
                             schema_name, manifest_id, provider_uuid,
                             provider_type, start, end, cr_status
        manifest_uuid (string) - The id associated with the payload manifest

    Returns:
        The result of ``apply_async`` for the queued summary task, or None
        when the manifest is not ready for summary or the payload carries
        the invalid "0001-01-01" start/end marker.

    """
    manifest_id = report_meta.get("manifest_id")
    provider_uuid = report_meta.get("provider_uuid")
    start_date = report_meta.get("start")
    end_date = report_meta.get("end")
    log_context = {"account": report_meta.get("schema_name"), "provider_uuid": str(provider_uuid)}

    with ReportManifestDBAccessor() as manifest_accesor:
        if not manifest_accesor.manifest_ready_for_summary(manifest_id):
            return None

        summary_meta = {
            "schema_name": report_meta.get("schema_name"),
            "provider_type": report_meta.get("provider_type"),
            "provider_uuid": provider_uuid,
            "manifest_id": manifest_id,
        }

        if start_date and end_date:
            if "0001-01-01 00:00:00+00:00" in [str(start_date), str(end_date)]:
                cr_status = report_meta.get("cr_status", {})
                log_context["cluster_id"] = cr_status.get("clusterID", "no-cluster-id")
                data_collection_message = cr_status.get("reports", {}).get("data_collection_message", "")
                if data_collection_message:
                    # remove potentially sensitive info from the error message
                    msg = f'data collection error [operator]: {re.sub("{[^}]+}", "{***}", data_collection_message)}'
                    cr_status["reports"]["data_collection_message"] = msg
                    # The full CR status is logged below, but the alert is limited to just the query.
                    # The full manifest can be checked to get the full error.
                    LOG.error(msg)
                    LOG.info(log_json(manifest_uuid, msg, log_context))
                LOG.info(
                    log_json(manifest_uuid, f"CR Status for invalid manifest: {json.dumps(cr_status)}", log_context)
                )
                # an invalid payload will fail to summarize, so stop before trying
                return None

            LOG.info(
                log_json(
                    manifest_uuid,
                    f"Summarizing OCP reports from {str(start_date)}-{str(end_date)} for provider: {provider_uuid}",
                    log_context,
                )
            )
            summary_meta.update(start=start_date, end=end_date, manifest_uuid=manifest_uuid)

        return summarize_reports.s([summary_meta], OCP_QUEUE).apply_async(queue=OCP_QUEUE)
class OCPCloudReportSummaryUpdaterTest(MasuTestCase):
    """Test cases for the OCPCloudReportSummaryUpdater class.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation had been collapsed; confirm loop/``with`` boundaries
    against the original file.
    """

    @classmethod
    def setUpClass(cls):
        """Set up the test class with required objects."""
        super().setUpClass()
        cls.date_accessor = DateAccessor()

    def setUp(self):
        """Create the accessors shared by the tests."""
        super().setUp()
        self.column_map = ReportingCommonDBAccessor().column_map
        self.accessor = AWSReportDBAccessor(schema='acct10001',
                                            column_map=self.column_map)
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Return the database to a pre-test state.

        Rolls back the shared AWS session, then deletes every row from the
        AWS and OCP report tables and every manifest.
        """
        self.accessor._session.rollback()

        aws_tables = list(AWS_CUR_TABLE_MAP.values())
        with AWSReportDBAccessor(self.test_schema,
                                 self.column_map) as aws_accessor:
            aws_accessor._session.rollback()
            for table_name in aws_tables:
                tables = aws_accessor._get_db_obj_query(table_name).all()
                for table in tables:
                    aws_accessor._session.delete(table)
                aws_accessor.commit()

        ocp_tables = list(OCP_REPORT_TABLE_MAP.values())
        with OCPReportDBAccessor(self.test_schema,
                                 self.column_map) as ocp_accessor:
            for table_name in ocp_tables:
                tables = ocp_accessor._get_db_obj_query(table_name).all()
                for table in tables:
                    ocp_accessor._session.delete(table)
                ocp_accessor.commit()

        manifests = self.manifest_accessor._get_db_obj_query().all()
        for manifest in manifests:
            self.manifest_accessor.delete(manifest)
        self.manifest_accessor.commit()

    def _generate_ocp_on_aws_data(self):
        """Create AWS and OCP line items that share one resource id.

        Helper (not a test): builds AWS cost entries and OCP usage line
        items for today and one month ago, all using resource id
        'i-12345', then populates the daily line-item tables of both
        accessors.
        """
        creator = ReportObjectCreator(self.accessor, self.column_map,
                                      self.accessor.report_schema.column_types)

        bill_ids = []

        today = DateAccessor().today_with_timezone('UTC')
        last_month = today - relativedelta.relativedelta(months=1)
        # The same resource id is used on the AWS and the OCP side.
        resource_id = 'i-12345'

        for cost_entry_date in (today, last_month):
            bill = creator.create_cost_entry_bill(cost_entry_date)
            bill_ids.append(str(bill.id))
            cost_entry = creator.create_cost_entry(bill, cost_entry_date)
            product = creator.create_cost_entry_product('Compute Instance')
            pricing = creator.create_cost_entry_pricing()
            reservation = creator.create_cost_entry_reservation()
            creator.create_cost_entry_line_item(bill,
                                                cost_entry,
                                                product,
                                                pricing,
                                                reservation,
                                                resource_id=resource_id)

        self.accessor.populate_line_item_daily_table(last_month, today,
                                                     bill_ids)

        with OCPReportDBAccessor(self.test_schema,
                                 self.column_map) as ocp_accessor:
            cluster_id = self.ocp_provider_resource_name
            with ProviderDBAccessor(provider_uuid=self.ocp_test_provider_uuid
                                    ) as provider_access:
                provider_id = provider_access.get_provider().id

            for cost_entry_date in (today, last_month):
                period = creator.create_ocp_report_period(
                    cost_entry_date,
                    provider_id=provider_id,
                    cluster_id=cluster_id)
                report = creator.create_ocp_report(period, cost_entry_date)
                creator.create_ocp_usage_line_item(period,
                                                   report,
                                                   resource_id=resource_id)
            # cluster_id is re-read through the provider lookup helper
            # before the daily table is populated.
            cluster_id = get_cluster_id_from_provider(
                self.ocp_test_provider_uuid)
            ocp_accessor.populate_line_item_daily_table(
                last_month, today, cluster_id)

    def test_get_infra_db_key_for_provider_type(self):
        """Test db_key private method for OCP-on-AWS infrastructure map."""
        with ProviderDBAccessor(
                self.ocp_test_provider_uuid) as provider_accessor:
            provider = provider_accessor.get_provider()
        updater = OCPCloudReportSummaryUpdater(schema='acct10001',
                                               provider=provider,
                                               manifest=None)
        self.assertEqual(updater._get_infra_db_key_for_provider_type('AWS'),
                         'aws_uuid')
        self.assertEqual(
            updater._get_infra_db_key_for_provider_type('AWS-local'),
            'aws_uuid')
        self.assertEqual(updater._get_infra_db_key_for_provider_type('OCP'),
                         'ocp_uuid')
        # Unknown provider types map to None.
        self.assertEqual(updater._get_infra_db_key_for_provider_type('WRONG'),
                         None)

    # Patches are applied bottom-up, so the mock arguments arrive in the
    # reverse of the decorator order.
    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.AWSReportDBAccessor.populate_ocp_on_aws_cost_daily_summary'
    )
    @patch(
        'masu.database.ocp_report_db_accessor.OCPReportDBAccessor.populate_cost_summary_table'
    )
    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.OCPCloudReportSummaryUpdater._get_ocp_cluster_id_for_provider'
    )
    def test_update_summary_tables_with_ocp_provider(self, mock_utility,
                                                     mock_ocp,
                                                     mock_ocp_on_aws):
        """Test that summary tables are properly run for an OCP provider."""
        fake_cluster = 'my-ocp-cluster'
        mock_utility.return_value = fake_cluster
        start_date = self.date_accessor.today_with_timezone('UTC')
        end_date = start_date + datetime.timedelta(days=1)
        start_date_str = start_date.strftime('%Y-%m-%d')
        end_date_str = end_date.strftime('%Y-%m-%d')
        with ProviderDBAccessor(
                self.ocp_test_provider_uuid) as provider_accessor:
            provider = provider_accessor.get_provider()
        updater = OCPCloudReportSummaryUpdater(schema='acct10001',
                                               provider=provider,
                                               manifest=None)
        updater.update_summary_tables(start_date_str, end_date_str)
        # For an OCP provider the OCP-on-AWS summary receives an empty
        # bill id list.
        mock_ocp_on_aws.assert_called_with(start_date_str, end_date_str,
                                           fake_cluster, [])
        mock_ocp.assert_called_with(fake_cluster, start_date_str,
                                    end_date_str)

    # Patches are applied bottom-up, so the mock arguments arrive in the
    # reverse of the decorator order.
    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.AWSReportDBAccessor.populate_ocp_on_aws_cost_daily_summary'
    )
    @patch(
        'masu.database.ocp_report_db_accessor.OCPReportDBAccessor.populate_cost_summary_table'
    )
    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.get_bills_from_provider')
    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.OCPCloudReportSummaryUpdater._get_ocp_cluster_id_for_provider'
    )
    def test_update_summary_tables_with_aws_provider(self,
                                                     mock_cluster_id_utility,
                                                     mock_utility, mock_ocp,
                                                     mock_ocp_on_aws):
        """Test that summary tables are properly run for an AWS provider."""
        fake_cluster_id = 'my-ocp-cluster'
        mock_cluster_id_utility.return_value = fake_cluster_id
        # Two fake bills whose ids are expected to be passed on as strings.
        fake_bills = [Mock(), Mock()]
        fake_bills[0].id = 1
        fake_bills[1].id = 2
        bill_ids = [str(bill.id) for bill in fake_bills]
        mock_utility.return_value = fake_bills
        start_date = self.date_accessor.today_with_timezone('UTC')
        end_date = start_date + datetime.timedelta(days=1)
        start_date_str = start_date.strftime('%Y-%m-%d')
        end_date_str = end_date.strftime('%Y-%m-%d')
        with ProviderDBAccessor(
                self.aws_test_provider_uuid) as provider_accessor:
            provider = provider_accessor.get_provider()
        updater = OCPCloudReportSummaryUpdater(schema='acct10001',
                                               provider=provider,
                                               manifest=None)
        updater.update_summary_tables(start_date_str, end_date_str)
        mock_ocp_on_aws.assert_called_with(start_date_str, end_date_str,
                                           fake_cluster_id, bill_ids)
        mock_ocp.assert_called_with(fake_cluster_id, start_date_str,
                                    end_date_str)

    @patch(
        'masu.processor.ocp.ocp_cloud_summary_updater.AWSReportDBAccessor.populate_ocp_on_aws_cost_daily_summary'
    )
    @patch(
        'masu.database.ocp_report_db_accessor.OCPReportDBAccessor.populate_cost_summary_table'
    )
    def test_update_summary_tables_no_ocp_on_aws(self, mock_ocp,
                                                 mock_ocp_on_aws):
        """Test that summary tables do not run when OCP-on-AWS does not exist."""
        test_provider_list = [
            self.aws_test_provider_uuid, self.ocp_test_provider_uuid
        ]

        # Exercise the updater once per provider type.
        for provider_uuid in test_provider_list:
            start_date = self.date_accessor.today_with_timezone('UTC')
            end_date = start_date + datetime.timedelta(days=1)
            start_date_str = start_date.strftime('%Y-%m-%d')
            end_date_str = end_date.strftime('%Y-%m-%d')

            with ProviderDBAccessor(provider_uuid) as provider_accessor:
                provider = provider_accessor.get_provider()

            updater = OCPCloudReportSummaryUpdater(schema='acct10001',
                                                   provider=provider,
                                                   manifest=None)

            updater.update_summary_tables(start_date_str, end_date_str)
            # The OCP cost summary still runs; the OCP-on-AWS one must not.
            mock_ocp.assert_called()
            mock_ocp_on_aws.assert_not_called()

    def test_update_summary_tables(self):
        """Test that summary tables are updated correctly."""
        self._generate_ocp_on_aws_data()

        start_date = self.date_accessor.today_with_timezone('UTC')
        end_date = start_date + datetime.timedelta(days=1)
        start_date_str = start_date.strftime('%Y-%m-%d')
        end_date_str = end_date.strftime('%Y-%m-%d')
        with ProviderDBAccessor(
                self.ocp_test_provider_uuid) as provider_accessor:
            provider = provider_accessor.get_provider()
        updater = OCPCloudReportSummaryUpdater(schema='acct10001',
                                               provider=provider,
                                               manifest=None)

        with AWSReportDBAccessor(self.test_schema,
                                 self.column_map) as aws_accessor:
            summary_table_name = AWS_CUR_TABLE_MAP['ocp_on_aws_daily_summary']
            query = aws_accessor._get_db_obj_query(summary_table_name)
            # The row count must change once the updater has run.
            initial_count = query.count()

            updater.update_summary_tables(start_date_str, end_date_str)

            self.assertNotEqual(query.count(), initial_count)
class ReportManifestDBAccessorTest(IamTestCase):
    """Exercise ReportManifestDBAccessor against a provider-uuid keyed manifest."""

    def setUp(self):
        """Prepare the manifest fields and the accessor under test."""
        super().setUp()
        self.schema = self.schema_name
        today = DateAccessor().today_with_timezone('UTC')
        self.manifest_dict = {
            'assembly_id': '1234',
            # Billing periods start on the first day of the month.
            'billing_period_start_datetime': today.replace(day=1),
            'num_total_files': 2,
            'provider_uuid': self.provider_uuid,
        }
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Delete every manifest created during the test."""
        super().tearDown()
        with schema_context(self.schema):
            for leftover in self.manifest_accessor._get_db_obj_query().all():
                self.manifest_accessor.delete(leftover)

    def test_initializer(self):
        """A fresh accessor exposes its backing table."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """Looking up by assembly id and provider returns the added manifest."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)

            assembly_id = self.manifest_dict.get('assembly_id')
            provider_uuid = self.manifest_dict.get('provider_uuid')
            found = self.manifest_accessor.get_manifest(
                assembly_id, provider_uuid)

            self.assertIsNotNone(found)
            self.assertEqual(created, found)
            self.assertEqual(found.assembly_id, assembly_id)
            self.assertEqual(found.provider_id, provider_uuid)
            self.assertEqual(found.num_total_files,
                             self.manifest_dict.get('num_total_files'))

    def test_get_manifest_by_id(self):
        """Looking up by primary key returns the added manifest."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            found = self.manifest_accessor.get_manifest_by_id(created.id)

            self.assertIsNotNone(found)
            self.assertEqual(created, found)

    def test_mark_manifest_as_updated(self):
        """Marking a manifest updated stamps a time later than 'now'."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            before = DateAccessor().today_with_timezone('UTC')
            self.manifest_accessor.mark_manifest_as_updated(manifest)
            self.assertGreater(manifest.manifest_updated_datetime, before)

    def test_mark_manifest_as_completed(self):
        """Marking a manifest completed stamps a time later than 'now'."""
        manifest = self.manifest_accessor.add(**self.manifest_dict)
        before = DateAccessor().today_with_timezone('UTC')
        self.manifest_accessor.mark_manifest_as_completed(manifest)
        self.assertGreater(manifest.manifest_completed_datetime, before)

    def test_get_manifest_list_for_provider_and_bill_date(self):
        """Every manifest added for a provider and bill date is listed."""
        bill_date = self.manifest_dict['billing_period_start_datetime'].date()
        manifest_dict = copy.deepcopy(self.manifest_dict)

        # Add one manifest per assembly id and check that the listing
        # grows by one each time.
        assembly_ids = (manifest_dict['assembly_id'], '2345', '3456')
        for expected_total, assembly_id in enumerate(assembly_ids, start=1):
            manifest_dict['assembly_id'] = assembly_id
            self.manifest_accessor.add(**manifest_dict)
            result = self.manifest_accessor.get_manifest_list_for_provider_and_bill_date(
                self.provider_uuid, bill_date)
            self.assertEqual(len(result), expected_total)

    def test_get_last_report_completed_datetime(self):
        """The latest completion time across a manifest's reports is returned."""
        manifest = self.manifest_accessor.add(**self.manifest_dict)
        earlier_time = DateAccessor().today_with_timezone('UTC')
        later_time = earlier_time + datetime.timedelta(hours=1)

        completions = (
            ('earlier_report', earlier_time),
            ('later_report', later_time),
        )
        for report_name, completed_at in completions:
            ReportStatsDBAccessor(report_name, manifest.id).update(
                last_completed_datetime=completed_at)

        result = self.manifest_accessor.get_last_report_completed_datetime(
            manifest.id)
        self.assertEqual(result, later_time)
def test_initializer(self):
    """A freshly constructed accessor exposes its backing table."""
    self.assertIsNotNone(ReportManifestDBAccessor()._table)
def test_clean_volume(self, mock_date, mock_config):
    """Check that clean_volume removes only the files it is meant to remove."""
    # Register a manifest whose assembly id names one of the files below.
    mock_date.return_value = ["2020-02-01"]
    manifest_dict = {
        "assembly_id": "1234",
        "billing_period_start_datetime": "2020-02-01",
        "num_total_files": 2,
        "provider_uuid": self.aws_provider_uuid,
    }
    manifest = ReportManifestDBAccessor().add(**manifest_dict)

    with tempfile.TemporaryDirectory() as tmpdirname:
        mock_config.PVC_DIR = tmpdirname
        mock_config.VOLUME_FILE_RETENTION = 60 * 60 * 24

        # One file matching the manifest's assembly id, two that do not.
        old_matching_file = os.path.join(tmpdirname, "%s.csv" % manifest.assembly_id)
        new_no_match_file = os.path.join(tmpdirname, "newfile.csv")
        old_no_match_file = os.path.join(tmpdirname, "oldfile.csv")
        for file_path in (old_matching_file, new_no_match_file, old_no_match_file):
            open(file_path, "a").close()
            self.assertTrue(os.path.exists(file_path))

        # Age the two "old" files to twice the retention period.
        aged = DateHelper().now - timedelta(seconds=mock_config.VOLUME_FILE_RETENTION * 2)
        aged_stamp = aged.timestamp()
        for stale_path in (old_matching_file, old_no_match_file):
            os.utime(stale_path, (aged_stamp, aged_stamp))

        # First pass: the manifest still has unprocessed files.
        tasks.clean_volume()

        # The file tied to the manifest and the recent file survive;
        # the unrelated old file is gone.
        self.assertTrue(os.path.exists(old_matching_file))
        self.assertTrue(os.path.exists(new_no_match_file))
        self.assertFalse(os.path.exists(old_no_match_file))

        # Mark every file of the manifest as processed, then clean again.
        total_files = manifest_dict.get("num_total_files")
        manifest.num_processed_files = total_files
        manifest_helper = ManifestCreationHelper(
            manifest.id, manifest_dict.get("num_total_files"), manifest_dict.get("assembly_id")
        )
        manifest_helper.generate_test_report_files()
        manifest_helper.process_all_files()
        manifest.save()
        tasks.clean_volume()

        # The manifest's file is now removed from the volume as well.
        self.assertFalse(os.path.exists(old_matching_file))
        self.assertTrue(os.path.exists(new_no_match_file))

    # The temporary directory itself is gone once the context exits.
    self.assertFalse(os.path.exists(tmpdirname))

    # Run once more with nothing to find, for coverage.
    tasks.clean_volume()
class ReportManifestDBAccessorTest(IamTestCase):
    """Exercise ReportManifestDBAccessor against a provider-id keyed manifest."""

    def setUp(self):
        """Prepare the manifest fields and the accessor under test."""
        super().setUp()
        self.schema = self.schema_name
        today = DateAccessor().today_with_timezone('UTC')
        self.manifest_dict = {
            'assembly_id': '1234',
            # Billing periods start on the first day of the month.
            'billing_period_start_datetime': today.replace(day=1),
            'num_total_files': 2,
            'provider_id': 1,
        }
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Delete every manifest created during the test."""
        super().tearDown()
        with schema_context(self.schema):
            for leftover in self.manifest_accessor._get_db_obj_query().all():
                self.manifest_accessor.delete(leftover)

    def test_initializer(self):
        """A fresh accessor exposes its backing table."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """Looking up by assembly id and provider returns the added manifest."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)

            assembly_id = self.manifest_dict.get('assembly_id')
            provider_id = self.manifest_dict.get('provider_id')
            found = self.manifest_accessor.get_manifest(
                assembly_id, provider_id)

            self.assertIsNotNone(found)
            self.assertEqual(created, found)
            self.assertEqual(found.assembly_id, assembly_id)
            self.assertEqual(found.provider_id, provider_id)
            self.assertEqual(found.num_total_files,
                             self.manifest_dict.get('num_total_files'))

    def test_get_manifest_by_id(self):
        """Looking up by primary key returns the added manifest."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            found = self.manifest_accessor.get_manifest_by_id(created.id)

            self.assertIsNotNone(found)
            self.assertEqual(created, found)

    def test_mark_manifest_as_updated(self):
        """Marking a manifest updated stamps a time later than 'now'."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            before = DateAccessor().today_with_timezone('UTC')
            self.manifest_accessor.mark_manifest_as_updated(manifest)
            self.assertGreater(manifest.manifest_updated_datetime, before)
def _process_report_file(schema_name, provider, provider_uuid, report_dict):
    """
    Process a single downloaded report file.

    Records start/completion times for the file, runs the report processor,
    increments the manifest's processed-file counter, flags the provider's
    setup as complete and finally removes processed temporary files.

    Args:
        schema_name   (String) db schema name
        provider      (String) provider type
        provider_uuid (String) provider uuid
        report_dict   (dict) The report data dict from previous task

    Returns:
        None

    """
    report_path = report_dict.get('file')
    compression = report_dict.get('compression')
    manifest_id = report_dict.get('manifest_id')

    stmt = ('Processing Report:'
            ' schema_name: {},'
            ' report_path: {},'
            ' compression: {},'
            ' provider: {},'
            ' start_date: {}')
    LOG.info(stmt.format(schema_name, report_path, compression, provider,
                         report_dict.get('start_date')))

    memory = psutil.virtual_memory()
    LOG.info('Avaiable memory: {} bytes ({}%)'.format(memory.free, memory.percent))

    file_name = report_path.split('/')[-1]

    with ReportStatsDBAccessor(file_name, manifest_id) as stats_recorder:
        # Persist the start time before the (potentially long) processing.
        stats_recorder.log_last_started_datetime()
        stats_recorder.commit()

        processor = ReportProcessor(schema_name=schema_name,
                                    report_path=report_path,
                                    compression=compression,
                                    provider=provider,
                                    provider_id=report_dict.get('provider_id'),
                                    manifest_id=manifest_id)
        processor.process()

        stats_recorder.log_last_completed_datetime()
        stats_recorder.commit()

    with ReportManifestDBAccessor() as manifest_accesor:
        manifest = manifest_accesor.get_manifest_by_id(manifest_id)
        if not manifest:
            LOG.error('Unable to find manifest for ID: %s, file %s',
                      manifest_id, file_name)
        else:
            manifest.num_processed_files += 1
            manifest_accesor.mark_manifest_as_updated(manifest)
            manifest_accesor.commit()

    with ProviderDBAccessor(provider_uuid=provider_uuid) as provider_accessor:
        provider_accessor.setup_complete()
        provider_accessor.commit()

    removed = processor.remove_processed_files(path.dirname(report_path))
    LOG.info('Temporary files removed: %s', str(removed))
def refresh_materialized_views(  # noqa: C901
        schema_name,
        provider_type,
        manifest_id=None,
        provider_uuid=None,
        synchronous=False,
        queue_name=None):
    """Refresh the database's materialized views for reporting.

    Unless ``synchronous`` is set, only one refresh per schema runs at a
    time: if another refresh for ``schema_name`` holds the single-task lock,
    this call re-queues itself and returns without doing any work.

    Args:
        schema_name   (String) db schema whose views are refreshed
        provider_type (String) selects which set of views is refreshed
        manifest_id   (Integer) when given, the manifest is marked completed
        provider_uuid (String) when given, the provider's data-updated
                               timestamp is set
        synchronous   (Boolean) skip the single-task lock handling
        queue_name    (String) queue used when the task re-queues itself

    Returns:
        None

    """
    task_name = "masu.processor.tasks.refresh_materialized_views"
    cache_args = [schema_name]
    worker_cache = None
    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(msg)
            refresh_materialized_views.s(
                schema_name,
                provider_type,
                manifest_id=manifest_id,
                provider_uuid=provider_uuid,
                synchronous=synchronous,
                queue_name=queue_name,
            ).apply_async(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
            return
        worker_cache.lock_single_task(task_name, cache_args, timeout=600)

    try:
        materialized_views = ()
        if provider_type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
            materialized_views = (AWS_MATERIALIZED_VIEWS +
                                  OCP_ON_AWS_MATERIALIZED_VIEWS +
                                  OCP_ON_INFRASTRUCTURE_MATERIALIZED_VIEWS)
        # Equality, not ``in (CONST)``: parentheses around a single value do
        # not make a tuple, so ``in`` would test membership in CONST itself.
        elif provider_type == Provider.PROVIDER_OCP:
            materialized_views = (OCP_MATERIALIZED_VIEWS +
                                  OCP_ON_AWS_MATERIALIZED_VIEWS +
                                  OCP_ON_AZURE_MATERIALIZED_VIEWS +
                                  OCP_ON_INFRASTRUCTURE_MATERIALIZED_VIEWS)
        elif provider_type in (Provider.PROVIDER_AZURE,
                               Provider.PROVIDER_AZURE_LOCAL):
            materialized_views = (AZURE_MATERIALIZED_VIEWS +
                                  OCP_ON_AZURE_MATERIALIZED_VIEWS +
                                  OCP_ON_INFRASTRUCTURE_MATERIALIZED_VIEWS)
        elif provider_type in (Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
            materialized_views = GCP_MATERIALIZED_VIEWS

        with schema_context(schema_name):
            for view in materialized_views:
                table_name = view._meta.db_table
                with connection.cursor() as cursor:
                    cursor.execute(
                        f"REFRESH MATERIALIZED VIEW CONCURRENTLY {table_name}")
                    LOG.info(f"Refreshed {table_name}.")

        invalidate_view_cache_for_tenant_and_source_type(schema_name,
                                                         provider_type)

        if provider_uuid:
            ProviderDBAccessor(provider_uuid).set_data_updated_timestamp()
        if manifest_id:
            # Processing for this manifest should be complete after this step
            with ReportManifestDBAccessor() as manifest_accessor:
                manifest = manifest_accessor.get_manifest_by_id(manifest_id)
                manifest_accessor.mark_manifest_as_completed(manifest)
    finally:
        # Release the lock even when the refresh fails; otherwise further
        # refreshes for this schema are re-queued until the lock times out.
        if worker_cache is not None:
            worker_cache.release_single_task(task_name, cache_args)
def remove(self, simulate=False, provider_uuid=None, line_items_only=False):
    """
    Remove expired data based on the retention policy.

    Also remove expired CostUsageReportManifests, regardless of Provider
    type (only on the paths where full report data is purged, i.e. when
    ``line_items_only`` is false).

    NOTE(review): branch nesting was reconstructed from a source whose
    indentation had been collapsed; confirm against the original file.

    Args:
        simulate (Boolean) passed through to the cleaner; when true the
            manifest purge calls are skipped.
        provider_uuid (String) when given, purge only data for that
            provider; otherwise purge by provider type (``self._provider``).
        line_items_only (Boolean) purge only line item data. Nothing is
            purged for Azure, Azure-local and OCP in this mode.

    Returns:
        Whatever the cleaner's purge call returned, or None when no purge
        call was made.
        ([{}]) List of dictionaries containing 'account_payer_id' and
        'billing_period_start' -- per the original docstring; the shape is
        not established by this method.

    """
    removed_data = None
    # Provider types for which line-item-only purging is skipped.
    disable_purge_line_item = (Provider.PROVIDER_AZURE,
                               Provider.PROVIDER_AZURE_LOCAL,
                               Provider.PROVIDER_OCP)
    no_data_msg = "%s has no line item data to be be removed."
    if provider_uuid is not None:
        if line_items_only:
            if self._provider in disable_purge_line_item:
                LOG.info(no_data_msg % self._provider)
            else:
                expiration_date = self._calculate_expiration_date(
                    line_items_only=line_items_only)
                removed_data = self._cleaner.purge_expired_line_item(
                    expired_date=expiration_date,
                    simulate=simulate,
                    provider_uuid=provider_uuid)
        else:
            removed_data = self._cleaner.purge_expired_report_data(
                simulate=simulate, provider_uuid=provider_uuid)
            with ReportManifestDBAccessor() as manifest_accessor:
                # Remove expired CostUsageReportManifests
                expiration_date = self._calculate_expiration_date()
                if not simulate:
                    manifest_accessor.purge_expired_report_manifest_provider_uuid(
                        provider_uuid, expiration_date)
                # NOTE(review): as reconstructed, this message is logged
                # even when simulate is true and nothing was purged --
                # confirm.
                LOG.info(
                    """Removed CostUsageReportManifest for provider uuid: %s before billing period: %s""",
                    provider_uuid,
                    expiration_date,
                )
    else:
        # One expiration date serves both the data purge and the manifest
        # purge on this path.
        expiration_date = self._calculate_expiration_date(
            line_items_only=line_items_only)
        if line_items_only:
            if self._provider in disable_purge_line_item:
                LOG.info(no_data_msg % self._provider)
            else:
                removed_data = self._cleaner.purge_expired_line_item(
                    expired_date=expiration_date, simulate=simulate)
        else:
            # Remove expired report data, then expired
            # CostUsageReportManifests
            removed_data = self._cleaner.purge_expired_report_data(
                expired_date=expiration_date, simulate=simulate)
            with ReportManifestDBAccessor() as manifest_accessor:
                if not simulate:
                    manifest_accessor.purge_expired_report_manifest(
                        self._provider, expiration_date)
                # NOTE(review): as reconstructed, this message is logged
                # even when simulate is true and nothing was purged --
                # confirm.
                LOG.info(
                    """Removed CostUsageReportManifest for provider type: %s before billing period: %s""",
                    self._provider,
                    expiration_date,
                )
    return removed_data