class ReportManifestDBAccessorTest(IamTestCase):
    """Exercise the ReportManifestDBAccessor against the test schema."""

    def setUp(self):
        """Prepare a default manifest payload and an accessor for every test."""
        super().setUp()
        self.schema = self.schema_name
        # The billing period starts on the first day of the current UTC month.
        today_utc = DateAccessor().today_with_timezone("UTC")
        self.billing_start = today_utc.replace(day=1)
        self.manifest_dict = dict(
            assembly_id="1234",
            billing_period_start_datetime=self.billing_start,
            num_total_files=2,
            provider_uuid=self.provider_uuid,
        )
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Delete every manifest the accessor can see once a test finishes."""
        super().tearDown()
        with schema_context(self.schema):
            for leftover in self.manifest_accessor._get_db_obj_query().all():
                self.manifest_accessor.delete(leftover)

    def test_initializer(self):
        """A freshly built accessor exposes a table."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """Looking a manifest up by assembly id and provider returns the stored row."""
        payload = self.manifest_dict
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**payload)
            assembly_id = payload.get("assembly_id")
            provider_uuid = payload.get("provider_uuid")
            fetched = self.manifest_accessor.get_manifest(assembly_id, provider_uuid)
            self.assertIsNotNone(fetched)
            self.assertEqual(created, fetched)
            self.assertEqual(fetched.assembly_id, assembly_id)
            self.assertEqual(fetched.provider_id, provider_uuid)
            self.assertEqual(fetched.num_total_files, payload.get("num_total_files"))

    def test_get_manifest_by_id(self):
        """Looking a manifest up by primary key returns the stored row."""
        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            fetched = self.manifest_accessor.get_manifest_by_id(created.id)
            self.assertIsNotNone(fetched)
            self.assertEqual(created, fetched)

    def test_mark_manifest_as_updated(self):
        """Marking a manifest as updated moves its updated timestamp forward."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            # Captured before the update so the new timestamp must be later.
            before_update = DateAccessor().today_with_timezone("UTC")
            self.manifest_accessor.mark_manifest_as_updated(manifest)
            self.assertGreater(manifest.manifest_updated_datetime, before_update)

    def test_mark_manifest_as_updated_none_manifest(self):
        """Marking ``None`` as updated must not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_updated(None)
        except Exception as err:
            self.fail(f"Test failed with error: {err}")
        else:
            return

    def test_mark_manifest_as_completed_none_manifest(self):
        """Marking ``None`` as completed must not raise."""
        try:
            self.manifest_accessor.mark_manifest_as_completed(None)
        except Exception as err:
            self.fail(f"Test failed with error: {err}")
        else:
            return

    def test_get_manifest_list_for_provider_and_bill_date(self):
        """Every manifest added for the provider and bill date is listed."""
        payload = copy.deepcopy(self.manifest_dict)
        bill_date = self.manifest_dict["billing_period_start_datetime"].date()
        # The first round reuses the default assembly id; later rounds add new ones.
        assembly_ids = (payload["assembly_id"], "2345", "3456")
        for expected_total, assembly_id in enumerate(assembly_ids, start=1):
            payload["assembly_id"] = assembly_id
            self.manifest_accessor.add(**payload)
            found = self.manifest_accessor.get_manifest_list_for_provider_and_bill_date(
                self.provider_uuid, bill_date
            )
            self.assertEqual(len(found), expected_total)

    def test_get_last_seen_manifest_ids(self):
        """Test that get_last_seen_manifest_ids returns the appropriate assembly_ids."""
        accessor = self.manifest_accessor
        second_payload = dict(
            assembly_id="5678",
            billing_period_start_datetime=self.billing_start,
            num_total_files=1,
            provider_uuid="00000000-0000-0000-0000-000000000002",
        )

        # The most recently seen manifests that have not been processed are returned.
        first = accessor.add(**self.manifest_dict)
        second = accessor.add(**second_payload)
        self.assertEqual(
            accessor.get_last_seen_manifest_ids(self.billing_start),
            [first.assembly_id, second.assembly_id],
        )

        # Once a manifest's files have been processed it is no longer returned.
        second_helper = ManifestCreationHelper(
            second.id,
            second_payload.get("num_total_files"),
            second_payload.get("assembly_id"),
        )
        second_helper.generate_test_report_files()
        second_helper.process_all_files()
        self.assertEqual(
            accessor.get_last_seen_manifest_ids(self.billing_start),
            [first.assembly_id],
        )

        # Of two manifests sharing a provider, only the most recently seen is returned.
        third_payload = dict(
            assembly_id="91011",
            billing_period_start_datetime=self.billing_start,
            num_total_files=1,
            provider_uuid=self.provider_uuid,
        )
        third = accessor.add(**third_payload)
        self.assertEqual(
            accessor.get_last_seen_manifest_ids(self.billing_start),
            [third.assembly_id],
        )

        # Manifests belonging to a different billing month are not returned.
        third.billing_period_start_datetime = self.billing_start + relativedelta(months=-2)
        third.save()
        self.assertEqual(
            accessor.get_last_seen_manifest_ids(self.billing_start),
            [first.assembly_id],
        )

    def test_is_last_completed_datetime_null(self):
        """The null check tracks the status row's last_completed_datetime."""
        manifest_id = 123456789

        # Checked before this test creates any rows for the id.
        self.assertTrue(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))

        # A status row whose completion timestamp is unset still counts as null.
        baker.make(CostUsageReportManifest, id=manifest_id)
        baker.make(CostUsageReportStatus, manifest_id=manifest_id, last_completed_datetime=None)
        self.assertTrue(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))

        # After a completion value is stored the check flips to False.
        status_rows = CostUsageReportStatus.objects.filter(manifest_id=manifest_id)
        status_rows.update(last_completed_datetime=FAKE.date())
        self.assertFalse(ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id))

    def test_get_s3_csv_cleared(self):
        """Test that s3 CSV clear status is reported."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            self.assertFalse(self.manifest_accessor.get_s3_csv_cleared(manifest))

            self.manifest_accessor.mark_s3_csv_cleared(manifest)
            self.assertTrue(self.manifest_accessor.get_s3_csv_cleared(manifest))

    def test_get_s3_parquet_cleared(self):
        """Test that s3 Parquet clear status is reported."""
        with schema_context(self.schema):
            manifest = self.manifest_accessor.add(**self.manifest_dict)
            self.assertFalse(self.manifest_accessor.get_s3_parquet_cleared(manifest))

            self.manifest_accessor.mark_s3_parquet_cleared(manifest)
            self.assertTrue(self.manifest_accessor.get_s3_parquet_cleared(manifest))
def convert_to_parquet(self):  # noqa: C901
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a providers data.

    NOTE(review): earlier documentation stated that this runs as a task with
    up to 10 retries and exponential backoff starting at a 10-second delay to
    ride out temporary AWS S3 connectivity issues. No retry logic is visible
    in this method itself; presumably it is configured by the caller - confirm.

    Returns:
        tuple: ``(parquet_base_filename, daily_data_frames)``.
            ``parquet_base_filename`` is the base filename reported for the
            last file handed to ``convert_csv_to_parquet`` (``""`` if none
            was converted) and ``daily_data_frames`` is the list accumulated
            from every converted file. When any required path is missing the
            method logs an error and returns ``("", pd.DataFrame())`` instead;
            note that the second element is then a DataFrame, not a list.
    """
    parquet_base_filename = ""

    # All three locations are required before any conversion can be attempted.
    if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet."
            f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
        )
        LOG.error(log_json(self.tracing_id, msg, self.error_context))
        return "", pd.DataFrame()

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
        Provider.PROVIDER_OCP,
        Provider.PROVIDER_GCP,
        Provider.PROVIDER_GCP_LOCAL,
    ):
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_daily_path_s3, self.manifest_id, self.error_context
        )
        remove_files_not_in_set_from_s3_bucket(
            self.tracing_id, self.parquet_ocp_on_cloud_path_s3, self.manifest_id, self.error_context
        )
        # Record the cleanup so it is not repeated for this manifest.
        manifest_accessor.mark_s3_parquet_cleared(manifest)

    failed_conversion = []
    daily_data_frames = []
    for csv_filename in self.file_list:
        # OCP files cannot be converted without a known report type.
        if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
            msg = f"Could not establish report type for {csv_filename}."
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
            failed_conversion.append(csv_filename)
            continue

        parquet_base_filename, daily_frame, success = self.convert_csv_to_parquet(csv_filename)
        daily_data_frames.extend(daily_frame)
        # Compare with != rather than `not in (X)`: without a trailing comma the
        # parentheses do not build a tuple, so the old check was a substring test
        # against the provider-type constant.
        if self.provider_type != Provider.PROVIDER_AZURE:
            self.create_daily_parquet(parquet_base_filename, daily_frame)
        if not success:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warning(log_json(self.tracing_id, msg, self.error_context))

    return parquet_base_filename, daily_data_frames