Esempio n. 1
0
class ReportManifestDBAccessorTest(IamTestCase):
    """Test cases for the ReportManifestDBAccessor."""
    def setUp(self):
        """Prepare the schema name, billing start, manifest fixture data and accessor."""
        super().setUp()
        self.schema = self.schema_name
        # Day 1 of the month containing DateAccessor's "today" in UTC.
        today_utc = DateAccessor().today_with_timezone("UTC")
        self.billing_start = today_utc.replace(day=1)
        # Keyword arguments passed to ``ReportManifestDBAccessor.add`` by the tests.
        self.manifest_dict = dict(
            assembly_id="1234",
            billing_period_start_datetime=self.billing_start,
            num_total_files=2,
            provider_uuid=self.provider_uuid,
        )
        self.manifest_accessor = ReportManifestDBAccessor()

    def tearDown(self):
        """Run the parent teardown, then delete every manifest the accessor can query."""
        super().tearDown()
        accessor = self.manifest_accessor
        with schema_context(self.schema):
            leftovers = accessor._get_db_obj_query().all()
            for leftover in leftovers:
                accessor.delete(leftover)

    def test_initializer(self):
        """Verify that a newly constructed accessor has a non-None ``_table``."""
        self.assertIsNotNone(ReportManifestDBAccessor()._table)

    def test_get_manifest(self):
        """Verify get_manifest returns the manifest added for an assembly id and provider."""
        expected_assembly_id = self.manifest_dict["assembly_id"]
        expected_provider_uuid = self.manifest_dict["provider_uuid"]
        expected_file_count = self.manifest_dict["num_total_files"]

        with schema_context(self.schema):
            created = self.manifest_accessor.add(**self.manifest_dict)
            fetched = self.manifest_accessor.get_manifest(
                expected_assembly_id, expected_provider_uuid)

        # The lookup must hand back the very record that was just added.
        self.assertIsNotNone(fetched)
        self.assertEqual(created, fetched)
        self.assertEqual(fetched.assembly_id, expected_assembly_id)
        self.assertEqual(fetched.provider_id, expected_provider_uuid)
        self.assertEqual(fetched.num_total_files, expected_file_count)

    def test_get_manifest_by_id(self):
        """Verify get_manifest_by_id returns the manifest that was added under that id."""
        accessor = self.manifest_accessor
        with schema_context(self.schema):
            created = accessor.add(**self.manifest_dict)
            fetched = accessor.get_manifest_by_id(created.id)
        self.assertIsNotNone(fetched)
        self.assertEqual(created, fetched)

    def test_mark_manifest_as_updated(self):
        """Verify marking a manifest as updated moves its updated datetime forward."""
        accessor = self.manifest_accessor
        with schema_context(self.schema):
            record = accessor.add(**self.manifest_dict)
            # Timestamp captured just before the update call, used as a lower bound.
            before_update = DateAccessor().today_with_timezone("UTC")
            accessor.mark_manifest_as_updated(record)
            self.assertGreater(record.manifest_updated_datetime, before_update)

    def test_mark_manifest_as_updated_none_manifest(self):
        """Verify mark_manifest_as_updated tolerates ``None`` without raising."""
        try:
            self.manifest_accessor.mark_manifest_as_updated(None)
        except Exception as exc:
            # Any exception is reported as a test failure carrying its text.
            failure_message = f"Test failed with error: {exc}"
            self.fail(failure_message)

    def test_mark_manifest_as_completed_none_manifest(self):
        """Verify mark_manifest_as_completed tolerates ``None`` without raising."""
        try:
            self.manifest_accessor.mark_manifest_as_completed(None)
        except Exception as exc:
            # Any exception is reported as a test failure carrying its text.
            failure_message = f"Test failed with error: {exc}"
            self.fail(failure_message)

    def test_get_manifest_list_for_provider_and_bill_date(self):
        """Verify the provider/bill-date listing grows by one with each manifest added."""
        bill_date = self.manifest_dict["billing_period_start_datetime"].date()
        manifest_kwargs = copy.deepcopy(self.manifest_dict)

        # First the fixture's own assembly id, then two further distinct ids.
        assembly_ids = (manifest_kwargs["assembly_id"], "2345", "3456")
        for expected_count, assembly_id in enumerate(assembly_ids, start=1):
            manifest_kwargs["assembly_id"] = assembly_id
            self.manifest_accessor.add(**manifest_kwargs)
            found = self.manifest_accessor.get_manifest_list_for_provider_and_bill_date(
                self.provider_uuid, bill_date)
            self.assertEqual(len(found), expected_count)

    def test_get_last_seen_manifest_ids(self):
        """Exercise get_last_seen_manifest_ids through four successive scenarios.

        1. Two unprocessed manifests for different providers are both returned.
        2. A manifest whose files have all been processed is no longer returned.
        3. With two manifests for one provider, only the later-added one is returned.
        4. A manifest moved to another billing month is not returned.
        """
        accessor = self.manifest_accessor
        billing_start = self.billing_start

        # Scenario 1: one manifest per provider, nothing processed yet.
        other_provider_kwargs = {
            "assembly_id": "5678",
            "billing_period_start_datetime": billing_start,
            "num_total_files": 1,
            "provider_uuid": "00000000-0000-0000-0000-000000000002",
        }
        first = accessor.add(**self.manifest_dict)
        second = accessor.add(**other_provider_kwargs)
        seen = accessor.get_last_seen_manifest_ids(billing_start)
        self.assertEqual(seen, [first.assembly_id, second.assembly_id])

        # Scenario 2: generate and process every report file of the second manifest.
        helper = ManifestCreationHelper(
            second.id,
            other_provider_kwargs["num_total_files"],
            other_provider_kwargs["assembly_id"],
        )
        helper.generate_test_report_files()
        helper.process_all_files()

        seen = accessor.get_last_seen_manifest_ids(billing_start)
        self.assertEqual(seen, [first.assembly_id])

        # Scenario 3: a second manifest for the fixture provider replaces the first.
        same_provider_kwargs = {
            "assembly_id": "91011",
            "billing_period_start_datetime": billing_start,
            "num_total_files": 1,
            "provider_uuid": self.provider_uuid,
        }
        third = accessor.add(**same_provider_kwargs)
        seen = accessor.get_last_seen_manifest_ids(billing_start)
        self.assertEqual(seen, [third.assembly_id])

        # Scenario 4: shift the third manifest two months back and save it.
        third.billing_period_start_datetime = billing_start + relativedelta(months=-2)
        third.save()
        seen = accessor.get_last_seen_manifest_ids(billing_start)
        self.assertEqual(seen, [first.assembly_id])

    def test_is_last_completed_datetime_null(self):
        """Verify is_last_completed_datetime_null before and after a completion date is set."""
        manifest_id = 123456789

        def completed_datetime_is_null():
            # A fresh accessor is built for every check.
            return ReportManifestDBAccessor().is_last_completed_datetime_null(manifest_id)

        # Checked before this test has created any rows for the id.
        self.assertTrue(completed_datetime_is_null())

        # A status row exists but its completion datetime is explicitly None.
        baker.make(CostUsageReportManifest, id=manifest_id)
        baker.make(
            CostUsageReportStatus,
            manifest_id=manifest_id,
            last_completed_datetime=None,
        )
        self.assertTrue(completed_datetime_is_null())

        # Once a completion value is stored the check must flip to False.
        status_rows = CostUsageReportStatus.objects.filter(manifest_id=manifest_id)
        status_rows.update(last_completed_datetime=FAKE.date())
        self.assertFalse(completed_datetime_is_null())

    def test_get_s3_csv_cleared(self):
        """Verify the S3 CSV cleared flag is False on a new manifest and True once marked."""
        accessor = self.manifest_accessor
        with schema_context(self.schema):
            record = accessor.add(**self.manifest_dict)
            self.assertFalse(accessor.get_s3_csv_cleared(record))

            accessor.mark_s3_csv_cleared(record)

            self.assertTrue(accessor.get_s3_csv_cleared(record))

    def test_get_s3_parquet_cleared(self):
        """Verify the S3 parquet cleared flag is False on a new manifest and True once marked."""
        accessor = self.manifest_accessor
        with schema_context(self.schema):
            record = accessor.add(**self.manifest_dict)
            self.assertFalse(accessor.get_s3_parquet_cleared(record))

            accessor.mark_s3_parquet_cleared(record)

            self.assertTrue(accessor.get_s3_parquet_cleared(record))
    def convert_to_parquet(self):  # noqa: C901
        """
        Convert every CSV file in ``self.file_list`` to Parquet.

        Steps, in order:

        1. Bail out (logging an error) when any of ``csv_path_s3``,
           ``parquet_path_s3`` or ``local_path`` is ``None``.
        2. For provider types other than OCP, GCP and GCP-local, and only when the
           manifest is not yet flagged as parquet-cleared, remove files from the
           three Parquet S3 paths via ``remove_files_not_in_set_from_s3_bucket``
           and then flag the manifest as cleared.
        3. Convert each CSV file with ``convert_csv_to_parquet``; for every
           provider type except Azure also call ``create_daily_parquet``.
           Files that fail (or OCP files without a report type) are collected
           and reported in a single warning at the end.

        NOTE(review): earlier documentation described this as a task retried with
        exponential backoff; no retry logic is visible in this method - confirm
        whether the caller provides it.

        Returns:
            tuple: ``(parquet_base_filename, daily_data_frames)``. On the normal
            path the second element is a list extended with each file's daily
            frames and the first is the base filename from the last converted
            file (``""`` if none was converted). On the invalid-path early exit
            the result is ``("", pd.DataFrame())`` - note the second element is
            then an empty DataFrame rather than a list.
        """
        parquet_base_filename = ""

        if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
            msg = (
                "Invalid paths provided to convert_csv_to_parquet. "
                f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
            )
            LOG.error(log_json(self.tracing_id, msg, self.error_context))
            return "", pd.DataFrame()

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

        # OCP data is daily chunked report files.
        # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
        if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
            Provider.PROVIDER_OCP,
            Provider.PROVIDER_GCP,
            Provider.PROVIDER_GCP_LOCAL,
        ):
            # Same clean-up for the regular, daily and OCP-on-cloud Parquet paths.
            for stale_path in (
                self.parquet_path_s3,
                self.parquet_daily_path_s3,
                self.parquet_ocp_on_cloud_path_s3,
            ):
                remove_files_not_in_set_from_s3_bucket(
                    self.tracing_id, stale_path, self.manifest_id, self.error_context
                )
            manifest_accessor.mark_s3_parquet_cleared(manifest)

        failed_conversion = []
        daily_data_frames = []
        for csv_filename in self.file_list:
            if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
                msg = f"Could not establish report type for {csv_filename}."
                LOG.warning(log_json(self.tracing_id, msg, self.error_context))
                failed_conversion.append(csv_filename)
                continue
            parquet_base_filename, daily_frame, success = self.convert_csv_to_parquet(csv_filename)
            daily_data_frames.extend(daily_frame)
            # Plain inequality: the previous ``not in (X)`` lacked a trailing comma,
            # so it was a substring test against a string, not tuple membership.
            if self.provider_type != Provider.PROVIDER_AZURE:
                self.create_daily_parquet(parquet_base_filename, daily_frame)
            if not success:
                failed_conversion.append(csv_filename)

        if failed_conversion:
            msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
        return parquet_base_filename, daily_data_frames