    # Excerpted test methods from a CloudDQ test suite; they assume the test
    # module's usual imports (re, shutil, pathlib.Path, clouddq's lib and utils,
    # DqRuleBinding, working_directory, strip_margin) plus the RE_* / *_REP
    # normalization constants referenced throughout.
    def test_create_entity_summary_model(self, test_resources):
        entity_target_rule_binding_configs = {
            "rule_binding_ids_list": [
                "rule_binding_id_1",
                "rule_binding_id_2",
            ]
        }
        entity_summary_model = lib.create_entity_summary_model(
            entity_table_id="entity_table_id",
            entity_target_rule_binding_configs=entity_target_rule_binding_configs,
            gcp_project_id="gcp_project_id",
            gcp_bq_dataset_id="gcp_bq_dataset_id",
            debug=True,
        )
        with open(test_resources / "expected_entity_summary_model.sql") as f:
            expected_entity_summary_model = f.read()
        expected = utils.strip_margin(
            re.sub(RE_NEWLINES, '\n', expected_entity_summary_model)).strip()
        assert expected == utils.strip_margin(
            re.sub(RE_NEWLINES, '\n', entity_summary_model)).strip()
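For reference, RE_NEWLINES is a module-level pattern these tests use to collapse runs of blank lines before diffing rendered SQL against the golden files. A minimal sketch of how it could be defined; only the name comes from the tests, the exact pattern is an assumption:

import re

# Collapse two or more consecutive newlines (possibly separated by trailing
# spaces or tabs) into a single newline, so incidental blank lines in the
# rendered SQL do not fail the golden-file comparison.
RE_NEWLINES = re.compile(r"\n[ \t]*(?:\n[ \t]*)+")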
    def test_render_run_dq_main_sql_high_watermark(
        self,
        test_rule_bindings_collection_team_1,
        test_configs_cache,
        test_resources,
        gcp_project_id,
        gcp_bq_dataset,
        test_bigquery_client,
    ):
        """

        Args:
          test_rule_bindings_collection_team_1:
          test_entities_collection:
          test_rules_collection:
          test_row_filters_collection:

        Returns:

        """
        with open(
                test_resources /
                "test_render_run_dq_main_sql_expected_high_watermark.sql",
        ) as f:
            expected = f.read()
        # Use the first rule binding in the collection.
        rule_binding_id, rule_binding_configs = next(
            iter(test_rule_bindings_collection_team_1.items()))
        output = lib.create_rule_binding_view_model(
            rule_binding_id=rule_binding_id,
            rule_binding_configs=rule_binding_configs,
            dq_summary_table_name=f"{gcp_project_id}.{gcp_bq_dataset}.dq_summary",
            configs_cache=test_configs_cache,
            environment="DEV",
            debug=True,
            dq_summary_table_exists=True,
            high_watermark_filter_exists=False,
            bigquery_client=test_bigquery_client,
        )
        output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
        output = re.sub(RE_HIGH_WATERMARK_TIMESTAMP, HIGH_WATERMARK_VALUE_REP,
                        output)
        output = re.sub(RE_CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_VALUE_REP,
                        output)
        output = re.sub(RE_NEWLINES, '\n', output).strip()
        expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n',
                                             expected)).strip()
        assert output == expected
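The hashsum and timestamp substitutions above mask values that differ on every run. A plausible sketch of those masking constants; the names appear in the tests, but the patterns below are assumed purely for illustration:

import re

# Replace run-specific values with stable placeholders so the rendered SQL
# can be compared against a static golden file.
RE_CONFIGS_HASHSUM = re.compile(r"'[0-9a-f]+' AS configs_hashsum")
CONFIGS_HASHSUM_REP = "'<configs_hashsum>' AS configs_hashsum"
RE_HIGH_WATERMARK_TIMESTAMP = re.compile(r"'\d{4}-\d{2}-\d{2}[ T][0-9:.+]+'")
HIGH_WATERMARK_VALUE_REP = "'<high_watermark_timestamp>'"
RE_CURRENT_TIMESTAMP = re.compile(r"CURRENT_TIMESTAMP\(\)")
CURRENT_TIMESTAMP_VALUE_REP = "'<current_timestamp>'"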
    def test_render_run_dq_main_sql_from_configs_file(
        self,
        test_rule_bindings_collection_from_configs_file,
        test_default_dataplex_configs_cache_from_file,
        test_resources,
        gcp_project_id,
        gcp_dataplex_bigquery_dataset_id,
        gcp_bq_dataset,
        test_dataplex_metadata_defaults_configs,
        gcp_dataplex_zone_id,
        gcp_dataplex_lake_name,
        test_bigquery_client,
    ):
        """ """
        for rule_binding_id, rule_binding_configs in test_rule_bindings_collection_from_configs_file.items():

            with open(test_resources / "dataplex_metadata_sql_expected.sql") as f:
                expected = f.read()
            output = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name="<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary",
                configs_cache=test_default_dataplex_configs_cache_from_file,
                environment="DEV",
                debug=True,
                default_configs=test_dataplex_metadata_defaults_configs,
                bigquery_client=test_bigquery_client,
            )
            output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
            output = (
                output.replace(gcp_project_id, "<your-gcp-project-id>")
                .replace(gcp_dataplex_bigquery_dataset_id, "<your_bigquery_dataset_id>")
                .replace(gcp_bq_dataset, "<your_bigquery_dataset_id>")
            )
            if gcp_dataplex_zone_id in output:
                output = output.replace(gcp_dataplex_zone_id, "<your_dataplex_zone_id>")
            else:
                output = output.replace("CAST(NULL AS STRING) AS dataplex_zone",
                                        "'<your_dataplex_zone_id>' AS dataplex_zone")
            if gcp_dataplex_lake_name in output:
                output = output.replace(gcp_dataplex_lake_name, "<your_dataplex_lake_id>")
            else:
                output = output.replace("CAST(NULL AS STRING) AS dataplex_lake",
                                        "'<your_dataplex_lake_id>' AS dataplex_lake")

            output = output.replace(rule_binding_id, "<rule_binding_id>")
            expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
            print(output)
            output = re.sub(RE_NEWLINES, '\n', output).strip()
            output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
            output = output.replace("CAST(NULL AS STRING) AS dataplex_asset_id,", ASSET_ID_REP)
            assert output == expected
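RE_ASSET_ID and ASSET_ID_REP above mask the concrete Dataplex asset id the same way. An illustrative sketch, with the pattern assumed (it must cover the trailing comma, since the literal CAST(NULL ...) fallback replaced on the line above includes one):

import re

# Mask the environment-specific Dataplex asset id so goldens stay portable.
RE_ASSET_ID = re.compile(r"'[^']+' AS dataplex_asset_id,")
ASSET_ID_REP = "'<your_dataplex_asset_id>' AS dataplex_asset_id,"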
    def test_dq_rule_binding_conflicted_column_id_is_not_escaped_for_sql_statement(self, temp_configs_dir, tmp_path):
        try:
            temp_dir = tmp_path / "clouddq_test_configs_cache_2"
            temp_dir.mkdir(parents=True)
            with working_directory(temp_dir):
                configs_cache = lib.prepare_configs_cache(temp_configs_dir)
        finally:
            shutil.rmtree(temp_dir)

        dq_rule_binding_dict_with_conflicted_column_id = {
            "entity_id": "TEST_TABLE",
            "column_id": "data",
            "row_filter_id": "NONE",
            "rule_ids": [{"NO_DUPLICATES_IN_COLUMN_GROUPS": {"column_names": "data"}}],
            "metadata": {"key": "value"}
        }

        output = DqRuleBinding.from_dict(
            rule_binding_id="valid",
            kwargs=dq_rule_binding_dict_with_conflicted_column_id
        ).resolve_all_configs_to_dict(configs_cache=configs_cache)
        text = output["rule_configs_dict"]["NO_DUPLICATES_IN_COLUMN_GROUPS"]["rule_sql_expr"]

        expected = """
        |select a.*
        |from data a
        |inner join (
        |  select
        |    data
        |  from data
        |  group by data
        |  having count(*) > 1
        |) duplicates
        |using (data)"""
        # NB: str.replace treats r"\s\s+" as a literal string, not a regex, so
        # the original whitespace-collapsing calls were no-ops; compare the
        # margin-stripped strings directly instead.
        assert strip_margin(text) == strip_margin(expected)
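strip_margin and working_directory, used above, are small clouddq utility helpers. Minimal sketches, assuming they behave like Scala's stripMargin and a chdir-restoring context manager respectively (the real implementations live in clouddq's utils module):

import os
import re
from contextlib import contextmanager
from pathlib import Path

def strip_margin(text: str) -> str:
    # Remove leading whitespace up to and including the '|' margin marker on
    # each line, so multi-line expected strings can be indented in the test.
    return re.sub(r"\n[ \t]*\|", "\n", text)

@contextmanager
def working_directory(path: Path):
    # Temporarily chdir into `path`; restore the previous cwd on exit, even
    # if the body raises.
    previous = Path.cwd()
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(previous)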
    def test_render_run_dq_main_sql_env_override(
        self,
        test_rule_bindings_collection_team_2,
        test_configs_cache,
        test_resources,
        test_bigquery_client,
    ):
        """

        Args:
          test_rule_bindings_collection_team_2:
          test_entities_collection:
          test_rules_collection:
          test_row_filters_collection:

        Returns:

        """
        with open(test_resources /
                  "test_render_run_dq_main_sql_expected.sql") as f:
            expected = f.read()
        # Use the first rule binding in the collection.
        rule_binding_id, rule_binding_configs = next(
            iter(test_rule_bindings_collection_team_2.items()))
        output = lib.create_rule_binding_view_model(
            rule_binding_id=rule_binding_id,
            rule_binding_configs=rule_binding_configs,
            dq_summary_table_name="<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary",
            configs_cache=test_configs_cache,
            environment="TEST",
            debug=True,
            high_watermark_filter_exists=False,
            bigquery_client=test_bigquery_client,
        )
        expected = expected.replace(
            "<your_gcp_project_id>.<your_bigquery_dataset_id>",
            "<your_gcp_project_id_2>.<your_bigquery_dataset_id_2>")
        expected = expected.replace("<your_bigquery_dataset_id>.__TABLES__",
                                    "<your_bigquery_dataset_id_2>.__TABLES__")
        output = re.sub(RE_NEWLINES, '\n', output).strip()
        output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
        expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n',
                                             expected)).strip()
        assert output == expected
    def test_render_run_dq_main_sql_bq_native_partitioned(
        self,
        test_rule_bindings_collection_team_8,
        test_default_dataplex_configs_cache,
        test_resources,
        gcp_project_id,
        test_dataplex_metadata_defaults_configs,
        gcp_dataplex_zone_id,
        gcp_dataplex_lake_name,
        gcp_dataplex_bigquery_dataset_id,
        gcp_bq_dataset,
        test_bigquery_client,
    ):
        """ """
        for rule_binding_id, rule_binding_configs in test_rule_bindings_collection_team_8.items():

            if rule_binding_id in ["T16_URI_BQ_PARTITIONED_EMAIL_DUPLICATE", "T17_URI_BQ_PARTITIONED_EMAIL_DUPLICATE"]:
                expected_sql_filename = "bq_native_default_partitioned_sql_expected.sql"
            else:
                expected_sql_filename = "bq_native_partitioned_sql_expected.sql"

            with open(test_resources / expected_sql_filename) as f:
                expected = f.read()
            output = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name="<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary",
                configs_cache=test_default_dataplex_configs_cache,
                environment="DEV",
                debug=True,
                default_configs=test_dataplex_metadata_defaults_configs,
                bigquery_client=test_bigquery_client,
            )
            print(output)
            output = (
                output.replace(gcp_project_id, "<your-gcp-project-id>")
                .replace(gcp_dataplex_bigquery_dataset_id, "<your_bigquery_dataset_id>")
                .replace(gcp_bq_dataset, "<your_bigquery_dataset_id>")
            )
            output = output.replace(rule_binding_id, "<rule_binding_id>")
            output = re.sub(RE_NEWLINES, '\n', output).strip()
            output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
            output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
            expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
            assert output == expected
    def test_render_run_dq_main_sql_gcs_partitioned(
        self,
        test_rule_bindings_collection_team_6,
        test_default_dataplex_configs_cache,
        test_resources,
        gcp_project_id,
        test_dataplex_metadata_defaults_configs,
        gcp_dataplex_zone_id,
        gcp_dataplex_lake_name,
        test_bigquery_client,
    ):
        """ """
        for rule_binding_id, rule_binding_configs in test_rule_bindings_collection_team_6.items():

            with open(test_resources / "dataplex_gcs_partitioned_metadata_sql_expected.sql") as f:
                expected = f.read()
            output = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name="<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary",
                configs_cache=test_default_dataplex_configs_cache,
                environment="DEV",
                debug=True,
                default_configs=test_dataplex_metadata_defaults_configs,
                bigquery_client=test_bigquery_client,
            )
            output = (
                output.replace(gcp_project_id, "<your-gcp-project-id>")
                .replace(gcp_dataplex_zone_id.replace('-', '_'), "<your_dataplex_zone_name>")
                .replace(gcp_dataplex_zone_id, "<your_dataplex_zone_name>")
                .replace(rule_binding_id, "<rule_binding_id>")
                .replace(gcp_dataplex_lake_name, "<your_dataplex_lake_id>")
            )
            expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
            output = re.sub(RE_NEWLINES, '\n', output).strip()
            output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
            output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
            assert output == expected