def test_create_entity_summary_model(self, test_resources):
    entity_target_rule_binding_configs = {
        "rule_binding_ids_list": [
            "rule_binding_id_1",
            "rule_binding_id_2",
        ]
    }
    entity_summary_model = lib.create_entity_summary_model(
        entity_table_id="entity_table_id",
        entity_target_rule_binding_configs=entity_target_rule_binding_configs,
        gcp_project_id="gcp_project_id",
        gcp_bq_dataset_id="gcp_bq_dataset_id",
        debug=True,
    )
    with open(test_resources / "expected_entity_summary_model.sql") as f:
        expected_entity_summary_model = f.read()
    expected = utils.strip_margin(
        re.sub(RE_NEWLINES, '\n', expected_entity_summary_model)).strip()
    assert expected == utils.strip_margin(
        re.sub(RE_NEWLINES, '\n', entity_summary_model)).strip()
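# `utils.strip_margin` above is assumed to behave like Scala's stripMargin: drop
# everything up to and including a leading '|' on each line, so expected SQL can
# be written with an indented margin. A minimal sketch of that behavior
# (illustrative only; the real implementation lives in clouddq.utils, and the
# module-level `import re` covers the regex call):
def _strip_margin_sketch(text: str) -> str:
    # Remove an optional whitespace-then-'|' prefix at the start of every line.
    return re.sub(r"(^|\n)[ \t]*\|", r"\1", text)

# For example:
#   _strip_margin_sketch("\n    |select 1\n    |from t")  ->  "\nselect 1\nfrom t"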
def test_render_run_dq_main_sql_high_watermark(
    self,
    test_rule_bindings_collection_team_1,
    test_configs_cache,
    test_resources,
    gcp_project_id,
    gcp_bq_dataset,
    test_bigquery_client,
):
    """Render the run-DQ main SQL with high-watermark filtering and compare it
    against the expected SQL resource."""
    with open(
        test_resources / "test_render_run_dq_main_sql_expected_high_watermark.sql",
    ) as f:
        expected = f.read()
    # Use the first rule binding in the collection.
    rule_binding_id, rule_binding_configs = next(
        iter(test_rule_bindings_collection_team_1.items())
    )
    output = lib.create_rule_binding_view_model(
        rule_binding_id=rule_binding_id,
        rule_binding_configs=rule_binding_configs,
        dq_summary_table_name=f"{gcp_project_id}.{gcp_bq_dataset}.dq_summary",
        configs_cache=test_configs_cache,
        environment="DEV",
        debug=True,
        dq_summary_table_exists=True,
        high_watermark_filter_exists=False,
        bigquery_client=test_bigquery_client,
    )
    # Mask run-specific values (configs hashsum, timestamps) with stable placeholders.
    output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
    output = re.sub(RE_HIGH_WATERMARK_TIMESTAMP, HIGH_WATERMARK_VALUE_REP, output)
    output = re.sub(RE_CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_VALUE_REP, output)
    output = re.sub(RE_NEWLINES, '\n', output).strip()
    expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
    assert output == expected
def test_render_run_dq_main_sql_from_configs_file(
    self,
    test_rule_bindings_collection_from_configs_file,
    test_default_dataplex_configs_cache_from_file,
    test_resources,
    gcp_project_id,
    gcp_dataplex_bigquery_dataset_id,
    gcp_bq_dataset,
    test_dataplex_metadata_defaults_configs,
    gcp_dataplex_zone_id,
    gcp_dataplex_lake_name,
    test_bigquery_client,
):
    """Render the run-DQ main SQL for every rule binding loaded from a configs
    file and compare each result against the expected SQL resource."""
    for rule_binding_id, rule_binding_configs in (
            test_rule_bindings_collection_from_configs_file.items()):
        with open(test_resources / "dataplex_metadata_sql_expected.sql") as f:
            expected = f.read()
        output = lib.create_rule_binding_view_model(
            rule_binding_id=rule_binding_id,
            rule_binding_configs=rule_binding_configs,
            dq_summary_table_name=(
                "<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary"
            ),
            configs_cache=test_default_dataplex_configs_cache_from_file,
            environment="DEV",
            debug=True,
            default_configs=test_dataplex_metadata_defaults_configs,
            bigquery_client=test_bigquery_client,
        )
        output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
        # Replace environment-specific identifiers with stable placeholders.
        output = (
            output.replace(gcp_project_id, "<your-gcp-project-id>")
            .replace(gcp_dataplex_bigquery_dataset_id, "<your_bigquery_dataset_id>")
            .replace(gcp_bq_dataset, "<your_bigquery_dataset_id>")
        )
        # The zone and lake columns may be rendered either as literals or as
        # NULL casts; normalize both forms to the same placeholder.
        if gcp_dataplex_zone_id in output:
            output = output.replace(gcp_dataplex_zone_id, "<your_dataplex_zone_id>")
        else:
            output = output.replace(
                "CAST(NULL AS STRING) AS dataplex_zone",
                "'<your_dataplex_zone_id>' AS dataplex_zone",
            )
        if gcp_dataplex_lake_name in output:
            output = output.replace(gcp_dataplex_lake_name, "<your_dataplex_lake_id>")
        else:
            output = output.replace(
                "CAST(NULL AS STRING) AS dataplex_lake",
                "'<your_dataplex_lake_id>' AS dataplex_lake",
            )
        output = output.replace(rule_binding_id, "<rule_binding_id>")
        expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
        output = re.sub(RE_NEWLINES, '\n', output).strip()
        # The asset id may also appear as either a literal or a NULL cast.
        output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
        output = output.replace(
            "CAST(NULL AS STRING) AS dataplex_asset_id,", ASSET_ID_REP
        )
        assert output == expected
def test_dq_rule_binding_conflicted_column_id_is_not_escaped_for_sql_statement(
    self, temp_configs_dir, tmp_path
):
    try:
        temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
        temp_dir.mkdir(parents=True)
        with working_directory(temp_dir):
            configs_cache = lib.prepare_configs_cache(temp_configs_dir)
    finally:
        shutil.rmtree(temp_dir)
    # 'column_id' deliberately collides with the 'column_names' rule argument.
    dq_rule_binding_dict_with_conflicted_column_id = {
        "entity_id": "TEST_TABLE",
        "column_id": "data",
        "row_filter_id": "NONE",
        "rule_ids": [
            {"NO_DUPLICATES_IN_COLUMN_GROUPS": {"column_names": "data"}}
        ],
        "metadata": {"key": "value"},
    }
    output = DqRuleBinding.from_dict(
        rule_binding_id="valid",
        kwargs=dq_rule_binding_dict_with_conflicted_column_id,
    ).resolve_all_configs_to_dict(configs_cache=configs_cache)
    text = output["rule_configs_dict"]["NO_DUPLICATES_IN_COLUMN_GROUPS"]["rule_sql_expr"]
    expected = """
    |select a.*
    |from data a
    |inner join (
    |  select
    |    data
    |  from data
    |  group by data
    |  having count(*) > 1
    |) duplicates
    |using (data)"""

    def normalize(sql: str) -> str:
        # Collapse all whitespace so the comparison checks SQL tokens, not layout.
        return re.sub(r"\s+", " ", strip_margin(sql)).strip()

    assert normalize(text) == normalize(expected)
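# `working_directory` used in the test above is imported from clouddq.utils; it
# is assumed to be a plain chdir context manager. A minimal sketch of such a
# helper, under that assumption (imports included for self-containment):
import contextlib
import os

@contextlib.contextmanager
def _working_directory_sketch(path):
    """Temporarily run the enclosed block with `path` as the working directory."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield path
    finally:
        # Always restore the original working directory, even if the block raises.
        os.chdir(previous)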
def test_render_run_dq_main_sql_env_override(
    self,
    test_rule_bindings_collection_team_2,
    test_configs_cache,
    test_resources,
    test_bigquery_client,
):
    """Render the run-DQ main SQL with the 'TEST' environment override and
    compare against the expected SQL with the overridden identifiers."""
    with open(test_resources / "test_render_run_dq_main_sql_expected.sql") as f:
        expected = f.read()
    # Use the first rule binding in the collection.
    rule_binding_id, rule_binding_configs = next(
        iter(test_rule_bindings_collection_team_2.items())
    )
    output = lib.create_rule_binding_view_model(
        rule_binding_id=rule_binding_id,
        rule_binding_configs=rule_binding_configs,
        dq_summary_table_name=(
            "<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary"
        ),
        configs_cache=test_configs_cache,
        environment="TEST",
        debug=True,
        high_watermark_filter_exists=False,
        bigquery_client=test_bigquery_client,
    )
    # The TEST environment maps to the *_2 placeholders in the expected SQL.
    expected = expected.replace(
        "<your_gcp_project_id>.<your_bigquery_dataset_id>",
        "<your_gcp_project_id_2>.<your_bigquery_dataset_id_2>",
    )
    expected = expected.replace(
        "<your_bigquery_dataset_id>.__TABLES__",
        "<your_bigquery_dataset_id_2>.__TABLES__",
    )
    output = re.sub(RE_NEWLINES, '\n', output).strip()
    output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
    expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
    assert output == expected
def test_render_run_dq_main_sql_bq_native_partitioned(
    self,
    test_rule_bindings_collection_team_8,
    test_default_dataplex_configs_cache,
    test_resources,
    gcp_project_id,
    test_dataplex_metadata_defaults_configs,
    gcp_dataplex_zone_id,
    gcp_dataplex_lake_name,
    gcp_dataplex_bigquery_dataset_id,
    gcp_bq_dataset,
    test_bigquery_client,
):
    """Render the run-DQ main SQL for BigQuery-native partitioned tables and
    compare each rule binding's output against the expected SQL."""
    for rule_binding_id, rule_binding_configs in (
            test_rule_bindings_collection_team_8.items()):
        # These two rule bindings use the default-partitioned expected SQL.
        if rule_binding_id in [
            "T16_URI_BQ_PARTITIONED_EMAIL_DUPLICATE",
            "T17_URI_BQ_PARTITIONED_EMAIL_DUPLICATE",
        ]:
            expected_sql_filename = "bq_native_default_partitioned_sql_expected.sql"
        else:
            expected_sql_filename = "bq_native_partitioned_sql_expected.sql"
        with open(test_resources / expected_sql_filename) as f:
            expected = f.read()
        output = lib.create_rule_binding_view_model(
            rule_binding_id=rule_binding_id,
            rule_binding_configs=rule_binding_configs,
            dq_summary_table_name=(
                "<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary"
            ),
            configs_cache=test_default_dataplex_configs_cache,
            environment="DEV",
            debug=True,
            default_configs=test_dataplex_metadata_defaults_configs,
            bigquery_client=test_bigquery_client,
        )
        # Replace environment-specific identifiers with stable placeholders.
        output = (
            output.replace(gcp_project_id, "<your-gcp-project-id>")
            .replace(gcp_dataplex_bigquery_dataset_id, "<your_bigquery_dataset_id>")
            .replace(gcp_bq_dataset, "<your_bigquery_dataset_id>")
        )
        output = output.replace(rule_binding_id, "<rule_binding_id>")
        output = re.sub(RE_NEWLINES, '\n', output).strip()
        output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
        output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
        expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
        assert output == expected
def test_render_run_dq_main_sql_gcs_partitioned(
    self,
    test_rule_bindings_collection_team_6,
    test_default_dataplex_configs_cache,
    test_resources,
    gcp_project_id,
    test_dataplex_metadata_defaults_configs,
    gcp_dataplex_zone_id,
    gcp_dataplex_lake_name,
    test_bigquery_client,
):
    """Render the run-DQ main SQL for GCS-backed partitioned Dataplex entities
    and compare each rule binding's output against the expected SQL."""
    for rule_binding_id, rule_binding_configs in (
            test_rule_bindings_collection_team_6.items()):
        with open(
            test_resources / "dataplex_gcs_partitioned_metadata_sql_expected.sql"
        ) as f:
            expected = f.read()
        output = lib.create_rule_binding_view_model(
            rule_binding_id=rule_binding_id,
            rule_binding_configs=rule_binding_configs,
            dq_summary_table_name=(
                "<your_gcp_project_id>.<your_bigquery_dataset_id>.dq_summary"
            ),
            configs_cache=test_default_dataplex_configs_cache,
            environment="DEV",
            debug=True,
            default_configs=test_dataplex_metadata_defaults_configs,
            bigquery_client=test_bigquery_client,
        )
        # Replace environment-specific identifiers with stable placeholders.
        # The zone id also appears with '-' converted to '_' in table names.
        output = (
            output.replace(gcp_project_id, "<your-gcp-project-id>")
            .replace(gcp_dataplex_zone_id.replace('-', '_'), "<your_dataplex_zone_name>")
            .replace(gcp_dataplex_zone_id, "<your_dataplex_zone_name>")
            .replace(rule_binding_id, "<rule_binding_id>")
            .replace(gcp_dataplex_lake_name, "<your_dataplex_lake_id>")
        )
        expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip()
        output = re.sub(RE_NEWLINES, '\n', output).strip()
        output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output)
        output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output)
        assert output == expected
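# The render tests above repeat the same masking pipeline before comparing
# against a golden file. A hypothetical consolidation -- not part of this module
# today -- using the module-level RE_* patterns and *_REP placeholders already
# referenced by the tests:
def _normalize_rendered_sql_sketch(sql: str) -> str:
    """Mask run-specific values so rendered SQL is stable across runs."""
    sql = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, sql)  # configs hashsum
    sql = re.sub(RE_ASSET_ID, ASSET_ID_REP, sql)                # dataplex asset id
    return re.sub(RE_NEWLINES, '\n', sql).strip()               # collapse blank runs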