def test_resolve_dataplex_entity_uris(
    self,
    temp_configs_dir,
    test_dq_dataplex_client,
    test_bigquery_client,
    test_dataplex_metadata_defaults_configs,
    tmp_path,
):
    try:
        temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
        temp_dir.mkdir(parents=True)
        with working_directory(temp_dir):
            configs_cache = lib.prepare_configs_cache(temp_configs_dir)
            count_1 = configs_cache._cache_db["entities"].count
            target_rule_binding_ids = [
                row["id"].lower()
                for row in configs_cache._cache_db.query("select id from rule_bindings")
            ]
            configs_cache.resolve_dataplex_entity_uris(
                dataplex_client=test_dq_dataplex_client,
                bigquery_client=test_bigquery_client,
                default_configs=test_dataplex_metadata_defaults_configs,
                target_rule_binding_ids=target_rule_binding_ids,
            )
            count_2 = configs_cache._cache_db["entities"].count
            assert count_2 > count_1
    finally:
        shutil.rmtree(temp_dir)

def test_prepare_configs_cache(self, temp_configs_dir, tmp_path): try: temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_1") temp_dir.mkdir(parents=True) with working_directory(temp_dir): configs_cache = lib.prepare_configs_cache(temp_configs_dir) assert type(configs_cache) == DqConfigsCache assert configs_cache._cache_db["entities"].exists() assert configs_cache._cache_db["rules"].exists() assert configs_cache._cache_db["row_filters"].exists() assert configs_cache._cache_db["rule_bindings"].exists() finally: shutil.rmtree(temp_dir)
def test_prepare_configs_cache(self, temp_configs_dir, tmp_path):
    config_path = Path(temp_configs_dir)
    temp_dir = Path(tmp_path).joinpath("clouddq_test_lib", "test_prepare_configs_cache")
    try:
        temp_dir.mkdir(parents=True)
        shutil.copytree(config_path, temp_dir / "configs")
        base_rules = temp_dir / "configs" / "rules" / "base-rules.yml"
        assert os.path.isfile(base_rules)
        with base_rules.open() as f:
            rule_config = yaml.safe_load(f)
        # Add dimension "correctness" to all rules
        for rule_id in rule_config["rules"]:
            rule_config["rules"][rule_id]["dimension"] = "correctness"
        # Rewrite the file
        os.remove(base_rules)
        with open(base_rules, "w") as f:
            yaml.safe_dump(rule_config, f)
        # Expect to raise a ValueError because no rule_dimensions are defined
        with pytest.raises(ValueError):
            lib.prepare_configs_cache(temp_dir)
        # Add the rule dimensions and try again
        rule_config["rule_dimensions"] = ["CORRECTNESS", "CONFORMITY", "COMPLETENESS"]
        os.remove(base_rules)
        with open(base_rules, "w") as f:
            yaml.safe_dump(rule_config, f)
        lib.prepare_configs_cache(temp_dir)
    finally:
        shutil.rmtree(temp_dir)

def test_default_dataplex_configs_cache(
    temp_configs_dir,
    test_dq_dataplex_client,
    test_dataplex_metadata_defaults_configs,
    tmp_path,
    test_bigquery_client,
):
    temp_path = Path(tmp_path).joinpath("clouddq_test_configs_cache")
    temp_path.mkdir()
    with working_directory(temp_path):
        configs_cache = prepare_configs_cache(configs_path=temp_configs_dir)
        target_rule_binding_ids = [
            row["id"]
            for row in configs_cache._cache_db.query("select id from rule_bindings")
        ]
        configs_cache.resolve_dataplex_entity_uris(
            dataplex_client=test_dq_dataplex_client,
            bigquery_client=test_bigquery_client,
            default_configs=test_dataplex_metadata_defaults_configs,
            target_rule_binding_ids=target_rule_binding_ids,
        )
        yield configs_cache

def test_dq_rule_binding_conflicted_column_id_is_not_escaped_for_sql_statement(
    self, temp_configs_dir, tmp_path
):
    try:
        temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
        temp_dir.mkdir(parents=True)
        with working_directory(temp_dir):
            configs_cache = lib.prepare_configs_cache(temp_configs_dir)
    finally:
        shutil.rmtree(temp_dir)
    dq_rule_binding_dict_with_conflicted_column_id = {
        "entity_id": "TEST_TABLE",
        "column_id": "data",
        "row_filter_id": "NONE",
        "rule_ids": [{"NO_DUPLICATES_IN_COLUMN_GROUPS": {"column_names": "data"}}],
        "metadata": {"key": "value"},
    }
    output = DqRuleBinding.from_dict(
        rule_binding_id="valid",
        kwargs=dq_rule_binding_dict_with_conflicted_column_id,
    ).resolve_all_configs_to_dict(configs_cache=configs_cache)
    text = output["rule_configs_dict"]["NO_DUPLICATES_IN_COLUMN_GROUPS"]["rule_sql_expr"]
    expected = """
        |select a.*
        |from data a
        |inner join (
        | select
        | data
        | from data
        | group by data
        | having count(*) > 1
        |) duplicates
        |using (data)"""
    assert strip_margin(text.replace(r"\s\s+", " ")) == \
        strip_margin(expected.replace(r"\s\s+", " "))

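# The assertion above compares multi-line SQL via a strip_margin helper.
# A minimal sketch of such a helper is shown below for reference; it is an
# assumption about the utility's behavior (drop leading whitespace up to and
# including the '|' margin marker on each line) and may differ from the
# project's actual implementation.
import re


def strip_margin(text: str) -> str:
    """Remove leading whitespace and the '|' margin marker from each line."""
    return re.sub(r"\n[ \t]*\|", "\n", text)
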
def test_dq_rule_binding_conflicted_column_id_is_escaped_for_sql_expr(
    self, temp_configs_dir, tmp_path
):
    try:
        temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
        temp_dir.mkdir(parents=True)
        with working_directory(temp_dir):
            configs_cache = lib.prepare_configs_cache(temp_configs_dir)
    finally:
        shutil.rmtree(temp_dir)
    dq_rule_binding_dict_with_conflicted_column_id = {
        "entity_id": "TEST_TABLE",
        "column_id": "data",
        "row_filter_id": "NONE",
        "rule_ids": ["REGEX_VALID_EMAIL"],
        "metadata": {"key": "value"},
    }
    output = DqRuleBinding.from_dict(
        rule_binding_id="valid",
        kwargs=dq_rule_binding_dict_with_conflicted_column_id,
    ).resolve_all_configs_to_dict(configs_cache=configs_cache)
    assert output["rule_configs_dict"]["REGEX_VALID_EMAIL"]["rule_sql_expr"] == \
        "REGEXP_CONTAINS( CAST( data.data AS STRING), '^[^@]+[@]{1}[^@]+$' )"

def test_configs_cache(source_configs_path, tmp_path):
    temp_path = Path(tmp_path).joinpath("clouddq_test_configs_cache")
    temp_path.mkdir()
    with working_directory(temp_path):
        configs_cache = prepare_configs_cache(configs_path=source_configs_path)
        yield configs_cache

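# The tests and fixtures above change into a scratch directory so that the
# configs cache file is created under tmp_path rather than the repository root.
# Below is a minimal sketch of the working_directory context manager they
# assume (a simple chdir-based helper); the test suite's actual helper may
# differ.
import os
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def working_directory(path: Path):
    """Temporarily change the current working directory to `path`."""
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(previous_dir)
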
def main(  # noqa: C901
    rule_binding_ids: str,
    rule_binding_config_path: str,
    environment_target: Optional[str],
    gcp_project_id: Optional[str],
    gcp_region_id: Optional[str],
    gcp_bq_dataset_id: Optional[str],
    gcp_service_account_key_path: Optional[Path],
    gcp_impersonation_credentials: Optional[str],
    metadata: Optional[str],
    dry_run: bool,
    progress_watermark: bool,
    target_bigquery_summary_table: str,
    intermediate_table_expiration_hours: int,
    num_threads: int,
    debug: bool = False,
    print_sql_queries: bool = False,
    skip_sql_validation: bool = False,
    summary_to_stdout: bool = False,
    enable_experimental_bigquery_entity_uris: bool = True,
    enable_experimental_dataplex_gcs_validation: bool = True,
) -> None:
    """Run RULE_BINDING_IDS from a RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_IDS: comma-separated Rule Binding ID(s) containing
    the configurations for the run. Set RULE_BINDING_IDS to 'ALL' to
    run all rule_bindings in RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_CONFIG_PATH: Path to YAML configs directory containing
    `rule_bindings`, `entities`, `rules`, and `row_filters` YAML config files.

    Usage examples:

    \b
    > python clouddq_executable.zip \\
      T2_DQ_1_EMAIL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --metadata='{"key":"value"}'

    \b
    > python clouddq_executable.zip \\
      ALL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --dry_run \\
      --debug
    """
    if debug:
        logger.setLevel("DEBUG")
        for handler in logger.handlers:
            handler.setLevel(logging.DEBUG)
        logger.debug("Debug logging enabled")
    if not gcp_project_id or not gcp_bq_dataset_id:
        raise ValueError(
            "CLI input must define connection configs using the parameters: "
            "'--gcp_project_id', '--gcp_bq_dataset_id', '--gcp_region_id'."
        )
    bigquery_client = None
    try:
        gcp_credentials = GcpCredentials(
            gcp_project_id=gcp_project_id,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
        )
        # Set up cloud logging
        add_cloud_logging_handler(logger=json_logger)
        logger.info("Starting CloudDQ run with configs:")
        json_logger.warning(
            json.dumps({"clouddq_run_configs": locals()}, cls=JsonEncoderDatetime)
        )
        # Create BigQuery client
        bigquery_client = BigQueryClient(gcp_credentials=gcp_credentials)
        # Prepare dbt runtime
        dbt_runner = DbtRunner(
            environment_target=environment_target,
            gcp_project_id=gcp_project_id,
            gcp_region_id=gcp_region_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            bigquery_client=bigquery_client,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
            intermediate_table_expiration_hours=intermediate_table_expiration_hours,
            num_threads=num_threads,
        )
        dbt_path = dbt_runner.get_dbt_path()
        dbt_rule_binding_views_path = dbt_runner.get_rule_binding_view_path()
        dbt_entity_summary_path = dbt_runner.get_entity_summary_path()
        (
            dbt_profiles_dir,
            environment_target,
        ) = dbt_runner.get_dbt_profiles_dir_and_environment_target(
            gcp_project_id=gcp_project_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            gcp_region_id=gcp_region_id,
            bigquery_client=bigquery_client,
        )
        # Prepare DQ Summary Table
        dq_summary_table_name = get_bigquery_dq_summary_table_name(
            dbt_path=Path(dbt_path),
            dbt_profiles_dir=Path(dbt_profiles_dir),
            environment_target=environment_target,
        )
        logger.info(
            "Writing rule_binding views and intermediate summary "
            f"results to BigQuery dq_summary_table_name: `{dq_summary_table_name}`. "
        )
        dq_summary_table_exists = False
        dq_summary_table_ref = bigquery_client.table_from_string(dq_summary_table_name)
        dq_summary_project_id = dq_summary_table_ref.project
        dq_summary_dataset = dq_summary_table_ref.dataset_id
        logger.info(
            f"Using dq_summary_dataset: {dq_summary_project_id}.{dq_summary_dataset}"
        )
        dq_summary_table_exists = bigquery_client.is_table_exists(
            table=dq_summary_table_name, project_id=dq_summary_project_id
        )
        if not bigquery_client.is_dataset_exists(
            dataset=dq_summary_dataset, project_id=dq_summary_project_id
        ):
            raise AssertionError(
                "Invalid argument to --gcp_bq_dataset_id: "
                f"Dataset {dq_summary_project_id}.{dq_summary_dataset} does not exist. "
            )
        dq_summary_dataset_region = bigquery_client.get_dataset_region(
            dataset=dq_summary_dataset,
            project_id=dq_summary_project_id,
        )
        if gcp_region_id and dq_summary_dataset_region != gcp_region_id:
            raise AssertionError(
                f"GCP region in --gcp_region_id '{gcp_region_id}' "
                f"must be the same as dq_summary_dataset "
                f"'{dq_summary_project_id}.{dq_summary_dataset}' region: "
                f"'{dq_summary_dataset_region}'."
            )
        bigquery_client.assert_required_columns_exist_in_table(
            table=dq_summary_table_name, project_id=dq_summary_project_id
        )
        # Check existence of dataset for target BQ table in the selected GCP region
        if target_bigquery_summary_table:
            logger.info(
                "Using target_bigquery_summary_table: "
                f"`{target_bigquery_summary_table}`. "
            )
            target_table_ref = bigquery_client.table_from_string(
                target_bigquery_summary_table
            )
            target_project_id = target_table_ref.project
            target_dataset_id = target_table_ref.dataset_id
            logger.debug(
                f"BigQuery dataset used in --target_bigquery_summary_table: "
                f"{target_project_id}.{target_dataset_id}"
            )
            if not bigquery_client.is_dataset_exists(
                dataset=target_dataset_id, project_id=target_project_id
            ):
                raise AssertionError(
                    "Invalid argument to --target_bigquery_summary_table: "
                    f"{target_bigquery_summary_table}. "
                    f"Dataset {target_project_id}.{target_dataset_id} does not exist. "
                )
            target_dataset_region = bigquery_client.get_dataset_region(
                dataset=target_dataset_id,
                project_id=target_project_id,
            )
            if gcp_region_id and target_dataset_region != gcp_region_id:
                raise AssertionError(
                    f"GCP region in --gcp_region_id '{gcp_region_id}' "
                    f"must be the same as --target_bigquery_summary_table "
                    f"'{target_project_id}.{target_dataset_id}' region "
                    f"'{target_dataset_region}'."
                )
            if target_dataset_region != dq_summary_dataset_region:
                raise ValueError(
                    f"GCP region for --gcp_bq_dataset_id "
                    f"'{dq_summary_project_id}.{dq_summary_dataset}': "
                    f"'{dq_summary_dataset_region}' must be the same as "
                    f"GCP region for --target_bigquery_summary_table "
                    f"'{target_project_id}.{target_dataset_id}': "
                    f"'{target_dataset_region}'."
                )
            bigquery_client.assert_required_columns_exist_in_table(
                table=target_bigquery_summary_table, project_id=target_project_id
            )
        else:
            logger.warning(
                "CLI --target_bigquery_summary_table is not set. "
                "This will become a required argument in v1.0.0."
            )
        # Log information about --summary_to_stdout
        if summary_to_stdout and target_bigquery_summary_table:
            logger.info(
                "--summary_to_stdout is True. "
                "Logging summary results as json to stdout."
            )
        elif summary_to_stdout and not target_bigquery_summary_table:
            logger.warning(
                "--summary_to_stdout is True but --target_bigquery_summary_table "
                "is not set. No summary logs will be logged to stdout."
            )
        # Load metadata
        metadata = json.loads(metadata)
        # Load Rule Bindings
        configs_path = Path(rule_binding_config_path)
        logger.debug(f"Loading rule bindings from: {configs_path.absolute()}")
        all_rule_bindings = lib.load_rule_bindings_config(Path(configs_path))
        print(all_rule_bindings)
        # Prepare list of Rule Bindings in-scope for run
        target_rule_binding_ids = [
            r.strip().upper() for r in rule_binding_ids.split(",")
        ]
        if len(target_rule_binding_ids) == 1 and target_rule_binding_ids[0] == "ALL":
            target_rule_binding_ids = [
                rule_binding.upper() for rule_binding in all_rule_bindings.keys()
            ]
        logger.info(f"Preparing SQL for rule bindings: {target_rule_binding_ids}")
        # Load default configs for metadata registries
        registry_defaults: MetadataRegistryDefaults = (
            lib.load_metadata_registry_default_configs(Path(configs_path))
        )
        default_dataplex_projects = registry_defaults.get_dataplex_registry_defaults(
            "projects"
        )
        default_dataplex_locations = registry_defaults.get_dataplex_registry_defaults(
            "locations"
        )
        default_dataplex_lakes = registry_defaults.get_dataplex_registry_defaults(
            "lakes"
        )
        dataplex_registry_defaults = registry_defaults.get_dataplex_registry_defaults()
        # Prepare Dataplex Client from metadata registry defaults
        dataplex_client = CloudDqDataplexClient(
            gcp_credentials=gcp_credentials,
            gcp_project_id=default_dataplex_projects,
            gcp_dataplex_lake_name=default_dataplex_lakes,
            gcp_dataplex_region=default_dataplex_locations,
        )
        logger.debug(
            "Created CloudDqDataplexClient with arguments: "
            f"{gcp_credentials}, "
            f"{default_dataplex_projects}, "
            f"{default_dataplex_lakes}, "
            f"{default_dataplex_locations}, "
        )
        # Load all configs into a local cache
        configs_cache = lib.prepare_configs_cache(configs_path=Path(configs_path))
        configs_cache.resolve_dataplex_entity_uris(
            dataplex_client=dataplex_client,
            bigquery_client=bigquery_client,
            default_configs=dataplex_registry_defaults,
            target_rule_binding_ids=target_rule_binding_ids,
        )
        # Get Entities for entity-level summary views
        target_entity_summary_configs: dict = (
            configs_cache.get_entities_configs_from_rule_bindings(
                target_rule_binding_ids=target_rule_binding_ids,
            )
        )
        # Create rule_binding views
        for rule_binding_id in target_rule_binding_ids:
            rule_binding_configs = all_rule_bindings.get(rule_binding_id, None)
            assert_not_none_or_empty(
                rule_binding_configs,
                f"Target Rule Binding Id: {rule_binding_id} not found "
                f"in config path {configs_path.absolute()}.",
            )
            if debug:
                logger.debug(
                    f"Creating sql string from configs for rule binding: "
                    f"{rule_binding_id}"
                )
                logger.debug(
                    f"Rule binding config json:\n{pformat(rule_binding_configs)}"
                )
            high_watermark_filter_exists = False
            sql_string = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name=dq_summary_table_name,
                configs_cache=configs_cache,
                environment=environment_target,
                metadata=metadata,
                debug=print_sql_queries,
                progress_watermark=progress_watermark,
                default_configs=dataplex_registry_defaults,
                dq_summary_table_exists=dq_summary_table_exists,
                high_watermark_filter_exists=high_watermark_filter_exists,
                bigquery_client=bigquery_client,
            )
            print("sql string is ***")
            print(sql_string)
            if not skip_sql_validation:
                logger.debug(
                    f"Validating generated SQL code for rule binding "
                    f"{rule_binding_id} using BigQuery dry-run client.",
                )
                bigquery_client.check_query_dry_run(query_string=sql_string)
            logger.debug(
                f"*** Writing sql to {dbt_rule_binding_views_path.absolute()}/"
                f"{rule_binding_id}.sql",
            )
            lib.write_sql_string_as_dbt_model(
                model_id=rule_binding_id,
                sql_string=sql_string,
                dbt_model_path=dbt_rule_binding_views_path,
            )
        # clean up old rule_bindings
        for view in dbt_rule_binding_views_path.glob("*.sql"):
            if view.stem.upper() not in target_rule_binding_ids:
                view.unlink()
        logger.info(
            f"target_entity_summary_configs:\n{pformat(target_entity_summary_configs)}"
        )
        # create entity-level summary table models
        for (
            entity_table_id,
            entity_configs_dict,
        ) in target_entity_summary_configs.items():
            rule_binding_ids_list = entity_configs_dict.get("rule_binding_ids_list")
            assert_not_none_or_empty(
                rule_binding_ids_list,
                f"Internal Error: no rule_binding_id found "
                f"for entity_table_id {entity_table_id}.",
            )
            sql_string = lib.create_entity_summary_model(
                entity_table_id=entity_table_id,
                entity_target_rule_binding_configs=entity_configs_dict,
                gcp_project_id=gcp_project_id,
                gcp_bq_dataset_id=gcp_bq_dataset_id,
                debug=print_sql_queries,
            )
            logger.debug(
                f"*** Writing sql to {dbt_entity_summary_path.absolute()}/"
                f"{entity_table_id}.sql",
            )
            lib.write_sql_string_as_dbt_model(
                model_id=entity_table_id,
                sql_string=sql_string,
                dbt_model_path=dbt_entity_summary_path,
            )
        # clean up old entity_summary views
        for view in dbt_entity_summary_path.glob("*.sql"):
            if view.stem not in target_entity_summary_configs.keys():
                view.unlink()
        # create dbt configs json for the main.sql loop and run dbt
        configs = {
            "entity_dq_statistics_models": list(target_entity_summary_configs.keys()),
        }
        dbt_runner.run(
            configs=configs,
            debug=debug,
            dry_run=dry_run,
        )
        if not dry_run:
            if target_bigquery_summary_table:
                if target_bigquery_summary_table == dq_summary_table_name:
                    raise ValueError(
                        f"The target BigQuery summary table name "
                        f"`{target_bigquery_summary_table}` cannot be the same as "
                        f"the dq summary table name `{dq_summary_table_name}`, which "
                        f"is reserved for storing the intermediate results used by "
                        f"clouddq for further processing in case of incremental "
                        f"validation."
                    )
                else:
                    invocation_id = get_dbt_invocation_id(dbt_path)
                    logger.info(
                        f"dbt invocation id for current execution is {invocation_id}"
                    )
                    partition_date = datetime.now(timezone.utc).date()
                    target_table = TargetTable(invocation_id, bigquery_client)
                    num_rows = target_table.write_to_target_bq_table(
                        partition_date,
                        target_bigquery_summary_table,
                        dq_summary_table_name,
                        summary_to_stdout,
                    )
                    json_logger.info(
                        json.dumps(
                            {
                                "clouddq_job_completion_config": {
                                    "invocation_id": invocation_id,
                                    "target_bigquery_summary_table": target_bigquery_summary_table,
                                    "summary_to_stdout": summary_to_stdout,
                                    "target_rule_binding_ids": target_rule_binding_ids,
                                    "partition_date": partition_date,
                                    "num_rows_loaded_to_target_table": num_rows,
                                }
                            },
                            cls=JsonEncoderDatetime,
                        )
                    )
                    logger.info("Job completed successfully.")
            else:
                raise ValueError(
                    "'--target_bigquery_summary_table' was not provided. "
                    "It is needed to append the dq summary results to the "
                    "provided target bigquery table."
                )
    except Exception as error:
        logger.error(error, exc_info=True)
        json_logger.error(error, exc_info=True)
        raise SystemExit(f"\n\n{error}")
    finally:
        if bigquery_client:
            bigquery_client.close_connection()