def test_resolve_dataplex_entity_uris(self,
         temp_configs_dir,
         test_dq_dataplex_client,
         test_bigquery_client,
         test_dataplex_metadata_defaults_configs,
         tmp_path):
     try:
         temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
         temp_dir.mkdir(parents=True)
         with working_directory(temp_dir):
             configs_cache = lib.prepare_configs_cache(temp_configs_dir)
             count_1 = configs_cache._cache_db['entities'].count
             target_rule_binding_ids = [
                 row["id"].lower() for row in
                 configs_cache._cache_db.query("select id from rule_bindings")
             ]
             configs_cache.resolve_dataplex_entity_uris(
                 dataplex_client=test_dq_dataplex_client,
                 bigquery_client=test_bigquery_client,
                 default_configs=test_dataplex_metadata_defaults_configs,
                 target_rule_binding_ids=target_rule_binding_ids
             )
             count_2 = configs_cache._cache_db['entities'].count
             assert count_2 > count_1
     finally:
         shutil.rmtree(temp_dir)
 def test_prepare_configs_cache(self, temp_configs_dir, tmp_path):
     try:
         temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_1")
         temp_dir.mkdir(parents=True)
         with working_directory(temp_dir):
             configs_cache = lib.prepare_configs_cache(temp_configs_dir)
             assert type(configs_cache) == DqConfigsCache
             assert configs_cache._cache_db["entities"].exists()
             assert configs_cache._cache_db["rules"].exists()
             assert configs_cache._cache_db["row_filters"].exists()
             assert configs_cache._cache_db["rule_bindings"].exists()
     finally:
         shutil.rmtree(temp_dir)
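
Both tests above switch into a scratch directory through a working_directory helper before building the configs cache. That helper is not shown in this listing; a minimal sketch, assuming it simply changes the process working directory for the duration of the block and restores it afterwards:

import contextlib
import os
from pathlib import Path

@contextlib.contextmanager
def working_directory(path: Path):
    """Temporarily chdir into `path` (hypothetical helper, not from the listing)."""
    previous_cwd = Path.cwd()
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(previous_cwd)
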
Code example #3
    def test_prepare_configs_cache(self, temp_configs_dir, tmp_path):
        config_path = Path(temp_configs_dir)

        temp_dir = Path(tmp_path).joinpath("clouddq_test_lib", "test_prepare_configs_cache")

        try:
            temp_dir.mkdir(parents=True)
            shutil.copytree(config_path, temp_dir / 'configs')

            base_rules = temp_dir / 'configs' / 'rules' / 'base-rules.yml'
            assert os.path.isfile(base_rules)
            with base_rules.open() as f:
                rule_config = yaml.safe_load(f)

            # Add dimension "correctness" to all rules

            for rule_id in rule_config['rules']:
                rule_config['rules'][rule_id]['dimension'] = 'correctness'

            # Rewrite the file
            os.remove(base_rules)
            with open(base_rules, 'w') as f:
                yaml.safe_dump(rule_config, f)

            #  Expect to raise a ValueError because no rule_dimensions are defined:
            with pytest.raises(ValueError):
                lib.prepare_configs_cache(temp_dir)

            # Add the rule dimensions and try again
            rule_config['rule_dimensions'] = ['CORRECTNESS', 'CONFORMITY', 'COMPLETENESS']
            os.remove(base_rules)
            with open(base_rules, 'w') as f:
                yaml.safe_dump(rule_config, f)

            lib.prepare_configs_cache(temp_dir)

        finally:
            shutil.rmtree(temp_dir)
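
For reference, the configuration the test writes back has roughly the shape below; the rule name is illustrative rather than copied from the repository, and only the keys the test touches (a dimension on each rule and the top-level rule_dimensions list) matter. Without the rule_dimensions list, prepare_configs_cache is expected to reject the dimension values; with it, the same configs load cleanly.

import yaml

# Illustrative shape of the rewritten base-rules.yml (rule name is hypothetical).
rewritten_rules = {
    "rules": {
        "EXAMPLE_NOT_NULL": {
            "rule_type": "NOT_NULL",
            "dimension": "correctness",  # added by the loop in the test
        },
    },
    # Adding this list is what makes the second prepare_configs_cache call succeed.
    "rule_dimensions": ["CORRECTNESS", "CONFORMITY", "COMPLETENESS"],
}
print(yaml.safe_dump(rewritten_rules, sort_keys=False))
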
Code example #4
def test_default_dataplex_configs_cache(
        temp_configs_dir, test_dq_dataplex_client,
        test_dataplex_metadata_defaults_configs, tmp_path,
        test_bigquery_client):
    temp_path = Path(tmp_path).joinpath("clouddq_test_configs_cache")
    temp_path.mkdir()
    with working_directory(temp_path):
        configs_cache = prepare_configs_cache(configs_path=temp_configs_dir)
        target_rule_binding_ids = [
            row["id"] for row in configs_cache._cache_db.query(
                "select id from rule_bindings")
        ]
        configs_cache.resolve_dataplex_entity_uris(
            dataplex_client=test_dq_dataplex_client,
            bigquery_client=test_bigquery_client,
            default_configs=test_dataplex_metadata_defaults_configs,
            target_rule_binding_ids=target_rule_binding_ids)
        yield configs_cache
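
Although no decorator appears in this listing, the function yields rather than returns, so it is presumably registered as a pytest fixture. A hypothetical test consuming it could look like this:

def test_entities_resolved(test_default_dataplex_configs_cache):
    # The fixture yields a configs cache whose Dataplex entity URIs have
    # already been resolved, so the entities table should not be empty.
    cache_db = test_default_dataplex_configs_cache._cache_db
    assert cache_db["entities"].count > 0
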
Code example #5
    def test_dq_rule_binding_conflicted_column_id_is_not_escaped_for_sql_statement(self, temp_configs_dir, tmp_path):
        try:
            temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
            temp_dir.mkdir(parents=True)
            with working_directory(temp_dir):
                configs_cache = lib.prepare_configs_cache(temp_configs_dir)
        finally:
            shutil.rmtree(temp_dir)

        dq_rule_binding_dict_with_conflicted_column_id = {
            "entity_id": "TEST_TABLE",
            "column_id": "data",
            "row_filter_id": "NONE",
            "rule_ids": [{"NO_DUPLICATES_IN_COLUMN_GROUPS": {"column_names": "data"}}],
            "metadata": {"key": "value"}
        }

        output = DqRuleBinding.from_dict(
            rule_binding_id="valid",
            kwargs=dq_rule_binding_dict_with_conflicted_column_id
        ).resolve_all_configs_to_dict(configs_cache=configs_cache)
        text = output["rule_configs_dict"]["NO_DUPLICATES_IN_COLUMN_GROUPS"]["rule_sql_expr"]

        expected = """
        |select a.*
        |from data a
        |inner join (
        |  select
        |    data
        |  from data
        |  group by data
        |  having count(*) > 1
        |) duplicates
        |using (data)"""
        assert strip_margin(text.replace(r"\s\s+", " ")) == \
            strip_margin(expected.replace(r"\s\s+", " "))
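
The assertion depends on a strip_margin helper that is not reproduced in this listing. A minimal sketch, assuming it behaves like Scala's stripMargin and removes leading whitespace up to and including a '|' marker on each line:

import re

def strip_margin(text: str) -> str:
    # Replace a newline followed by optional indentation and a '|' with a bare newline.
    return re.sub(r"\n[ \t]*\|", "\n", text)
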
Code example #6
    def test_dq_rule_binding_conflicted_column_id_is_escaped_for_sql_expr(self, temp_configs_dir, tmp_path):
        try:
            temp_dir = Path(tmp_path).joinpath("clouddq_test_configs_cache_2")
            temp_dir.mkdir(parents=True)
            with working_directory(temp_dir):
                configs_cache = lib.prepare_configs_cache(temp_configs_dir)
        finally:
            shutil.rmtree(temp_dir)

        dq_rule_binding_dict_with_conflicted_column_id = {
            "entity_id": "TEST_TABLE",
            "column_id": "data",
            "row_filter_id": "NONE",
            "rule_ids": ["REGEX_VALID_EMAIL"],
            "metadata": {"key": "value"}
        }

        output = DqRuleBinding.from_dict(
            rule_binding_id="valid",
            kwargs=dq_rule_binding_dict_with_conflicted_column_id
        ).resolve_all_configs_to_dict(configs_cache=configs_cache)

        assert output["rule_configs_dict"]["REGEX_VALID_EMAIL"]["rule_sql_expr"] == \
            "REGEXP_CONTAINS( CAST( data.data  AS STRING), '^[^@]+[@]{1}[^@]+$' )"
Code example #7
def test_configs_cache(source_configs_path, tmp_path):
    temp_path = Path(tmp_path).joinpath("clouddq_test_configs_cache")
    temp_path.mkdir()
    with working_directory(temp_path):
        configs_cache = prepare_configs_cache(configs_path=source_configs_path)
        yield configs_cache
Code example #8
def main(  # noqa: C901
    rule_binding_ids: str,
    rule_binding_config_path: str,
    environment_target: Optional[str],
    gcp_project_id: Optional[str],
    gcp_region_id: Optional[str],
    gcp_bq_dataset_id: Optional[str],
    gcp_service_account_key_path: Optional[Path],
    gcp_impersonation_credentials: Optional[str],
    metadata: Optional[str],
    dry_run: bool,
    progress_watermark: bool,
    target_bigquery_summary_table: str,
    intermediate_table_expiration_hours: int,
    num_threads: int,
    debug: bool = False,
    print_sql_queries: bool = False,
    skip_sql_validation: bool = False,
    summary_to_stdout: bool = False,
    enable_experimental_bigquery_entity_uris: bool = True,
    enable_experimental_dataplex_gcs_validation: bool = True,
) -> None:
    """Run RULE_BINDING_IDS from a RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_IDS:
    comma-separated Rule Binding ID(s) containing the
    configurations for the run.

    Set RULE_BINDING_IDS to 'ALL' to run all rule_bindings
    in RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_CONFIG_PATH:
    Path to YAML configs directory containing `rule_bindings`,
    `entities`, `rules`, and `row_filters` YAML config files.

    Usage examples:

    \b
    > python clouddq_executable.zip \\
      T2_DQ_1_EMAIL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --metadata='{"key":"value"}' \\

    \b
    > python clouddq_executable.zip \\
      ALL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --dry_run  \\
      --debug

    """
    if debug:
        logger.setLevel("DEBUG")
        for handler in logger.handlers:
            handler.setLevel(logging.DEBUG)
            logger.debug("Debug logging enabled")

    if not gcp_project_id or not gcp_bq_dataset_id:
        raise ValueError(
            "CLI input must define connection configs using the parameters: "
            "'--gcp_project_id', '--gcp_bq_dataset_id', '--gcp_region_id').")
    bigquery_client = None
    try:
        gcp_credentials = GcpCredentials(
            gcp_project_id=gcp_project_id,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
        )
        # Set-up cloud logging
        add_cloud_logging_handler(logger=json_logger)
        logger.info("Starting CloudDQ run with configs:")
        json_logger.warning(
            json.dumps({"clouddq_run_configs": locals()},
                       cls=JsonEncoderDatetime))
        # Create BigQuery client
        bigquery_client = BigQueryClient(gcp_credentials=gcp_credentials)
        # Prepare dbt runtime
        dbt_runner = DbtRunner(
            environment_target=environment_target,
            gcp_project_id=gcp_project_id,
            gcp_region_id=gcp_region_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            bigquery_client=bigquery_client,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
            intermediate_table_expiration_hours=intermediate_table_expiration_hours,
            num_threads=num_threads,
        )
        dbt_path = dbt_runner.get_dbt_path()
        dbt_rule_binding_views_path = dbt_runner.get_rule_binding_view_path()
        dbt_entity_summary_path = dbt_runner.get_entity_summary_path()

        (
            dbt_profiles_dir,
            environment_target,
        ) = dbt_runner.get_dbt_profiles_dir_and_environment_target(
            gcp_project_id=gcp_project_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            gcp_region_id=gcp_region_id,
            bigquery_client=bigquery_client,
        )

        # Prepare DQ Summary Table
        dq_summary_table_name = get_bigquery_dq_summary_table_name(
            dbt_path=Path(dbt_path),
            dbt_profiles_dir=Path(dbt_profiles_dir),
            environment_target=environment_target,
        )
        logger.info(
            "Writing rule_binding views and intermediate summary "
            f"results to BigQuery dq_summary_table_name: `{dq_summary_table_name}`. "
        )
        dq_summary_table_exists = False
        dq_summary_table_ref = bigquery_client.table_from_string(
            dq_summary_table_name)
        dq_summary_project_id = dq_summary_table_ref.project
        dq_summary_dataset = dq_summary_table_ref.dataset_id
        logger.info(
            f"Using dq_summary_dataset: {dq_summary_project_id}.{dq_summary_dataset}"
        )
        dq_summary_table_exists = bigquery_client.is_table_exists(
            table=dq_summary_table_name, project_id=dq_summary_project_id)
        if not bigquery_client.is_dataset_exists(
                dataset=dq_summary_dataset, project_id=dq_summary_project_id):
            raise AssertionError(
                "Invalid argument to --gcp_bq_dataset_id: "
                f"Dataset {dq_summary_project_id}.{dq_summary_dataset} does not exist. "
            )
        dq_summary_dataset_region = bigquery_client.get_dataset_region(
            dataset=dq_summary_dataset,
            project_id=dq_summary_project_id,
        )
        if gcp_region_id and dq_summary_dataset_region != gcp_region_id:
            raise AssertionError(
                f"GCP region in --gcp_region_id '{gcp_region_id}' "
                f"must be the same as dq_summary_dataset "
                f"'{dq_summary_project_id}.{dq_summary_dataset}' region: "
                f"'{dq_summary_dataset_region}'.")
        bigquery_client.assert_required_columns_exist_in_table(
            table=dq_summary_table_name, project_id=dq_summary_project_id)
        # Check existence of dataset for target BQ table in the selected GCP region
        if target_bigquery_summary_table:
            logger.info("Using target_bigquery_summary_table: "
                        f"`{target_bigquery_summary_table}`. ")
            target_table_ref = bigquery_client.table_from_string(
                target_bigquery_summary_table)
            target_project_id = target_table_ref.project
            target_dataset_id = target_table_ref.dataset_id
            logger.debug(
                f"BigQuery dataset used in --target_bigquery_summary_table: "
                f"{target_project_id}.{target_dataset_id}")
            if not bigquery_client.is_dataset_exists(
                    dataset=target_dataset_id, project_id=target_project_id):
                raise AssertionError(
                    "Invalid argument to --target_bigquery_summary_table: "
                    f"{target_bigquery_summary_table}. "
                    f"Dataset {target_project_id}.{target_dataset_id} does not exist. "
                )
            target_dataset_region = bigquery_client.get_dataset_region(
                dataset=target_dataset_id,
                project_id=target_project_id,
            )
            if gcp_region_id and target_dataset_region != gcp_region_id:
                raise AssertionError(
                    f"GCP region in --gcp_region_id '{gcp_region_id}' "
                    f"must be the same as --target_bigquery_summary_table "
                    f"'{target_project_id}.{target_dataset_id}' region "
                    f"'{target_dataset_region}'.")
            if target_dataset_region != dq_summary_dataset_region:
                raise ValueError(
                    f"GCP region for --gcp_bq_dataset_id "
                    f"'{dq_summary_project_id}.{dq_summary_dataset}': "
                    f"'{dq_summary_dataset_region}' must be the same as "
                    f"GCP region for --target_bigquery_summary_table "
                    f"'{dq_summary_project_id}.{dq_summary_dataset}': "
                    f"'{target_dataset_region}'")
            bigquery_client.assert_required_columns_exist_in_table(
                table=target_bigquery_summary_table,
                project_id=target_project_id)
        else:
            logger.warning(
                "CLI --target_bigquery_summary_table is not set. This will become a required argument in v1.0.0."
            )
        # Log information about --summary_to_stdout
        if summary_to_stdout and target_bigquery_summary_table:
            logger.info(
                "--summary_to_stdout is True. Logging summary results as json to stdout."
            )
        elif summary_to_stdout and not target_bigquery_summary_table:
            logger.warning(
                "--summary_to_stdout is True but --target_bigquery_summary_table is not set. "
                "No summary logs will be logged to stdout.")
        # Load metadata
        metadata = json.loads(metadata)
        # Load Rule Bindings
        configs_path = Path(rule_binding_config_path)
        logger.debug(f"Loading rule bindings from: {configs_path.absolute()}")
        all_rule_bindings = lib.load_rule_bindings_config(Path(configs_path))
        logger.debug(f"all_rule_bindings:\n{pformat(all_rule_bindings)}")
        # Prepare list of Rule Bindings in-scope for run
        target_rule_binding_ids = [
            r.strip().upper() for r in rule_binding_ids.split(",")
        ]
        if (len(target_rule_binding_ids) == 1
                and target_rule_binding_ids[0] == "ALL"):
            target_rule_binding_ids = [
                rule_binding.upper()
                for rule_binding in all_rule_bindings.keys()
            ]
        logger.info(
            f"Preparing SQL for rule bindings: {target_rule_binding_ids}")
        # Load default configs for metadata registries
        registry_defaults: MetadataRegistryDefaults = (
            lib.load_metadata_registry_default_configs(Path(configs_path)))
        default_dataplex_projects = registry_defaults.get_dataplex_registry_defaults(
            "projects")
        default_dataplex_locations = registry_defaults.get_dataplex_registry_defaults(
            "locations")
        default_dataplex_lakes = registry_defaults.get_dataplex_registry_defaults(
            "lakes")
        dataplex_registry_defaults = registry_defaults.get_dataplex_registry_defaults(
        )
        # Prepare Dataplex Client from metadata registry defaults
        dataplex_client = CloudDqDataplexClient(
            gcp_credentials=gcp_credentials,
            gcp_project_id=default_dataplex_projects,
            gcp_dataplex_lake_name=default_dataplex_lakes,
            gcp_dataplex_region=default_dataplex_locations,
        )
        logger.debug("Created CloudDqDataplexClient with arguments: "
                     f"{gcp_credentials}, "
                     f"{default_dataplex_projects}, "
                     f"{default_dataplex_lakes}, "
                     f"{default_dataplex_locations}, ")
        # Load all configs into a local cache
        configs_cache = lib.prepare_configs_cache(
            configs_path=Path(configs_path))
        configs_cache.resolve_dataplex_entity_uris(
            dataplex_client=dataplex_client,
            bigquery_client=bigquery_client,
            default_configs=dataplex_registry_defaults,
            target_rule_binding_ids=target_rule_binding_ids,
        )
        # Get Entities for entity-level summary views
        target_entity_summary_configs: dict = (
            configs_cache.get_entities_configs_from_rule_bindings(
                target_rule_binding_ids=target_rule_binding_ids, ))
        # Create Rule_binding views
        for rule_binding_id in target_rule_binding_ids:
            rule_binding_configs = all_rule_bindings.get(rule_binding_id, None)
            assert_not_none_or_empty(
                rule_binding_configs,
                f"Target Rule Binding Id: {rule_binding_id} not found "
                f"in config path {configs_path.absolute()}.",
            )
            if debug:
                logger.debug(
                    f"Creating sql string from configs for rule binding: "
                    f"{rule_binding_id}")
                logger.debug(
                    f"Rule binding config json:\n{pformat(rule_binding_configs)}"
                )
            high_watermark_filter_exists = False

            sql_string = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name=dq_summary_table_name,
                configs_cache=configs_cache,
                environment=environment_target,
                metadata=metadata,
                debug=print_sql_queries,
                progress_watermark=progress_watermark,
                default_configs=dataplex_registry_defaults,
                dq_summary_table_exists=dq_summary_table_exists,
                high_watermark_filter_exists=high_watermark_filter_exists,
                bigquery_client=bigquery_client,
            )
            print("sql string is ***")
            print(sql_string)
            if not skip_sql_validation:
                logger.debug(
                    f"Validating generated SQL code for rule binding "
                    f"{rule_binding_id} using BigQuery dry-run client.", )
                bigquery_client.check_query_dry_run(query_string=sql_string)
            logger.debug(
                f"*** Writing sql to {dbt_rule_binding_views_path.absolute()}/"
                f"{rule_binding_id}.sql", )
            lib.write_sql_string_as_dbt_model(
                model_id=rule_binding_id,
                sql_string=sql_string,
                dbt_model_path=dbt_rule_binding_views_path,
            )
        # clean up old rule_bindings
        for view in dbt_rule_binding_views_path.glob("*.sql"):
            if view.stem.upper() not in target_rule_binding_ids:
                view.unlink()
        logger.info(
            f"target_entity_summary_configs:\n{pformat(target_entity_summary_configs)}"
        )
        # create entity-level summary table models
        for (
                entity_table_id,
                entity_configs_dict,
        ) in target_entity_summary_configs.items():
            rule_binding_ids_list = entity_configs_dict.get(
                "rule_binding_ids_list")
            assert_not_none_or_empty(
                rule_binding_ids_list,
                f"Internal Error: no rule_binding_id found for entity_table_id {entity_table_id}.",
            )
            sql_string = lib.create_entity_summary_model(
                entity_table_id=entity_table_id,
                entity_target_rule_binding_configs=entity_configs_dict,
                gcp_project_id=gcp_project_id,
                gcp_bq_dataset_id=gcp_bq_dataset_id,
                debug=print_sql_queries,
            )
            logger.debug(
                f"*** Writing sql to {dbt_entity_summary_path.absolute()}/"
                f"{entity_table_id}.sql", )
            lib.write_sql_string_as_dbt_model(
                model_id=entity_table_id,
                sql_string=sql_string,
                dbt_model_path=dbt_entity_summary_path,
            )
        # clean up old entity_summary views
        for view in dbt_entity_summary_path.glob("*.sql"):
            if view.stem not in target_entity_summary_configs.keys():
                view.unlink()
        # create dbt configs json for the main.sql loop and run dbt
        configs = {
            "entity_dq_statistics_models":
            list(target_entity_summary_configs.keys()),
        }
        dbt_runner.run(
            configs=configs,
            debug=debug,
            dry_run=dry_run,
        )
        if not dry_run:
            if target_bigquery_summary_table:
                if target_bigquery_summary_table == dq_summary_table_name:
                    raise ValueError(
                        f"The target bigquery summary table name `{target_bigquery_summary_table}` "
                        f"cannot be same as dq summary table name `{dq_summary_table_name}` which "
                        f"is reserved for storing the intermediate results used by clouddq "
                        f"for further processing in case of incremental validation."
                    )
                else:
                    invocation_id = get_dbt_invocation_id(dbt_path)
                    logger.info(f"dbt invocation id for current execution "
                                f"is {invocation_id}")
                    partition_date = datetime.now(timezone.utc).date()
                    target_table = TargetTable(invocation_id, bigquery_client)
                    num_rows = target_table.write_to_target_bq_table(
                        partition_date,
                        target_bigquery_summary_table,
                        dq_summary_table_name,
                        summary_to_stdout,
                    )
                    json_logger.info(
                        json.dumps(
                            {
                                "clouddq_job_completion_config": {
                                    "invocation_id": invocation_id,
                                    "target_bigquery_summary_table":
                                    target_bigquery_summary_table,
                                    "summary_to_stdout": summary_to_stdout,
                                    "target_rule_binding_ids":
                                    target_rule_binding_ids,
                                    "partition_date": partition_date,
                                    "num_rows_loaded_to_target_table":
                                    num_rows,
                                }
                            },
                            cls=JsonEncoderDatetime,
                        ))
                    logger.info("Job completed successfully.")
            else:
                raise ValueError(
                    "'--target_bigquery_summary_table' was not provided. "
                    "It is needed to append the dq summary results to the "
                    "provided target bigquery table.")
    except Exception as error:
        logger.error(error, exc_info=True)
        json_logger.error(error, exc_info=True)
        raise SystemExit(f"\n\n{error}")
    finally:
        if bigquery_client:
            bigquery_client.close_connection()
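
main() is normally driven by the CLI shown in its docstring, but since the full signature appears above, a programmatic dry-run call would look roughly like the sketch below. Every value is a placeholder; nothing here is taken from a real project.

main(
    rule_binding_ids="ALL",
    rule_binding_config_path="configs/",
    environment_target=None,
    gcp_project_id="example-project",             # placeholder
    gcp_region_id="EU",                           # placeholder
    gcp_bq_dataset_id="example_clouddq_dataset",  # placeholder
    gcp_service_account_key_path=None,
    gcp_impersonation_credentials=None,
    metadata='{"team": "data-platform"}',         # must be a JSON string
    dry_run=True,
    progress_watermark=True,
    target_bigquery_summary_table="example-project.example_clouddq_dataset.dq_summary_target",  # placeholder
    intermediate_table_expiration_hours=24,
    num_threads=8,
    debug=True,
)
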