def test_all_rule_bindings_collections(self, source_configs_path):
    """Load all rule binding configs found under the source configs directory."""
    return lib.load_rule_bindings_config(configs_path=source_configs_path)

def test_rule_bindings_collection_team_3(self, source_configs_path):
    """Load the team-3 rule bindings YAML file."""
    return lib.load_rule_bindings_config(
        source_configs_path / "rule_bindings" / "team-3-rule-bindings.yml")
Example #3
def test_rule_bindings_collection_team_8(self, temp_configs_dir):
    """Load the team-8 rule bindings YAML file from the temporary configs directory."""
    return lib.load_rule_bindings_config(
        Path(temp_configs_dir / "rule_bindings/team-8-rule-bindings.yml")
    )
Example #4
def test_rule_bindings_collection_from_configs_file(self, temp_configs_from_file):
    """Load rule bindings from a single standalone configs file."""
    return lib.load_rule_bindings_config(
        Path(temp_configs_from_file)
    )
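The examples above take pytest fixtures (`source_configs_path`, `temp_configs_dir`, `temp_configs_from_file`) that are not shown in this listing. A minimal sketch of how the first two might be defined, assuming the YAML configs live under a local `tests/resources/configs` directory (the directory layout and fixture bodies are assumptions, not part of the original code):

import shutil
from pathlib import Path

import pytest


@pytest.fixture(scope="session")
def source_configs_path() -> Path:
    # Hypothetical location of the YAML configs exercised by the examples above.
    return Path("tests/resources/configs")


@pytest.fixture
def temp_configs_dir(source_configs_path, tmp_path) -> Path:
    # Copy the configs into a pytest-managed temp dir so tests can modify them safely.
    target = tmp_path / "configs"
    shutil.copytree(source_configs_path, target)
    return target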
Example #5
def main(  # noqa: C901
    rule_binding_ids: str,
    rule_binding_config_path: str,
    environment_target: Optional[str],
    gcp_project_id: Optional[str],
    gcp_region_id: Optional[str],
    gcp_bq_dataset_id: Optional[str],
    gcp_service_account_key_path: Optional[Path],
    gcp_impersonation_credentials: Optional[str],
    metadata: Optional[str],
    dry_run: bool,
    progress_watermark: bool,
    target_bigquery_summary_table: str,
    intermediate_table_expiration_hours: int,
    num_threads: int,
    debug: bool = False,
    print_sql_queries: bool = False,
    skip_sql_validation: bool = False,
    summary_to_stdout: bool = False,
    enable_experimental_bigquery_entity_uris: bool = True,
    enable_experimental_dataplex_gcs_validation: bool = True,
) -> None:
    """Run RULE_BINDING_IDS from a RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_IDS:
    comma-separated Rule Binding ID(s) containing the
    configurations for the run.

    Set RULE_BINDING_IDS to 'ALL' to run all rule_bindings
    in RULE_BINDING_CONFIG_PATH.

    RULE_BINDING_CONFIG_PATH:
    Path to YAML configs directory containing `rule_bindings`,
    `entities`, `rules`, and `row_filters` YAML config files.

    Usage examples:

    \b
    > python clouddq_executable.zip \\
      T2_DQ_1_EMAIL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --metadata='{"key":"value"}'

    \b
    > python clouddq_executable.zip \\
      ALL \\
      configs/ \\
      --gcp_project_id="${GOOGLE_CLOUD_PROJECT}" \\
      --gcp_bq_dataset_id="${CLOUDDQ_BIGQUERY_DATASET}" \\
      --target_bigquery_summary_table="${CLOUDDQ_TARGET_BIGQUERY_TABLE}" \\
      --dry_run  \\
      --debug

    """
    if debug:
        logger.setLevel("DEBUG")
        for handler in logger.handlers:
            handler.setLevel(logging.DEBUG)
        logger.debug("Debug logging enabled")

    if not gcp_project_id or not gcp_bq_dataset_id:
        raise ValueError(
            "CLI input must define connection configs using the parameters: "
            "'--gcp_project_id', '--gcp_bq_dataset_id', '--gcp_region_id').")
    bigquery_client = None
    try:
        gcp_credentials = GcpCredentials(
            gcp_project_id=gcp_project_id,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
        )
        # Set-up cloud logging
        add_cloud_logging_handler(logger=json_logger)
        logger.info("Starting CloudDQ run with configs:")
        json_logger.warning(
            json.dumps({"clouddq_run_configs": locals()},
                       cls=JsonEncoderDatetime))
        # Create BigQuery client
        bigquery_client = BigQueryClient(gcp_credentials=gcp_credentials)
        # Prepare dbt runtime
        dbt_runner = DbtRunner(
            environment_target=environment_target,
            gcp_project_id=gcp_project_id,
            gcp_region_id=gcp_region_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            bigquery_client=bigquery_client,
            gcp_service_account_key_path=gcp_service_account_key_path,
            gcp_impersonation_credentials=gcp_impersonation_credentials,
            intermediate_table_expiration_hours=intermediate_table_expiration_hours,
            num_threads=num_threads,
        )
        dbt_path = dbt_runner.get_dbt_path()
        dbt_rule_binding_views_path = dbt_runner.get_rule_binding_view_path()
        dbt_entity_summary_path = dbt_runner.get_entity_summary_path()

        (
            dbt_profiles_dir,
            environment_target,
        ) = dbt_runner.get_dbt_profiles_dir_and_environment_target(
            gcp_project_id=gcp_project_id,
            gcp_bq_dataset_id=gcp_bq_dataset_id,
            gcp_region_id=gcp_region_id,
            bigquery_client=bigquery_client,
        )

        # Prepare DQ Summary Table
        dq_summary_table_name = get_bigquery_dq_summary_table_name(
            dbt_path=Path(dbt_path),
            dbt_profiles_dir=Path(dbt_profiles_dir),
            environment_target=environment_target,
        )
        logger.info(
            "Writing rule_binding views and intermediate summary "
            f"results to BigQuery dq_summary_table_name: `{dq_summary_table_name}`. "
        )
        dq_summary_table_exists = False
        dq_summary_table_ref = bigquery_client.table_from_string(
            dq_summary_table_name)
        dq_summary_project_id = dq_summary_table_ref.project
        dq_summary_dataset = dq_summary_table_ref.dataset_id
        logger.info(
            f"Using dq_summary_dataset: {dq_summary_project_id}.{dq_summary_dataset}"
        )
        dq_summary_table_exists = bigquery_client.is_table_exists(
            table=dq_summary_table_name, project_id=dq_summary_project_id)
        if not bigquery_client.is_dataset_exists(
                dataset=dq_summary_dataset, project_id=dq_summary_project_id):
            raise AssertionError(
                "Invalid argument to --gcp_bq_dataset_id: "
                f"Dataset {dq_summary_project_id}.{dq_summary_dataset} does not exist. "
            )
        dq_summary_dataset_region = bigquery_client.get_dataset_region(
            dataset=dq_summary_dataset,
            project_id=dq_summary_project_id,
        )
        if gcp_region_id and dq_summary_dataset_region != gcp_region_id:
            raise AssertionError(
                f"GCP region in --gcp_region_id '{gcp_region_id}' "
                f"must be the same as dq_summary_dataset "
                f"'{dq_summary_project_id}.{dq_summary_dataset}' region: "
                f"'{dq_summary_dataset_region}'.")
        bigquery_client.assert_required_columns_exist_in_table(
            table=dq_summary_table_name, project_id=dq_summary_project_id)
        # Check existence of dataset for target BQ table in the selected GCP region
        if target_bigquery_summary_table:
            logger.info("Using target_bigquery_summary_table: "
                        f"`{target_bigquery_summary_table}`. ")
            target_table_ref = bigquery_client.table_from_string(
                target_bigquery_summary_table)
            target_project_id = target_table_ref.project
            target_dataset_id = target_table_ref.dataset_id
            logger.debug(
                f"BigQuery dataset used in --target_bigquery_summary_table: "
                f"{target_project_id}.{target_dataset_id}")
            if not bigquery_client.is_dataset_exists(
                    dataset=target_dataset_id, project_id=target_project_id):
                raise AssertionError(
                    "Invalid argument to --target_bigquery_summary_table: "
                    f"{target_bigquery_summary_table}. "
                    f"Dataset {target_project_id}.{target_dataset_id} does not exist. "
                )
            target_dataset_region = bigquery_client.get_dataset_region(
                dataset=target_dataset_id,
                project_id=target_project_id,
            )
            if gcp_region_id and target_dataset_region != gcp_region_id:
                raise AssertionError(
                    f"GCP region in --gcp_region_id '{gcp_region_id}' "
                    f"must be the same as --target_bigquery_summary_table "
                    f"'{target_project_id}.{target_dataset_id}' region "
                    f"'{target_dataset_region}'.")
            if target_dataset_region != dq_summary_dataset_region:
                raise ValueError(
                    f"GCP region for --gcp_bq_dataset_id "
                    f"'{dq_summary_project_id}.{dq_summary_dataset}': "
                    f"'{dq_summary_dataset_region}' must be the same as "
                    f"GCP region for --target_bigquery_summary_table "
                    f"'{dq_summary_project_id}.{dq_summary_dataset}': "
                    f"'{target_dataset_region}'")
            bigquery_client.assert_required_columns_exist_in_table(
                table=target_bigquery_summary_table,
                project_id=target_project_id)
        else:
            logger.warning(
                "CLI --target_bigquery_summary_table is not set. This will become a required argument in v1.0.0."
            )
        # Log information about --summary_to_stdout
        if summary_to_stdout and target_bigquery_summary_table:
            logger.info(
                "--summary_to_stdout is True. Logging summary results as json to stdout."
            )
        elif summary_to_stdout and not target_bigquery_summary_table:
            logger.warning(
                "--summary_to_stdout is True but --target_bigquery_summary_table is not set. "
                "No summary logs will be logged to stdout.")
        # Load metadata
        metadata = json.loads(metadata) if metadata else {}
        # Load Rule Bindings
        configs_path = Path(rule_binding_config_path)
        logger.debug(f"Loading rule bindings from: {configs_path.absolute()}")
        all_rule_bindings = lib.load_rule_bindings_config(configs_path)
        logger.debug(f"Loaded rule bindings:\n{pformat(all_rule_bindings)}")
        # Prepare list of Rule Bindings in-scope for run
        target_rule_binding_ids = [
            r.strip().upper() for r in rule_binding_ids.split(",")
        ]
        if len(target_rule_binding_ids) == 1 and target_rule_binding_ids[0] == "ALL":
            target_rule_binding_ids = [
                rule_binding.upper()
                for rule_binding in all_rule_bindings.keys()
            ]
        logger.info(
            f"Preparing SQL for rule bindings: {target_rule_binding_ids}")
        # Load default configs for metadata registries
        registry_defaults: MetadataRegistryDefaults = (
            lib.load_metadata_registry_default_configs(Path(configs_path)))
        default_dataplex_projects = registry_defaults.get_dataplex_registry_defaults(
            "projects")
        default_dataplex_locations = registry_defaults.get_dataplex_registry_defaults(
            "locations")
        default_dataplex_lakes = registry_defaults.get_dataplex_registry_defaults(
            "lakes")
        dataplex_registry_defaults = registry_defaults.get_dataplex_registry_defaults()
        # Prepare Dataplex Client from metadata registry defaults
        dataplex_client = CloudDqDataplexClient(
            gcp_credentials=gcp_credentials,
            gcp_project_id=default_dataplex_projects,
            gcp_dataplex_lake_name=default_dataplex_lakes,
            gcp_dataplex_region=default_dataplex_locations,
        )
        logger.debug("Created CloudDqDataplexClient with arguments: "
                     f"{gcp_credentials}, "
                     f"{default_dataplex_projects}, "
                     f"{default_dataplex_lakes}, "
                     f"{default_dataplex_locations}, ")
        # Load all configs into a local cache
        configs_cache = lib.prepare_configs_cache(
            configs_path=Path(configs_path))
        configs_cache.resolve_dataplex_entity_uris(
            dataplex_client=dataplex_client,
            bigquery_client=bigquery_client,
            default_configs=dataplex_registry_defaults,
            target_rule_binding_ids=target_rule_binding_ids,
        )
        # Get Entities for entity-level summary views
        target_entity_summary_configs: dict = (
            configs_cache.get_entities_configs_from_rule_bindings(
                target_rule_binding_ids=target_rule_binding_ids))
        # Create Rule_binding views
        for rule_binding_id in target_rule_binding_ids:
            rule_binding_configs = all_rule_bindings.get(rule_binding_id, None)
            assert_not_none_or_empty(
                rule_binding_configs,
                f"Target Rule Binding Id: {rule_binding_id} not found "
                f"in config path {configs_path.absolute()}.",
            )
            if debug:
                logger.debug(
                    f"Creating sql string from configs for rule binding: "
                    f"{rule_binding_id}")
                logger.debug(
                    f"Rule binding config json:\n{pformat(rule_binding_configs)}"
                )
            high_watermark_filter_exists = False

            sql_string = lib.create_rule_binding_view_model(
                rule_binding_id=rule_binding_id,
                rule_binding_configs=rule_binding_configs,
                dq_summary_table_name=dq_summary_table_name,
                configs_cache=configs_cache,
                environment=environment_target,
                metadata=metadata,
                debug=print_sql_queries,
                progress_watermark=progress_watermark,
                default_configs=dataplex_registry_defaults,
                dq_summary_table_exists=dq_summary_table_exists,
                high_watermark_filter_exists=high_watermark_filter_exists,
                bigquery_client=bigquery_client,
            )
            print("sql string is ***")
            print(sql_string)
            if not skip_sql_validation:
                logger.debug(
                    f"Validating generated SQL code for rule binding "
                    f"{rule_binding_id} using BigQuery dry-run client.", )
                bigquery_client.check_query_dry_run(query_string=sql_string)
            logger.debug(
                f"*** Writing sql to {dbt_rule_binding_views_path.absolute()}/"
                f"{rule_binding_id}.sql", )
            lib.write_sql_string_as_dbt_model(
                model_id=rule_binding_id,
                sql_string=sql_string,
                dbt_model_path=dbt_rule_binding_views_path,
            )
        # clean up old rule_bindings
        for view in dbt_rule_binding_views_path.glob("*.sql"):
            if view.stem.upper() not in target_rule_binding_ids:
                view.unlink()
        logger.info(
            f"target_entity_summary_configs:\n{pformat(target_entity_summary_configs)}"
        )
        # create entity-level summary table models
        for (
                entity_table_id,
                entity_configs_dict,
        ) in target_entity_summary_configs.items():
            rule_binding_ids_list = entity_configs_dict.get(
                "rule_binding_ids_list")
            assert_not_none_or_empty(
                rule_binding_ids_list,
                f"Internal Error: no rule_binding_id found for entity_table_id {entity_table_id}.",
            )
            sql_string = lib.create_entity_summary_model(
                entity_table_id=entity_table_id,
                entity_target_rule_binding_configs=entity_configs_dict,
                gcp_project_id=gcp_project_id,
                gcp_bq_dataset_id=gcp_bq_dataset_id,
                debug=print_sql_queries,
            )
            logger.debug(
                f"*** Writing sql to {dbt_entity_summary_path.absolute()}/"
                f"{entity_table_id}.sql", )
            lib.write_sql_string_as_dbt_model(
                model_id=entity_table_id,
                sql_string=sql_string,
                dbt_model_path=dbt_entity_summary_path,
            )
        # clean up old entity_summary views
        for view in dbt_entity_summary_path.glob("*.sql"):
            if view.stem not in target_entity_summary_configs.keys():
                view.unlink()
        # create dbt configs json for the main.sql loop and run dbt
        configs = {
            "entity_dq_statistics_models":
            list(target_entity_summary_configs.keys()),
        }
        dbt_runner.run(
            configs=configs,
            debug=debug,
            dry_run=dry_run,
        )
        if not dry_run:
            if target_bigquery_summary_table:
                if target_bigquery_summary_table == dq_summary_table_name:
                    raise ValueError(
                        f"The target bigquery summary table name `{target_bigquery_summary_table}` "
                        f"cannot be same as dq summary table name `{dq_summary_table_name}` which "
                        f"is reserved for storing the intermediate results used by clouddq "
                        f"for further processing in case of incremental validation."
                    )
                else:
                    invocation_id = get_dbt_invocation_id(dbt_path)
                    logger.info(f"dbt invocation id for current execution "
                                f"is {invocation_id}")
                    partition_date = datetime.now(timezone.utc).date()
                    target_table = TargetTable(invocation_id, bigquery_client)
                    num_rows = target_table.write_to_target_bq_table(
                        partition_date,
                        target_bigquery_summary_table,
                        dq_summary_table_name,
                        summary_to_stdout,
                    )
                    json_logger.info(
                        json.dumps(
                            {
                                "clouddq_job_completion_config": {
                                    "invocation_id": invocation_id,
                                    "target_bigquery_summary_table":
                                    target_bigquery_summary_table,
                                    "summary_to_stdout": summary_to_stdout,
                                    "target_rule_binding_ids":
                                    target_rule_binding_ids,
                                    "partition_date": partition_date,
                                    "num_rows_loaded_to_target_table":
                                    num_rows,
                                }
                            },
                            cls=JsonEncoderDatetime,
                        ))
                    logger.info("Job completed successfully.")
            else:
                raise ValueError(
                    "'--target_bigquery_summary_table' was not provided. "
                    "It is needed to append the dq summary results to the "
                    "provided target bigquery table.")
    except Exception as error:
        logger.error(error, exc_info=True)
        json_logger.error(error, exc_info=True)
        raise SystemExit(f"\n\n{error}")
    finally:
        if bigquery_client:
            bigquery_client.close_connection()
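
The listing ends without the CLI wiring that would normally feed arguments into `main`. As a rough illustration only, a direct programmatic call might look like the following; every value shown (project, dataset, table names, metadata) is a placeholder, and the real entry point may instead parse these options from the command line as shown in the docstring above:

# Hypothetical direct invocation of main(); all argument values are placeholders.
main(
    rule_binding_ids="ALL",
    rule_binding_config_path="configs/",
    environment_target="dev",
    gcp_project_id="my-gcp-project",
    gcp_region_id="EU",
    gcp_bq_dataset_id="clouddq_dataset",
    gcp_service_account_key_path=None,
    gcp_impersonation_credentials=None,
    metadata='{"pipeline": "nightly"}',
    dry_run=False,
    progress_watermark=True,
    target_bigquery_summary_table="my-gcp-project.clouddq_dataset.dq_summary_target",
    intermediate_table_expiration_hours=24,
    num_threads=8,
    debug=True,
)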