def build_config_from_args(args, config_manager):
    """Populate a config manager from parsed CLI arguments and return it.

    Calculated fields are appended first. What follows depends on the
    manager's validation type: column validations get aggregates and
    optional query groups, row validations get optional comparison fields,
    and custom-query validations get aggregates, the custom query type and
    the optional source/target query files.

    Args:
        args: Parsed command-line arguments.
        config_manager (ConfigManager): Validation config manager instance.

    Returns:
        The ``config_manager`` that was passed in, after being updated.

    Raises:
        ValueError: If the validation is a custom query and
            ``args.custom_query_type`` is None.
    """
    calculated_fields = get_calculated_config(args, config_manager)
    config_manager.append_calculated_fields(calculated_fields)

    if config_manager.validation_type == consts.COLUMN_VALIDATION:
        aggregates = get_aggregate_config(args, config_manager)
        config_manager.append_aggregates(aggregates)
        if args.grouped_columns is not None:
            group_names = cli_tools.get_arg_list(args.grouped_columns)
            group_configs = config_manager.build_column_configs(group_names)
            config_manager.append_query_groups(group_configs)
    elif config_manager.validation_type == consts.ROW_VALIDATION:
        if args.comparison_fields is not None:
            field_names = cli_tools.get_arg_list(
                args.comparison_fields, default_value=[]
            )
            field_configs = config_manager.build_config_comparison_fields(
                field_names
            )
            config_manager.append_comparison_fields(field_configs)
            # With a wildcard hash no explicit dependent aliases are recorded.
            if args.hash != "*":
                config_manager.append_dependent_aliases(field_names)

    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; primary keys are assumed to be handled for every validation
    # type that reaches this function -- confirm against the original.
    if args.primary_keys is not None:
        key_names = cli_tools.get_arg_list(args.primary_keys)
        key_configs = config_manager.build_column_configs(key_names)
        config_manager.append_primary_keys(key_configs)
        if args.hash != "*":
            config_manager.append_dependent_aliases(key_names)

    # Everything below applies to custom-query validations only.
    if config_manager.validation_type != consts.CUSTOM_QUERY:
        return config_manager

    config_manager.append_aggregates(get_aggregate_config(args, config_manager))

    if args.custom_query_type is None:
        raise ValueError(
            "Expected custom query type to be given, got empty string."
        )
    config_manager.append_custom_query_type(args.custom_query_type)

    if args.source_query_file is not None:
        source_query_file = cli_tools.get_arg_list(args.source_query_file)
        config_manager.append_source_query_file(source_query_file)

    if args.target_query_file is not None:
        target_query_file = cli_tools.get_arg_list(args.target_query_file)
        config_manager.append_target_query_file(target_query_file)

    return config_manager
def find_tables_using_string_matching(args):
    """Return JSON String with matched tables for use in validations.

    Args:
        args: Parsed command-line arguments. The attributes
            ``score_cutoff``, ``source_conn``, ``target_conn`` and
            ``allowed_schemas`` are read.

    Returns:
        str: JSON encoding of the table configs returned by
        ``_compare_match_tables``.
    """
    # Compare against None instead of using ``or`` so that an explicit
    # cutoff of 0 is honoured rather than silently replaced by the default.
    score_cutoff = 0.8 if args.score_cutoff is None else args.score_cutoff

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn)
    )
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn)
    )

    # The schema allow-list is passed for the source table map only; the
    # target table map is built without it.
    allowed_schemas = cli_tools.get_arg_list(args.allowed_schemas)
    source_table_map = get_table_map(
        source_client, allowed_schemas=allowed_schemas
    )
    target_table_map = get_table_map(target_client)

    table_configs = _compare_match_tables(
        source_table_map, target_table_map, score_cutoff=score_cutoff
    )
    return json.dumps(table_configs)
def get_calculated_config(args, config_manager):
    """Build the calculated-field configs requested through ``args.hash``.

    When ``args.hash`` is falsy nothing is built and the config manager is
    left untouched. Otherwise the dependent aliases for a hash are built
    (over the listed columns, or with no column list when the value is
    ``"*"``), one calculated-field config is created per alias, and a
    ``hash__all`` comparison field is appended to the config manager at the
    greatest alias depth.

    Args:
        args: Parsed command-line arguments; only ``hash`` is read.
        config_manager(ConfigManager): Validation config manager instance.

    Returns:
        list: Calculated-field configs, empty when no hash was requested.
    """
    if not args.hash:
        return []

    hash_columns = None if args.hash == "*" else cli_tools.get_arg_list(args.hash)
    fields = config_manager._build_dependent_aliases("hash", hash_columns)
    alias_names = [field["name"] for field in fields]

    # Add to list of necessary columns for selective hashing in order to drop
    # excess columns with invalid data types (i.e structs) when generating
    # source/target DFs
    if hash_columns:
        config_manager.append_dependent_aliases(hash_columns)
        config_manager.append_dependent_aliases(alias_names)

    deepest = max((field["depth"] for field in fields), default=0)

    calculated_configs = [
        config_manager.build_config_calculated_fields(
            field["reference"],
            field["calc_type"],
            field["name"],
            field["depth"],
            None,
        )
        for field in fields
    ]

    hash_comparison = config_manager.build_config_comparison_fields(
        ["hash__all"], depth=deepest
    )
    config_manager.append_comparison_fields(hash_comparison)
    return calculated_configs
def test_get_arg_list(test_input, expected):
    """Check that ``cli_tools.get_arg_list(test_input)`` equals ``expected``."""
    parsed = cli_tools.get_arg_list(test_input)
    assert parsed == expected
def test_find_tables_config():
    """Parse the find-tables CLI arguments and check the first allowed schema."""
    args = cli_tools.configure_arg_parser().parse_args(CLI_FIND_TABLES_ARGS)
    schemas = cli_tools.get_arg_list(args.allowed_schemas)
    first_schema = schemas[0]
    assert first_schema == "my_schema"
def get_aggregate_config(args, config_manager):
    """Return list of formatted aggregation objects.

    The list always starts with the config manager's count aggregate. For
    each of ``count``, ``sum``, ``avg``, ``min``, ``max`` and ``bit_xor``
    whose argument is truthy, the matching column aggregates are appended
    in that order. An argument value of ``"*"`` passes no column list to
    the config manager; any other value is parsed with
    ``cli_tools.get_arg_list``.

    Args:
        args: Parsed command-line arguments. Reads
            ``wildcard_include_string_len``, ``cast_to_bigint`` and one
            attribute per aggregate type listed above.
        config_manager (ConfigManager): Validation config manager instance.

    Returns:
        list: Aggregate configs.
    """
    aggregate_configs = [config_manager.build_config_count_aggregate()]
    supported_data_types = [
        "float64",
        "float32",
        "int8",
        "int16",
        "int32",
        "int64",
        "decimal",
        "timestamp",
    ]
    if args.wildcard_include_string_len:
        supported_data_types.append("string")

    cast_to_bigint = bool(args.cast_to_bigint)

    # (aggregate type, data-type list handed to the config manager).
    # "count" passes None for the data types; all others pass the list above.
    aggregate_specs = (
        ("count", None),
        ("sum", supported_data_types),
        ("avg", supported_data_types),
        ("min", supported_data_types),
        ("max", supported_data_types),
        ("bit_xor", supported_data_types),
    )

    for agg_type, data_types in aggregate_specs:
        requested = getattr(args, agg_type)
        if not requested:
            continue
        col_args = None if requested == "*" else cli_tools.get_arg_list(requested)
        aggregate_configs += config_manager.build_config_column_aggregates(
            agg_type, col_args, data_types, cast_to_bigint=cast_to_bigint
        )

    return aggregate_configs
def build_config_managers_from_args(args):
    """Return a list of config managers ready to execute.

    One config manager is built per entry of the tables list derived from
    ``args.tables_list``. Non-schema validations are completed through
    ``build_config_from_args``; schema validations only receive the
    optional, case-folded exclusion columns.

    Args:
        args: Parsed command-line arguments.

    Returns:
        list: The config managers, in tables-list order.

    Raises:
        ValueError: If ``args.validate_cmd`` does not name a known
            validation type.
    """
    configs = []

    validate_cmd = args.validate_cmd.capitalize()
    if validate_cmd == "Schema":
        config_type = consts.SCHEMA_VALIDATION
    elif validate_cmd == "Column":
        config_type = consts.COLUMN_VALIDATION
    elif validate_cmd == "Row":
        config_type = consts.ROW_VALIDATION
    elif validate_cmd == "Custom-query":
        config_type = consts.CUSTOM_QUERY
    else:
        raise ValueError(f"Unknown Validation Type: {validate_cmd}")

    # The BigQuery-specific handler flag is checked before the generic one.
    result_handler_config = None
    if args.bq_result_handler:
        result_handler_config = cli_tools.get_result_handler(
            args.bq_result_handler, args.service_account
        )
    elif args.result_handler_config:
        result_handler_config = cli_tools.get_result_handler(
            args.result_handler_config, args.service_account
        )

    # Schema validation will not accept filters, labels, or threshold as flags
    filter_config, labels, threshold = [], [], 0.0
    if config_type != consts.SCHEMA_VALIDATION:
        if args.filters:
            filter_config = cli_tools.get_filters(args.filters)
        if args.threshold:
            threshold = args.threshold
        labels = cli_tools.get_labels(args.labels)

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn)
    )
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn)
    )

    # Named ``output_format`` so the ``format`` builtin is not shadowed.
    output_format = args.format if args.format else "table"

    # The random-row arguments are not read for schema validations.
    use_random_rows = (
        None if config_type == consts.SCHEMA_VALIDATION else args.use_random_row
    )
    random_row_batch_size = (
        None
        if config_type == consts.SCHEMA_VALIDATION
        else args.random_row_batch_size
    )

    is_filesystem = source_client._source_type == "FileSystem"
    tables_list = cli_tools.get_tables_list(
        args.tables_list, default_value=[{}], is_filesystem=is_filesystem
    )

    for table_obj in tables_list:
        config_manager = ConfigManager.build_config_manager(
            config_type,
            args.source_conn,
            args.target_conn,
            table_obj,
            labels,
            threshold,
            output_format,
            use_random_rows=use_random_rows,
            random_row_batch_size=random_row_batch_size,
            source_client=source_client,
            target_client=target_client,
            result_handler_config=result_handler_config,
            filter_config=filter_config,
            verbose=args.verbose,
        )
        if config_type != consts.SCHEMA_VALIDATION:
            config_manager = build_config_from_args(args, config_manager)
        elif args.exclusion_columns is not None:
            exclusion_columns = cli_tools.get_arg_list(args.exclusion_columns)
            config_manager.append_exclusion_columns(
                [col.casefold() for col in exclusion_columns]
            )

        configs.append(config_manager)

    return configs