Example #1
def test_validation_add_groups(module_under_test):
    mock_config_manager = ConfigManager(COLUMN_VALIDATION_CONFIG,
                                        MockIbisClient(),
                                        MockIbisClient(),
                                        verbose=False)
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    mock_config_manager.append_query_groups(QUERY_GROUPS_TEST)
    builder.add_config_query_groups()

    assert list(builder.get_group_aliases()) == ["start_alias"]
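For context, QUERY_GROUPS_TEST is a fixture defined elsewhere in the test module. Judging from the asserted alias, a minimal sketch of its shape could look like this (the column names and cast below are illustrative assumptions, not the repository's actual fixture):

QUERY_GROUPS_TEST = [
    {
        "field_alias": "start_alias",  # alias asserted by the test above
        "source_column": "starttime",  # assumed source column
        "target_column": "starttime",  # assumed target column
        "cast": "date",                # assumed cast applied to the group
    }
]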
Example #2
def test_column_validation_aggregates(module_under_test):
    mock_config_manager = ConfigManager(COLUMN_VALIDATION_CONFIG,
                                        MockIbisClient(),
                                        MockIbisClient(),
                                        verbose=False)
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    mock_config_manager.append_aggregates(AGGREGATES_TEST)
    builder.add_config_aggregates()

    assert list(builder.get_metadata().keys()) == ["sum_starttime"]
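AGGREGATES_TEST is likewise a module-level fixture. Given the asserted metadata key "sum_starttime", a plausible sketch (the column names are assumptions):

AGGREGATES_TEST = [
    {
        "source_column": "starttime",    # assumed source column
        "target_column": "starttime",    # assumed target column
        "field_alias": "sum_starttime",  # key asserted by the test above
        "type": "sum",                   # aggregation type
    }
]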
Example #3
def build_config_managers_from_yaml(args):
    """Returns List[ConfigManager] instances ready to be executed."""
    config_managers = []

    config_file_path = _get_arg_config_file(args)
    yaml_configs = _get_yaml_config_from_file(config_file_path)

    mgr = state_manager.StateManager()
    source_conn = mgr.get_connection_config(yaml_configs[consts.YAML_SOURCE])
    target_conn = mgr.get_connection_config(yaml_configs[consts.YAML_TARGET])

    source_client = clients.get_data_client(source_conn)
    target_client = clients.get_data_client(target_conn)

    for config in yaml_configs[consts.YAML_VALIDATIONS]:
        config[consts.CONFIG_SOURCE_CONN] = source_conn
        config[consts.CONFIG_TARGET_CONN] = target_conn
        config[consts.CONFIG_RESULT_HANDLER] = yaml_configs[
            consts.YAML_RESULT_HANDLER]
        config_manager = ConfigManager(config,
                                       source_client,
                                       target_client,
                                       verbose=args.verbose)

        config_managers.append(config_manager)

    return config_managers
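For orientation, the parsed YAML this function consumes is a dict keyed by the consts referenced above. A minimal sketch, assuming consts.YAML_SOURCE == "source", consts.YAML_TARGET == "target", consts.YAML_RESULT_HANDLER == "result_handler", and consts.YAML_VALIDATIONS == "validations":

yaml_configs = {
    "source": "my_source_connection",  # connection name resolved via StateManager
    "target": "my_target_connection",
    "result_handler": {},              # copied into every validation config
    "validations": [
        {"type": "Column", "schema_name": "my_dataset", "table_name": "my_table"},
    ],
}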
Example #4
def test_column_validation_calculate(module_under_test):
    mock_config_manager = ConfigManager(COLUMN_VALIDATION_CONFIG,
                                        MockIbisClient(),
                                        MockIbisClient(),
                                        verbose=False)
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    mock_config_manager.append_calculated_fields(CALCULATED_MULTIPLE_TEST)
    builder.add_config_calculated_fields()
    print(sorted(list(builder.get_calculated_aliases())))
    assert sorted(list(builder.get_calculated_aliases())) == [
        "concat_calcs",
        "concat_start_station_name_end_station_name",
        "ifnull_start_station_name",
        "length_start_station_name",
        "rstrip_start_station_name",
        "upper_start_station_name",
    ]
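Each alias in the assertion corresponds to one entry in the CALCULATED_MULTIPLE_TEST fixture. A sketch of a single entry, with assumed key names:

LENGTH_CALC_EXAMPLE = {
    "calc_type": "length",                                # calculation to apply
    "source_calculated_columns": ["start_station_name"],  # input column(s) on the source
    "target_calculated_columns": ["start_station_name"],  # input column(s) on the target
    "field_alias": "length_start_station_name",           # alias asserted above
    "depth": 0,                                           # nesting depth of the calculation
}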
Example #5
def test_column_validation(module_under_test):
    mock_config_manager = ConfigManager(COLUMN_VALIDATION_CONFIG,
                                        MockIbisClient(),
                                        MockIbisClient(),
                                        verbose=False)
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    assert not builder.verbose
    assert builder.config_manager.query_limit is None
Example #6
def test_custom_query_get_query_from_file(module_under_test):
    mock_config_manager = ConfigManager(
        CUSTOM_QUERY_VALIDATION_CONFIG,
        MockIbisClient(),
        MockIbisClient(),
        verbose=False,
    )
    builder = module_under_test.ValidationBuilder(mock_config_manager)
    query = builder.get_query_from_file(
        builder.config_manager.source_query_file)
    assert query == "SELECT * FROM bigquery-public-data.usa_names.usa_1910_2013"
Example #7
def test_validation_add_filters(module_under_test):
    mock_config_manager = ConfigManager(COLUMN_VALIDATION_CONFIG,
                                        MockIbisClient(),
                                        MockIbisClient(),
                                        verbose=False)
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    builder.add_config_filters()
    filter_field = builder.source_builder.filters[0]

    assert filter_field.left == "column_name > 100"
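The filter asserted here ships inside COLUMN_VALIDATION_CONFIG itself. A plausible sketch of the relevant config entry (key names are assumptions):

COLUMN_VALIDATION_FILTERS = [
    {
        "type": "custom",               # free-form filter expression
        "source": "column_name > 100",  # value asserted by the test above
        "target": "column_name > 100",
    }
]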
Example #8
def test_column_validation_limit(module_under_test):
    mock_config_manager = ConfigManager(
        COLUMN_VALIDATION_CONFIG_LIMIT,
        MockIbisClient(),
        MockIbisClient(),
        verbose=False,
    )
    builder = module_under_test.ValidationBuilder(mock_config_manager)
    builder.add_query_limit()

    assert builder.source_builder.limit == QUERY_LIMIT
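COLUMN_VALIDATION_CONFIG_LIMIT presumably differs from COLUMN_VALIDATION_CONFIG only by a row limit. A sketch, assuming the config key is "limit" and an arbitrary fixture value:

QUERY_LIMIT = 100  # the fixture constant's value is assumed
COLUMN_VALIDATION_CONFIG_LIMIT = {**COLUMN_VALIDATION_CONFIG, "limit": QUERY_LIMIT}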
Example #9
def test_custom_query_validation(module_under_test):
    mock_config_manager = ConfigManager(
        CUSTOM_QUERY_VALIDATION_CONFIG,
        MockIbisClient(),
        MockIbisClient(),
        verbose=False,
    )
    builder = module_under_test.ValidationBuilder(mock_config_manager)

    assert not builder.verbose
    assert (builder.config_manager.source_query_file ==
            "tests/resources/custom-query.sql")
Example #10
    def __init__(
        self,
        config,
        validation_builder=None,
        schema_validator=None,
        result_handler=None,
        verbose=False,
    ):
        """Initialize a DataValidation client

        Args:
            config (dict): The validation config used for the comparison.
            validation_builder (ValidationBuilder): Optional instance of a ValidationBuilder.
            schema_validator (SchemaValidation): Optional instance of a SchemaValidation.
            result_handler (ResultHandler): Optional instance of a ResultHandler client.
            verbose (bool): If verbose, the Data Validation client will print the queries run.
        """
        self.verbose = verbose

        # Data Client Management
        self.config = config

        self.config_manager = ConfigManager(config, verbose=self.verbose)

        self.run_metadata = metadata.RunMetadata()
        self.run_metadata.labels = self.config_manager.labels

        # Initialize Validation Builder if None was supplied
        self.validation_builder = validation_builder or ValidationBuilder(
            self.config_manager)

        self.schema_validator = schema_validator or SchemaValidation(
            self.config_manager,
            run_metadata=self.run_metadata,
            verbose=self.verbose)

        # Initialize the default Result Handler if None was supplied
        self.result_handler = (result_handler or
                               self.config_manager.get_result_handler())
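The config dict handed to this constructor is the same flat validation config the CLI helpers in this section produce. A minimal, illustrative sketch for a column validation (all keys and values below are assumptions):

example_config = {
    "type": "Column",
    "source_conn": {"source_type": "BigQuery", "project_id": "my-project"},
    "target_conn": {"source_type": "BigQuery", "project_id": "my-project"},
    "schema_name": "my_dataset",
    "table_name": "my_table",
    "aggregates": [
        {"type": "count", "source_column": None,
         "target_column": None, "field_alias": "count"},
    ],
}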
Example #11
class DataValidation(object):
    def __init__(
        self,
        config,
        validation_builder=None,
        schema_validator=None,
        result_handler=None,
        verbose=False,
    ):
        """Initialize a DataValidation client

        Args:
            config (dict): The validation config used for the comparison.
            validation_builder (ValidationBuilder): Optional instance of a ValidationBuilder.
            schema_validator (SchemaValidation): Optional instance of a SchemaValidation.
            result_handler (ResultHandler): Optional instance of a ResultHandler client.
            verbose (bool): If verbose, the Data Validation client will print the queries run.
        """
        self.verbose = verbose

        # Data Client Management
        self.config = config

        self.config_manager = ConfigManager(config, verbose=self.verbose)

        self.run_metadata = metadata.RunMetadata()
        self.run_metadata.labels = self.config_manager.labels

        # Initialize Validation Builder if None was supplied
        self.validation_builder = validation_builder or ValidationBuilder(
            self.config_manager)

        self.schema_validator = schema_validator or SchemaValidation(
            self.config_manager,
            run_metadata=self.run_metadata,
            verbose=self.verbose)

        # Initialize the default Result Handler if None was supplied
        self.result_handler = (result_handler or
                               self.config_manager.get_result_handler())

    # TODO(dhercher) we planned on shifting this to use an Execution Handler.
    # Leaving it to swast to design how this should look.
    def execute(self):
        """Execute Queries and Store Results"""
        # Apply random row filter before validations run
        if self.config_manager.use_random_rows():
            self._add_random_row_filter()

        # Run correct execution for the given validation type
        if self.config_manager.validation_type == consts.ROW_VALIDATION:
            grouped_fields = self.validation_builder.pop_grouped_fields()
            result_df = self.execute_recursive_validation(
                self.validation_builder, grouped_fields)
        elif self.config_manager.validation_type == consts.SCHEMA_VALIDATION:
            """Perform only schema validation"""
            result_df = self.schema_validator.execute()
        else:
            result_df = self._execute_validation(self.validation_builder,
                                                 process_in_memory=True)

        # Call Result Handler to Manage Results
        return self.result_handler.execute(self.config, result_df)

    def _add_random_row_filter(self):
        """Add random row filters to the validation builder."""
        if not self.config_manager.primary_keys:
            raise ValueError(
                "Primary Keys are required for Random Row Filters")

        # Filter for only first primary key (multi-pk filter not supported)
        primary_key_info = self.config_manager.primary_keys[0]
        query = RandomRowBuilder(
            [primary_key_info[consts.CONFIG_SOURCE_COLUMN]],
            self.config_manager.random_row_batch_size(),
        ).compile(
            self.config_manager.source_client,
            self.config_manager.source_schema,
            self.config_manager.source_table,
        )

        random_rows = self.config_manager.source_client.execute(query)
        filter_field = {
            consts.CONFIG_TYPE: consts.FILTER_TYPE_ISIN,
            consts.CONFIG_FILTER_SOURCE_COLUMN:
                primary_key_info[consts.CONFIG_SOURCE_COLUMN],
            consts.CONFIG_FILTER_SOURCE_VALUE:
                random_rows[primary_key_info[consts.CONFIG_SOURCE_COLUMN]],
            consts.CONFIG_FILTER_TARGET_COLUMN:
                primary_key_info[consts.CONFIG_TARGET_COLUMN],
            consts.CONFIG_FILTER_TARGET_VALUE:
                random_rows[primary_key_info[consts.CONFIG_SOURCE_COLUMN]],
        }
        self.validation_builder.add_filter(filter_field)

    def query_too_large(self, rows_df, grouped_fields):
        """Return bool to dictate if another level of recursion
        would create a too large result set.

        Rules to define too large are:
            - If any grouped fields remain, return False.
                (assumes user added logical sized groups)
            - Else, if next group size is larger
                than the limit, return True.
            - Finally return False if no covered case occured.
        """
        if len(grouped_fields) > 1:
            return False

        try:
            count_df = rows_df[rows_df[consts.AGGREGATION_TYPE] ==
                               consts.CONFIG_TYPE_COUNT]
            for row in count_df.to_dict(orient="records"):
                recursive_query_size = max(
                    float(row[consts.SOURCE_AGG_VALUE]),
                    float(row[consts.TARGET_AGG_VALUE]),
                )
                if recursive_query_size > self.config_manager.max_recursive_query_size:
                    logging.warning(
                        "Query result is too large for recursion: %s", row)
                    return True
        except Exception:
            logging.warning("Recursive values could not be cast to float.")
            return False

        return False

    def execute_recursive_validation(self, validation_builder, grouped_fields):
        """Recursive execution for Row validations.

        This method executes aggregate queries, such as sum-of-hashes, on the
        source and target tables. Where they differ, add to the GROUP BY
        clause recursively until the individual row differences can be
        identified.
        """
        process_in_memory = self.config_manager.process_in_memory()
        past_results = []
        if len(grouped_fields) > 0:
            validation_builder.add_query_group(grouped_fields[0])
            result_df = self._execute_validation(
                validation_builder, process_in_memory=process_in_memory)

            for grouped_key in result_df[consts.GROUP_BY_COLUMNS].unique():
                # Validations are viewed separately, but queried together.
                # We must treat them as a single item which failed or succeeded.
                group_succeeded = True
                grouped_key_df = result_df[result_df[consts.GROUP_BY_COLUMNS]
                                           == grouped_key]

                if self.query_too_large(grouped_key_df, grouped_fields):
                    past_results.append(grouped_key_df)
                    continue

                for row in grouped_key_df.to_dict(orient="records"):
                    if row[consts.SOURCE_AGG_VALUE] != row[
                            consts.TARGET_AGG_VALUE]:
                        group_succeeded = False
                        break

                if group_succeeded:
                    past_results.append(grouped_key_df)
                else:
                    recursive_validation_builder = validation_builder.clone()
                    self._add_recursive_validation_filter(
                        recursive_validation_builder, row)
                    past_results.append(
                        self.execute_recursive_validation(
                            recursive_validation_builder, grouped_fields[1:]))
        elif self.config_manager.primary_keys and len(grouped_fields) == 0:
            past_results.append(
                self._execute_validation(validation_builder,
                                         process_in_memory=process_in_memory))

        # elif self.config_manager.primary_keys:
        #     validation_builder.add_config_query_groups(self.config_manager.primary_keys)
        #     validation_builder.add_config_query_groups(grouped_fields)

        else:
            warnings.warn(
                "WARNING: No Primary Keys Supplied in Row Validation",
                UserWarning)
            return None

        return pandas.concat(past_results)

    def _add_recursive_validation_filter(self, validation_builder, row):
        """Return ValidationBuilder Configured for Next Recursive Search"""
        group_by_columns = json.loads(row[consts.GROUP_BY_COLUMNS])
        for alias, value in group_by_columns.items():
            filter_field = {
                consts.CONFIG_TYPE:
                consts.FILTER_TYPE_EQUALS,
                consts.CONFIG_FILTER_SOURCE_COLUMN:
                validation_builder.get_grouped_alias_source_column(alias),
                consts.CONFIG_FILTER_SOURCE_VALUE:
                value,
                consts.CONFIG_FILTER_TARGET_COLUMN:
                validation_builder.get_grouped_alias_target_column(alias),
                consts.CONFIG_FILTER_TARGET_VALUE:
                value,
            }
            validation_builder.add_filter(filter_field)

    @classmethod
    def _get_pandas_schema(cls,
                           source_df,
                           target_df,
                           join_on_fields,
                           verbose=False):
        """Return a pandas schema which aligns source and target for joins."""
        # TODO(dhercher): We are experiencing issues with datetimes arriving as
        # strings and not matching; the current hack of casting to string works,
        # but is not ideal. We should look at both types, and if one is
        # date-like, then use pandas.to_datetime on the other.
        for join_on_field in join_on_fields:
            source_df[join_on_field] = source_df[join_on_field].astype(str)
            target_df[join_on_field] = target_df[join_on_field].astype(str)

        # Loop over index keys() instead of iteritems() because pandas is
        # failing with datetime64[ns, UTC] data type on Python 3.9.
        schema_data = []
        schema_index = []
        for key in source_df.dtypes.keys():
            dtype = source_df.dtypes[key]
            # The Ibis pandas backend fails with `KeyError: dtype('O')` if
            # object dtypes are passed in.
            if dtype in {numpy.dtype("O")}:
                continue
            schema_data.append(dtype)
            schema_index.append(key)
        pd_schema = pandas.Series(schema_data, index=schema_index)
        if verbose:
            print("-- ** Pandas Schema ** --")
            print(pd_schema)

        return pd_schema

    def _execute_validation(self, validation_builder, process_in_memory=True):
        """Execute Against a Supplied Validation Builder"""
        self.run_metadata.validations = validation_builder.get_metadata()

        source_query = validation_builder.get_source_query()
        target_query = validation_builder.get_target_query()

        join_on_fields = (
            set(validation_builder.get_primary_keys())
            if self.config_manager.validation_type == consts.ROW_VALIDATION
            else set(validation_builder.get_group_aliases()))
        if (self.config_manager.validation_type == consts.CUSTOM_QUERY
                and self.config_manager.custom_query_type == "row"):
            join_on_fields = set(["hash__all"])

        # If row validation from YAML, compare source and target agg values
        is_value_comparison = (
            self.config_manager.validation_type == consts.ROW_VALIDATION
            or (self.config_manager.validation_type == consts.CUSTOM_QUERY
                and self.config_manager.custom_query_type == "row"))

        if process_in_memory:
            source_df = self.config_manager.source_client.execute(source_query)
            target_df = self.config_manager.target_client.execute(target_query)

            # Drop excess fields for row validation to avoid pandas errors on
            # unsupported column data types (e.g. structs).
            if (self.config_manager.validation_type == consts.ROW_VALIDATION
                    and self.config_manager.dependent_aliases):
                source_df.drop(
                    source_df.columns.difference(
                        self.config_manager.dependent_aliases),
                    axis=1,
                    inplace=True,
                )
                target_df.drop(
                    target_df.columns.difference(
                        self.config_manager.dependent_aliases),
                    axis=1,
                    inplace=True,
                )

            pd_schema = self._get_pandas_schema(source_df,
                                                target_df,
                                                join_on_fields,
                                                verbose=self.verbose)

            pandas_client = ibis.backends.pandas.connect({
                combiner.DEFAULT_SOURCE: source_df,
                combiner.DEFAULT_TARGET: target_df,
            })

            try:
                result_df = combiner.generate_report(
                    pandas_client,
                    self.run_metadata,
                    pandas_client.table(combiner.DEFAULT_SOURCE,
                                        schema=pd_schema),
                    pandas_client.table(combiner.DEFAULT_TARGET,
                                        schema=pd_schema),
                    join_on_fields=join_on_fields,
                    is_value_comparison=is_value_comparison,
                    verbose=self.verbose,
                )
            except Exception:
                if self.verbose:
                    print("-- ** Logging Source DF ** --")
                    print(source_df.dtypes)
                    print(source_df)
                    print("-- ** Logging Target DF ** --")
                    print(target_df.dtypes)
                    print(target_df)
                raise
        else:
            result_df = combiner.generate_report(
                self.config_manager.source_client,
                self.run_metadata,
                source_query,
                target_query,
                join_on_fields=join_on_fields,
                is_value_comparison=is_value_comparison,
                verbose=self.verbose,
            )

        return result_df

    def combine_data(self, source_df, target_df, join_on_fields):
        """TODO: Return List of Dictionaries"""
        # Clean Data to Standardize
        if join_on_fields:
            df = source_df.merge(
                target_df,
                how="outer",
                on=join_on_fields,
                suffixes=(consts.INPUT_SUFFIX, consts.OUTPUT_SUFFIX),
            )
        else:
            df = source_df.join(
                target_df,
                how="outer",
                lsuffix=consts.INPUT_SUFFIX,
                rsuffix=consts.OUTPUT_SUFFIX,
            )
        return df
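A minimal usage sketch of this client, reusing the illustrative example_config from above:

validator = DataValidation(example_config, verbose=True)
result = validator.execute()  # a DataFrame, or whatever the configured result handler returns
print(result)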
Example #12
def build_config_managers_from_args(args):
    """Return a list of config managers ready to execute."""
    configs = []

    validate_cmd = args.validate_cmd.capitalize()
    if validate_cmd == "Schema":
        config_type = consts.SCHEMA_VALIDATION
    elif validate_cmd == "Column":
        config_type = consts.COLUMN_VALIDATION
    elif validate_cmd == "Row":
        config_type = consts.ROW_VALIDATION
    elif validate_cmd == "Custom-query":
        config_type = consts.CUSTOM_QUERY
    else:
        raise ValueError(f"Unknown Validation Type: {validate_cmd}")

    result_handler_config = None
    if args.bq_result_handler:
        result_handler_config = cli_tools.get_result_handler(
            args.bq_result_handler, args.service_account)
    elif args.result_handler_config:
        result_handler_config = cli_tools.get_result_handler(
            args.result_handler_config, args.service_account)

    # Schema validation will not accept filters or threshold as flags;
    # labels are read for every validation type.
    filter_config, labels, threshold = [], [], 0.0
    if config_type != consts.SCHEMA_VALIDATION:
        if args.filters:
            filter_config = cli_tools.get_filters(args.filters)
        if args.threshold:
            threshold = args.threshold
    labels = cli_tools.get_labels(args.labels)

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn))
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn))

    report_format = args.format if args.format else "table"

    use_random_rows = (None if config_type == consts.SCHEMA_VALIDATION else
                       args.use_random_row)
    random_row_batch_size = (None if config_type == consts.SCHEMA_VALIDATION
                             else args.random_row_batch_size)

    is_filesystem = source_client._source_type == "FileSystem"
    tables_list = cli_tools.get_tables_list(args.tables_list,
                                            default_value=[{}],
                                            is_filesystem=is_filesystem)

    for table_obj in tables_list:
        config_manager = ConfigManager.build_config_manager(
            config_type,
            args.source_conn,
            args.target_conn,
            table_obj,
            labels,
            threshold,
            report_format,
            use_random_rows=use_random_rows,
            random_row_batch_size=random_row_batch_size,
            source_client=source_client,
            target_client=target_client,
            result_handler_config=result_handler_config,
            filter_config=filter_config,
            verbose=args.verbose,
        )
        if config_type != consts.SCHEMA_VALIDATION:
            config_manager = build_config_from_args(args, config_manager)
        else:
            if args.exclusion_columns is not None:
                exclusion_columns = cli_tools.get_arg_list(
                    args.exclusion_columns)
                config_manager.append_exclusion_columns(
                    [col.casefold() for col in exclusion_columns])

        configs.append(config_manager)

    return configs
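Downstream, each returned ConfigManager is typically wrapped in a DataValidation client and executed. A sketch (the .config attribute and the helper name are assumptions):

def run_validations(args, config_managers):
    for config_manager in config_managers:
        validator = DataValidation(config_manager.config, verbose=args.verbose)
        validator.execute()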