Example #1
def test_parse_validation_graph_with_bad_metrics_args():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    validator = Validator(execution_engine=engine)
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(
                configuration,
                execution_engine=engine,
            )

        for metric_configuration in validation_dependencies["metrics"].values():
            validator.build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = validator._parse_validation_graph(
        validation_graph=graph, metrics=("nonexistent", "NONE"))
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
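These excerpts omit their imports. The block below is a minimal sketch of what the test examples appear to rely on; the module paths follow the Great Expectations 0.13-era layout and are an assumption that may not match other releases. Example #5 additionally uses a basic_datasource pytest fixture, plus ExceptionInfo and MAX_METRIC_COMPUTATION_RETRIES, whose locations vary across versions and are therefore not pinned down here.

from typing import Any, Dict, Set, Tuple, Union

import pandas as pd

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import Batch, RuntimeBatchRequest
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.core.id_dict import IDDict
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.core import ExpectColumnValueZScoresToBeLessThan
from great_expectations.expectations.registry import get_expectation_impl
from great_expectations.validator.validation_graph import (
    MetricConfiguration,
    ValidationGraph,
)
from great_expectations.validator.validator import Validator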
Example #2
def test_parse_validation_graph():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(
        expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict())
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
Example #3
def test_populate_dependencies():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(
                configuration,
                engine,
            )

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph,
                metric_configuration,
                configuration,
                execution_engine=engine)
    assert len(graph.edges) == 10
Example #4
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            # Rebind `graph` to the caught exception so the assertion below can
            # verify that the unknown metric name was rejected.
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
Example #5
def test_resolve_validation_graph_with_bad_config_catch_exceptions_true(
    basic_datasource,
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }))

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={
            "column": "not_in_table",
            "min_value": 1,
            "max_value": 29
        },
    )

    runtime_configuration = {
        "catch_exceptions": True,
        "result_format": {
            "result_format": "BASIC"
        },
    }

    execution_engine = PandasExecutionEngine()

    validator = Validator(execution_engine=execution_engine, batches=[batch])

    expectation_impl = get_expectation_impl(
        expectation_configuration.expectation_type)
    validation_dependencies = expectation_impl().get_validation_dependencies(
        expectation_configuration, execution_engine,
        runtime_configuration)["metrics"]

    graph = ValidationGraph()

    for metric_configuration in validation_dependencies.values():
        validator.build_metric_dependency_graph(
            graph=graph,
            execution_engine=execution_engine,
            metric_configuration=metric_configuration,
            configuration=expectation_configuration,
            runtime_configuration=runtime_configuration,
        )

    metrics: Dict[Tuple[str, str, str], Any] = {}
    aborted_metrics_info: Dict[
        Tuple[str, str, str],
        Dict[str, Union[MetricConfiguration, Set[ExceptionInfo], int]],
    ] = validator.resolve_validation_graph(
        graph=graph,
        metrics=metrics,
        runtime_configuration=runtime_configuration,
    )

    assert len(aborted_metrics_info) == 1

    aborted_metric_info_item = list(aborted_metrics_info.values())[0]
    assert aborted_metric_info_item["num_failures"] == MAX_METRIC_COMPUTATION_RETRIES

    assert len(aborted_metric_info_item["exception_info"]) == 1

    exception_info = next(iter(aborted_metric_info_item["exception_info"]))
    assert (
        exception_info["exception_message"]
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )
Example #6
 def get_validation_dependencies(
     self,
     configuration: Optional[ExpectationConfiguration] = None,
     execution_engine: Optional[ExecutionEngine] = None,
     runtime_configuration: Optional[dict] = None,
 ):
     all_dependencies = super().get_validation_dependencies(
         configuration, execution_engine, runtime_configuration
     )
     dependencies = all_dependencies["metrics"]
     partition_object = configuration.kwargs["partition_object"]
     domain_kwargs = configuration.get_domain_kwargs()
     is_categorical = None
     bins = None
     if partition_object is None:
         if configuration.kwargs.get(
             "bucketize_data", self.default_kwarg_values["bucketize_data"]
         ):
             is_categorical = False
             partition_metric_configuration = MetricConfiguration(
                 "column.partition",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={
                     "bins": "auto",
                     "allow_relative_error": False,
                 },
             )
             #
             # Note: 20201116 - JPC - the execution engine doesn't provide the capability to
             # evaluate dependencies, so we use a validator to do so.
             #
             validator = Validator(execution_engine=execution_engine)
             graph = ValidationGraph()
             validator.build_metric_dependency_graph(
                 graph=graph,
                 child_node=partition_metric_configuration,
                 configuration=configuration,
                 execution_engine=execution_engine,
             )
             bins = validator.resolve_validation_graph(graph, metrics=dict())[
                 partition_metric_configuration.id
             ]
             hist_metric_configuration = MetricConfiguration(
                 "column.histogram",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={"bins": tuple(bins),},
             )
             nonnull_configuration = MetricConfiguration(
                 "column_values.nonnull.count",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs=dict(),
             )
             #
             # NOTE 20201117 - JPC - Would prefer not to include partition_metric_configuration here,
             # since we have already evaluated it, and its result is in the kwargs for the histogram.
             # However, currently the dependencies' configurations are not passed to the _validate method.
             #
             dependencies["column.partition"] = partition_metric_configuration
             dependencies["column.histogram"] = hist_metric_configuration
             dependencies["column_values.nonnull.count"] = nonnull_configuration
         else:
             is_categorical = True
             counts_configuration = MetricConfiguration(
                 "column.value_counts",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={"sort": "value",},
             )
             nonnull_configuration = MetricConfiguration(
                 "column_values.nonnull.count", metric_domain_kwargs=domain_kwargs,
             )
             dependencies["column.value_counts"] = counts_configuration
             dependencies["column_values.nonnull.count"] = nonnull_configuration
     if is_categorical is True or is_valid_categorical_partition_object(
         partition_object
     ):
         dependencies["column.value_counts"] = MetricConfiguration(
             "column.value_counts",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"sort": "value"},
         )
         dependencies["column_values.nonnull.count"] = MetricConfiguration(
             "column_values.nonnull.count", domain_kwargs
         )
     else:
         if bins is None:
             # bins is None here only when the user supplied a partition_object
             # (otherwise bins was already computed above), so take the bins from it.
             if not is_valid_partition_object(partition_object):
                 raise ValueError("Invalid partition_object provided")
             bins = partition_object["bins"]
         hist_metric_configuration = MetricConfiguration(
             "column.histogram",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"bins": bins,},
         )
         nonnull_configuration = MetricConfiguration(
             "column_values.nonnull.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs=dict(),
         )
         dependencies["column.histogram"] = hist_metric_configuration
         dependencies["column_values.nonnull.count"] = nonnull_configuration
         below_partition = MetricConfiguration(
             "column_values.between.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"max_value": bins[0]},
         )
         above_partition = MetricConfiguration(
             "column_values.between.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"min_value": bins[-1], "strict_min": True},
         )
         dependencies["below_partition"] = below_partition
         dependencies["above_partition"] = above_partition
     return all_dependencies
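Taken together, the excerpts follow one pipeline: an expectation reports its metric dependencies, Validator.build_metric_dependency_graph expands each of them (and their own prerequisites) into a ValidationGraph, and resolving or parsing that graph produces the metric values. The sketch below condenses that flow using only calls that appear in the examples; note that Example #6 passes the metric as child_node= where the tests pass metric_configuration= (the keyword differs between library versions), and that actually resolving the graph requires batch data to be loaded into the execution engine, as in Example #5. The engine and validator names here are assumed to be set up that way.

# Condensed sketch of the flow shown in Examples #1-#5; assumes the imports
# listed after Example #1 and an engine/validator wired to batch data.
configuration = ExpectationConfiguration(
    expectation_type="expect_column_value_z_scores_to_be_less_than",
    kwargs={"column": "a", "mostly": 0.9, "threshold": 4, "double_sided": True},
)

# 1. Ask the expectation implementation which metrics it needs.
impl = get_expectation_impl(configuration.expectation_type)
validation_dependencies = impl(configuration).get_validation_dependencies(
    configuration, execution_engine=engine
)

# 2. Expand every requested metric into a dependency graph.
graph = ValidationGraph()
for metric_configuration in validation_dependencies["metrics"].values():
    validator.build_metric_dependency_graph(
        graph=graph,
        execution_engine=engine,
        metric_configuration=metric_configuration,
        configuration=configuration,
    )

# 3. Resolve the graph; the returned dict is keyed by MetricConfiguration.id,
#    which is how Example #6 looks up the "column.partition" result.
metrics = validator.resolve_validation_graph(graph=graph, metrics=dict())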