Example #1
def test_spark_expect_column_value_z_scores_to_be_less_than_impl(
    spark_session, basic_spark_df_execution_engine
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    df = spark.createDataFrame(df)

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )
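
All seven snippets come from the Great Expectations test suite and omit their import block; pytest fixtures such as spark_session, basic_spark_df_execution_engine, postgresql_engine, and basic_datasource, plus the get_or_create_spark_application helper, are defined elsewhere in that suite. As a minimal sketch, assuming the 0.13.x module layout these tests appear to target (exact paths have moved between releases, so treat every path below as an assumption), the shared imports would look roughly like this:

import pandas as pd

import great_expectations.exceptions as ge_exceptions
from great_expectations.core import ExpectationConfiguration, ExpectationValidationResult
from great_expectations.core.batch import Batch, BatchRequest, PartitionRequest
from great_expectations.core.id_dict import IDDict
from great_expectations.execution_engine import (
    PandasExecutionEngine,
    SqlAlchemyExecutionEngine,
)
from great_expectations.execution_engine.sqlalchemy_execution_engine import (
    SqlAlchemyBatchData,
)
from great_expectations.expectations.core import ExpectColumnValueZScoresToBeLessThan
from great_expectations.expectations.registry import get_expectation_impl
from great_expectations.validator.validation_graph import (
    MetricConfiguration,
    ValidationGraph,
)
from great_expectations.validator.validator import Validator
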
Example #2
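The same z-score expectation run through SQLAlchemy: the frame is first written to a Postgres table with to_sql, then wrapped in a SqlAlchemyBatchData and validated via a SqlAlchemyExecutionEngine.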
def test_sa_expect_column_value_z_scores_to_be_less_than_impl(postgresql_engine):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df.to_sql(
        name="z_score_test_data",
        con=postgresql_engine,
        index=False,
        if_exists="replace",
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine)
    engine.load_batch_data(
        "my_id",
        SqlAlchemyBatchData(execution_engine=engine, table_name="z_score_test_data"),
    )
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )
Example #3
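Here the test builds the expectation's metric dependency graph by hand, then checks that _parse_validation_graph splits it into two metrics that are ready to compute and nine that still have unmet dependencies.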
def test_parse_validation_graph():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics={}
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
Example #4
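The same graph-building walk as the previous example, but the assertion is on the graph itself: the z-score expectation's metric dependencies expand to ten edges.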
def test_populate_dependencies():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    assert len(graph.edges) == 10
Example #5
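The simplest form of the test: a Pandas engine takes the DataFrame directly through batch_data_dict, so no separate batch-loading step is needed.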
def test_expect_column_value_z_scores_to_be_less_than_impl():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
Example #6
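A full graph_validate run with result_format set to COMPLETE. Column b contains one null and one extreme value (332), so with mostly=1 and threshold=2 the expectation fails and the result payload is fully populated.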
def test_graph_validate_with_runtime_config(basic_datasource):
    df = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10, 2, 3], "b": [97, 332, 3, 4, 5, 6, 7, None]}
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={"column": "b", "mostly": 1, "threshold": 2, "double_sided": True},
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    try:
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=(batch,)
        ).graph_validate(
            configurations=[expectation_configuration],
            runtime_configuration={"result_format": "COMPLETE"},
        )
    except AssertionError as e:
        result = e
    assert result == [
        ExpectationValidationResult(
            success=False,
            meta={},
            result={
                "element_count": 8,
                "unexpected_count": 1,
                "unexpected_percent": 12.5,
                "partial_unexpected_list": [332.0],
                "missing_count": 1,
                "missing_percent": 12.5,
                "unexpected_percent_nonmissing": 14.285714285714285,
                "partial_unexpected_index_list": None,
                "partial_unexpected_counts": [{"value": 332.0, "count": 1}],
                "unexpected_list": [332.0],
                "unexpected_index_list": None,
            },
            expectation_config=None,
            exception_info=None,
        )
    ]
Example #7
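A negative test: asking build_metric_dependency_graph for the nonexistent metric "column_values.not_a_metric" should surface a MetricProviderError.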
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)