# NOTE: the import paths below are assumed from the great_expectations package
# layout and may need adjusting to this repository's exact version. The test
# helpers build_spark_engine, get_table_columns_metric, and dataframes_equal
# normally come from the suite's shared utilities; hedged sketches of all
# three follow the first test below.
import logging

import pandas as pd
import pytest

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.metric_domain_types import MetricDomainTypes
from great_expectations.exceptions import GreatExpectationsError
from great_expectations.exceptions.metric_exceptions import MetricProviderError
from great_expectations.execution_engine import SparkDFExecutionEngine
from great_expectations.validator.metric_configuration import MetricConfiguration


def test_get_compute_domain_with_row_condition_alt(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe
    expected_df = df.where("b > 2")

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"row_condition": "b > 2", "condition_parser": "spark"},
        domain_type="identity",
    )

    # Ensuring data has been properly queried
    assert dataframes_equal(
        data, expected_df
    ), "Data does not match after getting compute domain"
    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
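# Minimal sketches of the three helpers used throughout this module, inferred
# from their usage here. These are assumptions, not the project's actual
# implementations; prefer the suite's shared test utilities where available.


def build_spark_engine(spark, df, batch_id):
    # Convert pandas NaN to None so Spark receives true SQL NULLs; several
    # tests in this module depend on NULL (not NaN) semantics.
    spark_df = spark.createDataFrame(
        [
            tuple(None if pd.isna(value) else value for value in record.tolist())
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_id=batch_id, batch_data=spark_df)
    return engine


def get_table_columns_metric(engine):
    # Resolve the "table.columns" metric that most column metrics depend on,
    # returning both the configuration and its resolved results.
    table_columns_metric = MetricConfiguration(
        metric_name="table.columns",
        metric_domain_kwargs={},
        metric_value_kwargs=None,
    )
    results = engine.resolve_metrics(metrics_to_resolve=(table_columns_metric,))
    return table_columns_metric, results


def dataframes_equal(first_table, second_table):
    # Schema plus row-level equality; adequate for the small frames used here.
    if first_table.schema != second_table.schema:
        return False
    return first_table.collect() == second_table.collect()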
def test_get_compute_domain_with_nonexistent_condition_parser(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Expect GreatExpectationsError because the parser doesn't exist
    with pytest.raises(GreatExpectationsError):
        # noinspection PyUnusedLocal
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "nonexistent",
            },
            domain_type=MetricDomainTypes.IDENTITY,
        )
def test_get_compute_domain_with_column_pair(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"}, domain_type="column_pair"
    )

    # Ensuring that with no domain nothing happens to the data itself
    assert dataframes_equal(
        data, df
    ), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"
def test_max_metric_spark_column_does_not_exist(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 1]}),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )

    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        results = engine.resolve_metrics(
            metrics_to_resolve=(partial_metric,), metrics=metrics
        )
        metrics.update(results)
    assert (
        str(eee.value)
        == 'Error: The column "non_existent_column" in BatchData does not exist.'
    )
def test_distinct_metric_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 1, 2, 3, 3, None]}),
        batch_id="my_id",
    )

    desired_metric = MetricConfiguration(
        metric_name="column.value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"sort": "value", "collate": None},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert pd.Series(index=[1, 2, 3], data=[2, 2, 2]).equals(
        metrics[desired_metric.id]
    )

    desired_metric = MetricConfiguration(
        metric_name="column.distinct_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"column.value_counts": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: {1, 2, 3}}
def test_median_metric_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3]}),
        batch_id="my_id",
    )

    desired_metric = MetricConfiguration(
        metric_name="table.row_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))

    row_count = MetricConfiguration(
        metric_name="table.row_count",
        metric_domain_kwargs={},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(row_count,), metrics=metrics)

    desired_metric = MetricConfiguration(
        metric_name="column.median",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"table.row_count": row_count},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 2}
def test_dataframe_property_given_loaded_batch(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]}),
        batch_id="1234",
    )
    df = engine.dataframe

    # Ensuring the loaded batch is returned unmodified
    assert engine.dataframe == df
def test_get_compute_domain_with_ge_experimental_condition_parser(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe

    # Filtering expected data based on row condition
    expected_df = df.where("b == 2")

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Obtaining data from computation
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column": "b",
            "row_condition": 'col("b") == 2',
            "condition_parser": "great_expectations__experimental__",
        },
        domain_type="column",
    )

    # Ensuring data has been properly queried
    assert dataframes_equal(
        data, expected_df
    ), "Data does not match after getting compute domain"
    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {"column": "b"}, "Accessor kwargs have been modified"

    # Should react differently for domain type identity
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column": "b",
            "row_condition": 'col("b") == 2',
            "condition_parser": "great_expectations__experimental__",
        },
        domain_type="identity",
    )

    # Ensuring data has been properly queried; for the identity domain the
    # engine narrows the data to the named column itself
    assert dataframes_equal(
        data, expected_df.select("b")
    ), "Data does not match after getting compute domain"
    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_get_compute_domain_with_unmeetable_row_condition_alt(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe
    expected_df = df.where("b > 24")

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"row_condition": "b > 24", "condition_parser": "spark"},
        domain_type="identity",
    )

    # Ensuring data has been properly queried
    assert dataframes_equal(
        data, expected_df
    ), "Data does not match after getting compute domain"
    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"

    # Ensuring errors for column and column_pair domains are caught
    with pytest.raises(GreatExpectationsError):
        # noinspection PyUnusedLocal
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "spark",
            },
            domain_type="column",
        )
    with pytest.raises(GreatExpectationsError):
        # noinspection PyUnusedLocal
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "spark",
            },
            domain_type="column_pair",
        )
def test_max_metric_spark_column_exists(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 1]}),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(partial_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": partial_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 2}
def test_get_domain_records_with_unmeetable_row_condition_alt(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe
    expected_df = df.where("b > 24")

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "row_condition": "b > 24",
            "condition_parser": "spark",
        }
    )
    # Ensuring data has been properly queried
    assert dataframes_equal(
        data, expected_df
    ), "Data does not match after getting compute domain"

    # Ensuring errors for column and column_pair domains are caught
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        # noinspection PyUnusedLocal
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "spark",
            },
            domain_type="column",
        )
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        # noinspection PyUnusedLocal
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "spark",
            },
            domain_type="column_pair",
        )
def test_resolve_metric_bundle_with_nonexistent_metric(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}),
        batch_id="1234",
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )

    # Ensuring a metric provider error is raised if the metric does not exist
    with pytest.raises(MetricProviderError) as e:
        # noinspection PyUnusedLocal
        res = engine.resolve_metrics(
            metrics_to_resolve=(
                desired_metric_1,
                desired_metric_2,
                desired_metric_3,
                desired_metric_4,
            )
        )
        print(e)
def test_get_compute_domain_with_column_domain_alt(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type="column"
    )

    # Ensuring that the column domain is now an accessor kwarg, and data remains unmodified
    assert dataframes_equal(
        data, df
    ), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {"column": "a"}, "Accessor kwargs have been modified"
def test_get_domain_records_with_nonexistent_condition_parser(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Expect GreatExpectationsError because the parser doesn't exist
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        # noinspection PyUnusedLocal
        data = engine.get_domain_records(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "nonexistent",
            }
        )
def test_get_compute_domain_with_multicolumn(
    spark_session, basic_spark_df_execution_engine
):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 4], "b": [2, 3, 4, None], "c": [1, 2, 3, None]}
        ),
        batch_id="1234",
    )
    df = engine.dataframe

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_list": ["a", "b", "c"]}, domain_type="multicolumn"
    )

    # Ensuring that with no domain nothing happens to the data itself
    assert dataframes_equal(
        data, df
    ), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {
        "column_list": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"

    # Checking for identity
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_list": ["a", "b", "c"]}, domain_type="identity"
    )

    # Ensuring that with no domain nothing happens to the data itself
    assert dataframes_equal(
        data, df
    ), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_list": ["a", "b", "c"]
    }, "Compute domain kwargs should not change for identity domain"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_map_unique_spark_column_does_not_exist(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )

    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        metrics = engine.resolve_metrics(
            metrics_to_resolve=(condition_metric,), metrics=metrics
        )
    assert (
        str(eee.value)
        == 'Error: The column "non_existent_column" in BatchData does not exist.'
    )
def test_get_domain_records_with_row_condition_alt(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}),
        batch_id="1234",
    )
    df = engine.dataframe
    expected_df = df.where("b > 2")

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "row_condition": "b > 2",
            "condition_parser": "spark",
        }
    )
    # Ensuring data has been properly queried
    assert dataframes_equal(
        data, expected_df
    ), "Data does not match after getting compute domain"
def test_map_value_set_spark(spark_session, basic_spark_df_execution_engine):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 3, None]}),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 0}

    # We run the same computation again, this time loading the data so that the
    # pandas None arrives as a (non-null) NaN rather than a SQL NULL, to
    # demonstrate how that changes the unexpected count.
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 1}
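# Sketch of the NULL-vs-NaN distinction the two halves of the test above rely
# on. The assertions assume Spark's default pandas conversion (no Arrow), in
# which a pandas NaN in a float column arrives as a non-null double NaN rather
# than a SQL NULL:
def _null_vs_nan_sketch(spark_session):
    pdf = pd.DataFrame({"a": [1.0, None]})

    # Direct conversion keeps NaN, which Spark does not treat as NULL:
    nan_df = spark_session.createDataFrame(pdf)
    assert nan_df.where("a IS NULL").count() == 0

    # Converting NaN to None first yields a true SQL NULL:
    null_df = spark_session.createDataFrame(
        [
            tuple(None if pd.isna(value) else value for value in record.tolist())
            for record in pdf.to_records(index=False)
        ],
        pdf.columns.tolist(),
    )
    assert null_df.where("a IS NULL").count() == 1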
def test_sparkdf_batch_aggregate_metrics(caplog, spark_session):
    import datetime

    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}),
        batch_id="1234",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_1,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_2,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_3,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_4,
            "table.columns": table_columns_metric,
        },
    )

    start = datetime.datetime.now()
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print(end - start)

    assert metrics[desired_metric_1.id] == 3
    assert metrics[desired_metric_2.id] == 1
    assert metrics[desired_metric_3.id] == 4
    assert metrics[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = any(
        record.message
        == "SparkDFExecutionEngine computed 4 metrics on domain_id ()"
        for record in caplog.records
    )
    assert found_message
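# What the log message checked above implies: the four aggregate partials can
# be evaluated in a single Spark job over one domain. A rough equivalent of
# that bundled computation (a sketch, not the engine's actual code path):
def _bundled_aggregates_sketch(spark_session):
    from pyspark.sql import functions as F

    df = spark_session.createDataFrame(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]})
    )
    # One action computes all four aggregates at once.
    row = df.agg(F.max("a"), F.min("a"), F.max("b"), F.min("b")).collect()[0]
    assert tuple(row) == (3, 1, 4, 4)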
def test_map_unique_spark_column_exists(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # "unique" is a *window* function, so it does not use the aggregate_fn
    # version of unexpected_count
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
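# Rough illustration of why "unique" is evaluated as a window rather than a
# plain aggregate: each row must be compared against the occurrence count of
# its own value. A sketch of that shape (not the engine's implementation):
def _unique_window_sketch(spark_session):
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    df = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3, 3, 4]}))
    counts = df.withColumn("n", F.count("a").over(Window.partitionBy("a")))
    # Rows whose value occurs more than once are the "unexpected" ones:
    assert counts.where("n > 1").count() == 2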
def test_z_score_under_threshold_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({"a": [1, 2, 3, 3, None]}),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": stdev,
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
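# Why the final count is zero: a self-contained check of the z-scores on the
# non-null values, assuming the metric computes (x - mean) / stdev with the
# sample standard deviation (matching Spark's stddev_samp):
def _z_score_arithmetic_sketch():
    import statistics

    values = [1, 2, 3, 3]                # the non-null entries of column "a"
    mean = statistics.mean(values)       # 2.25
    stdev = statistics.stdev(values)     # ~0.957
    z_scores = [(v - mean) / stdev for v in values]
    # All z-scores fall inside the double-sided threshold of 2.
    assert all(abs(z) < 2 for z in z_scores)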