def test_sa_batch_aggregate_metrics(caplog, sa):
    import datetime

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    # Build the aggregate partial ("aggregate_fn") metrics first; these produce
    # deferred SQLAlchemy expressions rather than computed values.
    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    # Resolve the full metrics from the partials; the engine should bundle all
    # four aggregates into a single query against one domain.
    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_1,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_2,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_3,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_4,
            "table.columns": table_columns_metric,
        },
    )
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert results[desired_metric_1.id] == 3
    assert results[desired_metric_2.id] == 1
    assert results[desired_metric_3.id] == 4
    assert results[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = False
    for record in caplog.records:
        if (
            record.message
            == "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()"
        ):
            found_message = True
    assert found_message
def test_z_score_under_threshold_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    # Resolve the aggregate partials for mean and standard deviation first.
    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    # Resolve the full mean and standard deviation metrics from the partials.
    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": stdev,
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    # Chain the z-score map, the under-threshold condition, and the unexpected count.
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
def test_z_score_under_threshold_pd():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
def test_map_unique_spark_column_exists(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # unique is a *window* function so does not use the aggregate_fn version of unexpected count
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
def test_map_unique_sa_column_exists(sa):
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # This is no longer a MAP_CONDITION because mssql does not support it. Instead, it is a WINDOW_CONDITION
    #
    # aggregate_fn = MetricConfiguration(
    #     metric_name="column_values.unique.unexpected_count.aggregate_fn",
    #     metric_domain_kwargs={"column": "a"},
    #     metric_value_kwargs=dict(),
    #     metric_dependencies={"unexpected_condition": condition_metric},
    # )
    # aggregate_fn_metrics = engine.resolve_metrics(
    #     metrics_to_resolve=(aggregate_fn,), metrics=metrics
    # )

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        # metric_dependencies={"metric_partial_fn": aggregate_fn},
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,),
        metrics=metrics,
        # metrics=aggregate_fn_metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "baz"), (3, "qux")]
def test_map_value_set_spark(spark_session, basic_spark_df_execution_engine):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 0}

    # We run the same computation again, this time with None being replaced by nan instead of NULL
    # to demonstrate this behavior
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 1}