def test_get_domain_records_with_different_column_domain_and_filter_conditions(sa):
    """Filter conditions must be applied on top of the row_condition when
    fetching domain records for a column domain.

    Fix: the RowCondition string was an f-string with no placeholders (F541);
    it is now a plain string literal with identical content.
    """
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition='col("b").notnull()',
                    condition_type=RowConditionParserType.GE,
                )
            ],
        }
    )
    domain_data = engine.engine.execute(get_sqlalchemy_domain_data(data)).fetchall()

    # Only the first row satisfies both col("a")<2 and col("b") not null.
    expected_column_df = df.iloc[:1]
    engine = build_sa_engine(expected_column_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
def test_get_compute_domain_with_column_pair(sa):
    """A column_pair domain keeps the data untouched and moves both column
    kwargs into the accessor; an identity domain keeps them as compute kwargs."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Fetching data, compute_domain_kwargs, accessor_kwargs
    pair_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"}, domain_type="column_pair"
    )

    # Compare the untouched batch data with the post-computation domain data.
    full_rows = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_rows = engine.engine.execute(
        sa.select(["*"]).select_from(pair_data)
    ).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert full_rows == domain_rows, "Data does not match after getting compute domain"
    assert all(
        key not in compute_kwargs.keys() for key in ("column_A", "column_B")
    ), "domain kwargs should be existent"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"

    # Building new engine so that values still found
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )
    identity_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"}, domain_type="identity"
    )

    # For identity, only the named columns should come back.
    full_rows = engine.engine.execute(
        sa.select([sa.column("a"), sa.column("b")]).select_from(
            engine.active_batch_data.selectable
        )
    ).fetchall()
    domain_rows = engine.engine.execute(
        sa.select(["*"]).select_from(identity_data)
    ).fetchall()

    assert full_rows == domain_rows, "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_get_compute_domain_with_column_domain(sa):
    """A COLUMN domain leaves data unmodified and exposes the column via the
    accessor kwargs; an IDENTITY domain narrows the data to that column."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Loading batch data
    column_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN
    )

    # Compare the raw batch against the domain after computation.
    baseline_rows = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_rows = engine.engine.execute(
        sa.select(["*"]).select_from(column_data)
    ).fetchall()

    # Ensuring that column domain is now an accessor kwarg, and data remains unmodified
    assert baseline_rows == domain_rows, "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {"column": "a"}, "Accessor kwargs have been modified"

    # Testing for identity
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    identity_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.IDENTITY
    )

    # Identity narrows to column "a" only.
    baseline_rows = engine.engine.execute(
        sa.select([sa.column("a")]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_rows = engine.engine.execute(
        sa.select(["*"]).select_from(identity_data)
    ).fetchall()

    assert baseline_rows == domain_rows, "Data does not match after getting compute domain"
    assert compute_kwargs == {"column": "a"}, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_map_unique_sa_column_does_not_exist(sa):
    """Resolving a unique-values condition against a missing column raises
    an ExecutionEngineError naming that column."""
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs={},
        metric_dependencies={"table.columns": table_columns_metric},
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        metrics = engine.resolve_metrics(
            metrics_to_resolve=(condition_metric,), metrics=metrics
        )
    assert (
        'Error: The column "non_existent_column" in BatchData does not exist.'
        in str(eee.value)
    )
def test_get_compute_domain_with_ge_experimental_condition_parser(sa):
    """The experimental condition parser should filter the domain rows while
    keeping the row_condition inside the compute kwargs."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Obtaining data from computation
    filtered_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column": "b",
            "row_condition": 'col("b") == 2',
            "condition_parser": "great_expectations__experimental__",
        },
        domain_type="column",
    )

    # Build the expected result by applying the equivalent WHERE clause by hand.
    expected_rows = engine.engine.execute(
        sa.select(["*"])
        .select_from(engine.active_batch_data.selectable)
        .where(sa.column("b") == 2)
    ).fetchall()
    domain_rows = engine.engine.execute(
        get_sqlalchemy_domain_data(filtered_data)
    ).fetchall()

    # Ensuring that column domain is now an accessor kwarg, and data remains unmodified
    assert expected_rows == domain_rows, "Data does not match after getting compute domain"

    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs.keys()
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {"column": "b"}, "Accessor kwargs have been modified"
def test_max_metric_sa_column_does_not_exist(sa):
    """Resolving a max aggregate partial against a missing column raises
    an ExecutionEngineError naming that column."""
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 1, None]}), sa)

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs={},
        metric_dependencies={"table.columns": table_columns_metric},
    )

    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        results = engine.resolve_metrics(
            metrics_to_resolve=(partial_metric,), metrics=metrics
        )
        metrics.update(results)
    assert (
        'Error: The column "non_existent_column" in BatchData does not exist.'
        in str(eee.value)
    )
def test_get_compute_domain_with_multicolumn(sa):
    """A multicolumn domain leaves the data unmodified and surfaces the
    column_list through the accessor kwargs."""
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 4], "b": [2, 3, 4, None], "c": [1, 2, 3, None]}
        ),
        sa,
    )

    # Obtaining compute domain
    multi_data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_list": ["a", "b", "c"]}, domain_type="multicolumn"
    )

    # Compare the raw batch against the post-computation domain data.
    baseline_rows = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_rows = engine.engine.execute(
        sa.select(["*"]).select_from(multi_data)
    ).fetchall()

    # Ensuring that with no domain nothing happens to the data itself
    assert baseline_rows == domain_rows, "Data does not match after getting compute domain"
    assert compute_kwargs is not None, "Compute domain kwargs should be existent"
    assert accessor_kwargs == {
        "column_list": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"
def test_get_domain_records_with_column_domain_and_filter_conditions_raises_error_on_multiple_conditions(
    sa,
):
    """Supplying more than one filter condition must raise a GreatExpectationsError.

    Fixes: both RowCondition strings were f-strings with no placeholders (F541),
    and the return value was bound to an unused local; the call's result is now
    discarded since only the raised exception matters.
    """
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2, 3, 4, 5, None],
            "c": [1, 2, 3, 4, None],
        }
    )
    engine = build_sa_engine(df, sa)
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        engine.get_domain_records(
            domain_kwargs={
                "column": "a",
                "row_condition": 'col("a")<2',
                "condition_parser": "great_expectations__experimental__",
                "filter_conditions": [
                    RowCondition(
                        condition='col("b").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                    RowCondition(
                        condition='col("c").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                ],
            }
        )
def test_sa_batch_unexpected_condition_temp_table(caplog, sa):
    """Resolving map-condition metrics must not leave ge_temp_ tables behind
    in the sqlite database, neither temporary nor permanent."""

    def validate_tmp_tables():
        # Any table whose name starts with "ge_temp_" would be a leak.
        leaked_temp = [
            name
            for name in get_sqlite_temp_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        leaked_permanent = [
            name
            for name in get_sqlite_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        assert len(leaked_temp) == 0
        assert len(leaked_permanent) == 0

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    validate_tmp_tables()

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={"table.columns": table_columns_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    validate_tmp_tables()

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )

    validate_tmp_tables()
def test_get_compute_domain_with_nonexistent_condition_parser(sa):
    """Requesting an unknown condition_parser must raise a GreatExpectationsError."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Expect GreatExpectationsError because parser doesn't exist
    with pytest.raises(ge_exceptions.GreatExpectationsError):
        engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "nonexistent",
            },
            domain_type=MetricDomainTypes.TABLE,
        )
def test_map_value_set_sa(sa):
    """column_values.in_set resolved via the aggregate-partial path should
    report zero unexpected values when every non-null value is in the set."""
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), sa)

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    in_set_kwargs = {"value_set": [1, 2, 3]}
    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=in_set_kwargs,
        metric_dependencies={"table.columns": table_columns_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=in_set_kwargs,
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )

    unexpected_count_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=in_set_kwargs,
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(unexpected_count_metric,), metrics=metrics
    )
    assert results == {unexpected_count_metric.id: 0}
def test_max_metric_sa_column_exists(sa):
    """column.max resolved through its aggregate partial should return the
    maximum non-null value of the column."""
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 1, None]}), sa)

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={"table.columns": table_columns_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(partial_metric,), metrics=metrics
    )
    metrics.update(results)

    max_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={
            "metric_partial_fn": partial_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(max_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {max_metric.id: 2}
def test_distinct_metric_sa(sa):
    """column.distinct_values built on top of column.value_counts should yield
    the distinct value sets of each column."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    value_counts_kwargs = {"sort": "value", "collate": None}
    counts_metric_a = MetricConfiguration(
        metric_name="column.value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=value_counts_kwargs,
    )
    counts_metric_b = MetricConfiguration(
        metric_name="column.value_counts",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=value_counts_kwargs,
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(counts_metric_a, counts_metric_b)
    )
    assert pd.Series(index=[1, 2, 3], data=[2, 2, 2]).equals(
        metrics[counts_metric_a.id]
    )
    assert pd.Series(index=[4], data=[6]).equals(metrics[counts_metric_b.id])

    distinct_metric_a = MetricConfiguration(
        metric_name="column.distinct_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={"column.value_counts": counts_metric_a},
    )
    distinct_metric_b = MetricConfiguration(
        metric_name="column.distinct_values",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs={},
        metric_dependencies={"column.value_counts": counts_metric_b},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(distinct_metric_a, distinct_metric_b), metrics=metrics
    )
    assert results[distinct_metric_a.id] == {1, 2, 3}
    assert results[distinct_metric_b.id] == {4}
def test_sqlite_sample_using_limit(sa):
    """sample_using_limit should return exactly n rows, and with this ordered
    taxi fixture those rows all come from January 2018."""
    csv_path: str = file_relative_path(
        os.path.dirname(os.path.dirname(__file__)),
        os.path.join(
            "test_sets",
            "taxi_yellow_tripdata_samples",
            "ten_trips_from_each_month",
            "yellow_tripdata_sample_10_trips_from_each_month.csv",
        ),
    )
    taxi_df: pd.DataFrame = pd.read_csv(csv_path)

    engine: SqlAlchemyExecutionEngine = build_sa_engine(taxi_df, sa)

    n: int = 10
    batch_spec: SqlAlchemyDatasourceBatchSpec = SqlAlchemyDatasourceBatchSpec(
        table_name="test",
        schema_name="main",
        sampling_method="sample_using_limit",
        sampling_kwargs={"n": n},
    )
    batch_data: SqlAlchemyBatchData = engine.get_batch_data(batch_spec=batch_spec)

    # Right number of rows?
    num_rows: int = batch_data.execution_engine.engine.execute(
        sa.select([sa.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == n

    # Right rows?
    rows: sa.Row = batch_data.execution_engine.engine.execute(
        sa.select([sa.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    row_dates: List[datetime.datetime] = [
        parse(row["pickup_datetime"]) for row in rows
    ]
    for row_date in row_dates:
        assert row_date.month == 1
        assert row_date.year == 2018
def test_resolve_metric_bundle_with_nonexistent_metric(sa):
    """Bundling a metric name that has no provider raises MetricProviderError."""
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    unique_metric = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    min_metric = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    max_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )
    bogus_metric = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )

    # Ensuring a metric provider error is raised if metric does not exist
    with pytest.raises(ge_exceptions.MetricProviderError) as e:
        engine.resolve_metrics(
            metrics_to_resolve=(
                unique_metric,
                min_metric,
                max_metric,
                bogus_metric,
            )
        )
        print(e)
def in_memory_sqlite_taxi_ten_trips_per_month_execution_engine(sa):
    """Build an in-memory sqlite execution engine over the ten-trips-per-month
    taxi data, with its string datetime columns converted to datetimes."""
    taxi_df: pd.DataFrame = ten_trips_per_month_df()
    convert_string_columns_to_datetime(
        df=taxi_df,
        column_names_to_convert=["pickup_datetime", "dropoff_datetime"],
    )
    engine: SqlAlchemyExecutionEngine = build_sa_engine(taxi_df, sa)
    return engine
def test_map_unique_sa_column_exists(sa):
    """Exercise the full column_values.unique metric family (count, values,
    value counts, rows) on a column with one duplicated value."""
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={"table.columns": table_columns_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # This is no longer a MAP_CONDITION because mssql does not support it.
    # Instead, it is a WINDOW_CONDITION, so unexpected_count depends directly
    # on the condition metric rather than on an aggregate partial.
    result_format_kwargs = {
        "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
    }

    count_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(count_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[count_metric.id] == 2

    values_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=result_format_kwargs,
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(values_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[values_metric.id] == [3, 3]

    value_counts_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=result_format_kwargs,
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(value_counts_metric,), metrics=metrics
    )
    assert results[value_counts_metric.id] == [(3, 2)]

    rows_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=result_format_kwargs,
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(rows_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[rows_metric.id] == [(3, "baz"), (3, "qux")]
def test_get_domain_records_with_multicolumn_domain(sa):
    """get_domain_records for a multicolumn domain must honor all three
    ignore_row_if modes: all_values_are_missing, any_value_is_missing, never."""

    def fetch_all(engine_, selectable):
        # All rows of the given selectable via SELECT *.
        return engine_.engine.execute(
            sa.select(["*"]).select_from(selectable)
        ).fetchall()

    # Case 1: drop rows where ALL of the listed columns are missing.
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["a", "c"],
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "all_values_are_missing",
        }
    )
    domain_data = fetch_all(engine, data)

    expected_multicolumn_df = pd.DataFrame(
        {"a": [2, 3, 4, 5], "b": [3, 4, 5, 7], "c": [2, 3, 4, 6]},
        index=[0, 1, 2, 4],
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = fetch_all(engine, engine.active_batch_data.selectable)

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    # Case 2: drop rows where ANY of the listed columns is missing.
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6],
            "b": [2, 3, 4, 5, None, 6],
            "c": [1, 2, 3, 4, 5, None],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "row_condition": 'col("a")<5',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "any_value_is_missing",
        }
    )
    domain_data = fetch_all(engine, data)

    expected_multicolumn_df = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]},
        index=[0, 1, 2, 3],
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = fetch_all(engine, engine.active_batch_data.selectable)

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    # Case 3: "never" keeps every row regardless of missing values.
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "ignore_row_if": "never",
        }
    )
    domain_data = fetch_all(engine, data)

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = fetch_all(engine, engine.active_batch_data.selectable)

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
def test_get_domain_records_with_column_pair_domain(sa):
    """get_domain_records for a column-pair domain must honor all three
    ignore_row_if modes: both_values_are_missing, either_value_is_missing,
    neither."""
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6],
            "b": [2, 3, 4, 5, None, 6],
            "c": [1, 2, 3, 4, 5, None],
        }
    )

    def fetch_expected(expected_df):
        # Round-trip the expected frame through its own engine so both sides
        # of the comparison are produced by the same SELECT * machinery.
        expected_engine = build_sa_engine(expected_df, sa)
        return expected_engine.engine.execute(
            sa.select(["*"]).select_from(expected_engine.active_batch_data.selectable)
        ).fetchall()

    # Case 1: drop rows where BOTH pair columns are missing.
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b",
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "both_values_are_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_data = fetch_expected(
        pd.DataFrame(
            {
                "a": [2, 3, 4, 6],
                "b": [3.0, 4.0, 5.0, 6.0],
                "c": [2.0, 3.0, 4.0, None],
            }
        )
    )
    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    # Case 2: drop rows where EITHER pair column is missing.
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "either_value_is_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_data = fetch_expected(
        pd.DataFrame({"a": [2, 3, 4], "b": [3, 4, 5], "c": [2, 3, 4]})
    )
    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    # Case 3: "neither" keeps rows regardless of missing pair values.
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": 'col("a")<6',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "neither",
        }
    )
    domain_data = engine.engine.execute(
        get_sqlalchemy_domain_data(data)
    ).fetchall()

    expected_data = fetch_expected(
        pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5],
                "b": [2.0, 3.0, 4.0, 5.0, None],
                "c": [1.0, 2.0, 3.0, 4.0, 5.0],
            }
        )
    )
    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
def test_sa_batch_aggregate_metrics(caplog, sa):
    """Four aggregate metrics over the same table should be bundled into a
    single query, confirmed via the engine's debug log message."""
    import datetime

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}
    table_columns_metric: MetricConfiguration
    results: dict
    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    def make_config(metric_name, column, dependencies):
        # Small local factory to cut down the repetition below.
        return MetricConfiguration(
            metric_name=metric_name,
            metric_domain_kwargs={"column": column},
            metric_value_kwargs=None,
            metric_dependencies=dependencies,
        )

    table_dep = {"table.columns": table_columns_metric}
    partial_max_a = make_config("column.max.aggregate_fn", "a", dict(table_dep))
    partial_min_a = make_config("column.min.aggregate_fn", "a", dict(table_dep))
    partial_max_b = make_config("column.max.aggregate_fn", "b", dict(table_dep))
    partial_min_b = make_config("column.min.aggregate_fn", "b", dict(table_dep))
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            partial_max_a,
            partial_min_a,
            partial_max_b,
            partial_min_b,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    max_a = make_config(
        "column.max", "a", {"metric_partial_fn": partial_max_a, **table_dep}
    )
    min_a = make_config(
        "column.min", "a", {"metric_partial_fn": partial_min_a, **table_dep}
    )
    max_b = make_config(
        "column.max", "b", {"metric_partial_fn": partial_max_b, **table_dep}
    )
    min_b = make_config(
        "column.min", "b", {"metric_partial_fn": partial_min_b, **table_dep}
    )

    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    results = engine.resolve_metrics(
        metrics_to_resolve=(max_a, min_a, max_b, min_b),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert results[max_a.id] == 3
    assert results[min_a.id] == 1
    assert results[max_b.id] == 4
    assert results[min_b.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = any(
        record.message
        == "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()"
        for record in caplog.records
    )
    assert found_message
def in_memory_sqlite_taxi_ten_trips_per_month_execution_engine(sa):
    """Build an in-memory sqlite execution engine over the ten-trips-per-month
    taxi data.

    NOTE(review): a function with this exact name appears earlier in the file
    and additionally converts the datetime columns — confirm only one
    definition is intended, since the later one shadows the earlier.
    """
    monthly_trips_df: pd.DataFrame = ten_trips_per_month_df()
    engine: SqlAlchemyExecutionEngine = build_sa_engine(monthly_trips_df, sa)
    return engine
def test_sqlite_split(
    taxi_test_cases: TaxiSplittingTestCasesBase,
    sa,
):
    """What does this test and why?
    splitters should work with sqlite.

    Fix: the ValueError message previously said "test_column_names or
    test_column_names" (the same attribute twice); it now names both
    attributes actually checked, test_column_name and test_column_names.
    """
    engine: SqlAlchemyExecutionEngine = build_sa_engine(taxi_test_cases.test_df, sa)

    test_cases: List[TaxiSplittingTestCase] = taxi_test_cases.test_cases()
    test_case: TaxiSplittingTestCase
    batch_spec: SqlAlchemyDatasourceBatchSpec
    for test_case in test_cases:
        if test_case.table_domain_test_case:
            batch_spec = SqlAlchemyDatasourceBatchSpec(
                table_name="test",
                schema_name="main",
                splitter_method=test_case.splitter_method_name,
                splitter_kwargs=test_case.splitter_kwargs,
                batch_identifiers={},
            )
        else:
            if taxi_test_cases.test_column_name:
                batch_spec = SqlAlchemyDatasourceBatchSpec(
                    table_name="test",
                    schema_name="main",
                    splitter_method=test_case.splitter_method_name,
                    splitter_kwargs=test_case.splitter_kwargs,
                    batch_identifiers={
                        taxi_test_cases.test_column_name: test_case.expected_column_values[
                            0
                        ]
                    },
                )
            elif taxi_test_cases.test_column_names:
                column_name: str
                batch_spec = SqlAlchemyDatasourceBatchSpec(
                    table_name="test",
                    schema_name="main",
                    splitter_method=test_case.splitter_method_name,
                    splitter_kwargs=test_case.splitter_kwargs,
                    batch_identifiers={
                        column_name: test_case.expected_column_values[0][column_name]
                        for column_name in taxi_test_cases.test_column_names
                    },
                )
            else:
                raise ValueError(
                    "Missing test_column_name or test_column_names attribute."
                )

        batch_data: SqlAlchemyBatchData = engine.get_batch_data(batch_spec=batch_spec)

        # Right number of rows?
        num_rows: int = batch_data.execution_engine.engine.execute(
            sa.select([sa.func.count()]).select_from(batch_data.selectable)
        ).scalar()
        # noinspection PyUnresolvedReferences
        assert num_rows == test_case.num_expected_rows_in_first_batch_definition