Example #1
def test_get_domain_records_with_different_column_domain_and_filter_conditions(sa):
    df = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [2, 3, 4, 5, None], "c": [1, 2, 3, 4, None]}
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column": "a",
            "row_condition": 'col("a")<2',
            "condition_parser": "great_expectations__experimental__",
            "filter_conditions": [
                RowCondition(
                    condition='col("b").notnull()',
                    condition_type=RowConditionParserType.GE,
                )
            ],
        }
    )
    domain_data = engine.engine.execute(get_sqlalchemy_domain_data(data)).fetchall()

    expected_column_df = df.iloc[:1]
    engine = build_sa_engine(expected_column_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
Example #2
def test_get_compute_domain_with_column_pair(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Fetching data, compute_domain_kwargs, and accessor_kwargs
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"}, domain_type="column_pair"
    )

    # Check that the raw data matches the data after the compute domain has been applied
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    # With a column_pair domain, the data itself is untouched
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert (
        "column_A" not in compute_kwargs.keys()
        and "column_B" not in compute_kwargs.keys()
    ), "Column pair keys should not appear in compute kwargs"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"

    # Build a new engine so the original values are still available
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )
    data2, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_A": "a", "column_B": "b"}, domain_type="identity"
    )

    # For an identity domain, the selectable is narrowed to the named columns
    raw_data = engine.engine.execute(
        sa.select([sa.column("a"), sa.column("b")]).select_from(
            engine.active_batch_data.selectable
        )
    ).fetchall()
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data2)
    ).fetchall()

    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Compute kwargs should contain the column pair"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
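Note the asymmetry between the two calls above: with a column_pair domain the selectable comes back untouched and the pair lands in accessor_kwargs, while with an identity domain the pair moves into compute_kwargs and the returned selectable is narrowed to just columns "a" and "b" (which is why the reference query selects those two columns explicitly).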
Example #3
def test_get_compute_domain_with_column_domain(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Loading batch data
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN
    )

    # Check that the raw data matches the data after the compute domain has been applied
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    # Ensuring that the column domain is now an accessor kwarg, and data remains unmodified
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute kwargs should be empty"
    assert accessor_kwargs == {"column": "a"}, "Accessor kwargs have been modified"

    # Testing for identity
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Loading batch data
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.IDENTITY
    )

    # For an identity domain, only the named column is selected
    raw_data = engine.engine.execute(
        sa.select([sa.column("a")]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    # The column becomes a compute kwarg, and the data is narrowed accordingly
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs == {"column": "a"}, "Compute kwargs should contain the column"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
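The examples pass domain_type both as a plain string ("column_pair") and as an enum member (MetricDomainTypes.COLUMN). That only works if MetricDomainTypes is a string-valued enum; the sketch below shows the shape these tests imply, with member names and values inferred from usage on this page rather than copied from the library:

from enum import Enum

class MetricDomainTypesSketch(str, Enum):
    IDENTITY = "identity"
    COLUMN = "column"
    COLUMN_PAIR = "column_pair"
    MULTICOLUMN = "multicolumn"
    TABLE = "table"

# Because the enum derives from str, plain strings compare equal to members.
assert MetricDomainTypesSketch.COLUMN == "column"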
Example #4
def test_map_unique_sa_column_does_not_exist(sa):
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        metrics = engine.resolve_metrics(
            metrics_to_resolve=(condition_metric,), metrics=metrics
        )
    assert (
        'Error: The column "non_existent_column" in BatchData does not exist.'
        in str(eee.value)
    )
Example #5
def test_get_compute_domain_with_ge_experimental_condition_parser(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Obtaining data from computation
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column": "b",
            "row_condition": 'col("b") == 2',
            "condition_parser": "great_expectations__experimental__",
        },
        domain_type="column",
    )

    # Check that the raw data matches the data after the row condition has been applied
    raw_data = engine.engine.execute(
        sa.select(["*"])
        .select_from(engine.active_batch_data.selectable)
        .where(sa.column("b") == 2)
    ).fetchall()
    domain_data = engine.engine.execute(get_sqlalchemy_domain_data(data)).fetchall()

    assert raw_data == domain_data, "Data does not match after getting compute domain"

    # Ensuring compute kwargs have not been modified
    assert (
        "row_condition" in compute_kwargs.keys()
    ), "Row condition should be located within compute kwargs"
    assert accessor_kwargs == {"column": "b"}, "Accessor kwargs have been modified"
Example #6
def test_max_metric_sa_column_does_not_exist(sa):
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 1, None]}), sa)

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )

    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        results = engine.resolve_metrics(
            metrics_to_resolve=(partial_metric,), metrics=metrics
        )
        metrics.update(results)
    assert (
        'Error: The column "non_existent_column" in BatchData does not exist.'
        in str(eee.value)
    )
Example #7
def test_get_compute_domain_with_multicolumn(sa):
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 4], "b": [2, 3, 4, None], "c": [1, 2, 3, None]}
        ),
        sa,
    )

    # Obtaining compute domain
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column_list": ["a", "b", "c"]}, domain_type="multicolumn"
    )

    # Check that the raw data matches the data after the compute domain has been applied
    raw_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    # With a multicolumn domain, the data itself is untouched
    assert raw_data == domain_data, "Data does not match after getting compute domain"
    assert compute_kwargs is not None, "Compute kwargs should exist"
    assert accessor_kwargs == {
        "column_list": ["a", "b", "c"]
    }, "Accessor kwargs have been modified"
Example #8
def test_get_domain_records_with_column_domain_and_filter_conditions_raises_error_on_multiple_conditions(
    sa,
):
    df = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [2, 3, 4, 5, None], "c": [1, 2, 3, 4, None]}
    )
    engine = build_sa_engine(df, sa)
    with pytest.raises(ge_exceptions.GreatExpectationsError) as e:
        data = engine.get_domain_records(
            domain_kwargs={
                "column": "a",
                "row_condition": 'col("a")<2',
                "condition_parser": "great_expectations__experimental__",
                "filter_conditions": [
                    RowCondition(
                        condition='col("b").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                    RowCondition(
                        condition='col("c").notnull()',
                        condition_type=RowConditionParserType.GE,
                    ),
                ],
            }
        )
Example #9
def test_sa_batch_unexpected_condition_temp_table(caplog, sa):
    def validate_tmp_tables():
        temp_tables = [
            name
            for name in get_sqlite_temp_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        tables = [
            name
            for name in get_sqlite_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        assert len(temp_tables) == 0
        assert len(tables) == 0

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    validate_tmp_tables()

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    validate_tmp_tables()

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "unexpected_condition": condition_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )

    validate_tmp_tables()
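get_sqlite_temp_table_names and get_sqlite_table_names are helpers from the test suite; presumably they read SQLite's catalogs. A sketch of an equivalent query for the temp-table variant, using sqlite_temp_master, SQLite's built-in catalog of temporary objects:

def get_sqlite_temp_table_names_sketch(sqlite_engine):
    # sqlite_temp_master mirrors sqlite_master but lists temporary objects only
    rows = sqlite_engine.execute(
        "SELECT name FROM sqlite_temp_master WHERE type = 'table'"
    ).fetchall()
    return {row[0] for row in rows}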
Example #10
def test_get_compute_domain_with_nonexistent_condition_parser(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), sa
    )

    # Expect GreatExpectationsError because parser doesn't exist
    with pytest.raises(ge_exceptions.GreatExpectationsError) as e:
        data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
            domain_kwargs={
                "row_condition": "b > 24",
                "condition_parser": "nonexistent",
            },
            domain_type=MetricDomainTypes.TABLE,
        )
Example #11
def test_map_value_set_sa(sa):
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), sa)

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": desired_metric},
    )

    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 0}
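This example walks the full map-metric chain: the *.condition metric builds a boolean SQL expression, the *.unexpected_count.aggregate_fn partial wraps that condition in an aggregate without executing anything, and the final *.unexpected_count metric executes the bundled aggregate. Since every non-null value of "a" is in the value set {1, 2, 3} (the None row is not counted as unexpected), the resulting count is 0.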
Example #12
def test_max_metric_sa_column_exists(sa):
    engine = build_sa_engine(pd.DataFrame({"a": [1, 2, 1, None]}), sa)

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    partial_metric = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(partial_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": partial_metric,
            "table.columns": table_columns_metric,
        },
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 2}
Example #13
def test_distinct_metric_sa(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    desired_metric = MetricConfiguration(
        metric_name="column.value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"sort": "value", "collate": None},
    )
    desired_metric_b = MetricConfiguration(
        metric_name="column.value_counts",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs={"sort": "value", "collate": None},
    )

    metrics = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric, desired_metric_b)
    )
    assert pd.Series(index=[1, 2, 3], data=[2, 2, 2]).equals(metrics[desired_metric.id])
    assert pd.Series(index=[4], data=[6]).equals(metrics[desired_metric_b.id])

    desired_metric = MetricConfiguration(
        metric_name="column.distinct_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"column.value_counts": desired_metric},
    )
    desired_metric_b = MetricConfiguration(
        metric_name="column.distinct_values",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"column.value_counts": desired_metric_b},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric, desired_metric_b), metrics=metrics
    )
    assert results[desired_metric.id] == {1, 2, 3}
    assert results[desired_metric_b.id] == {4}
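The assertions above show how the two metrics relate: column.distinct_values is derived from column.value_counts, essentially by taking the index of the counts Series. A pandas sketch of that relationship:

import pandas as pd

value_counts = pd.Series(index=[1, 2, 3], data=[2, 2, 2])  # counts for column "a"
distinct_values = set(value_counts.index)
assert distinct_values == {1, 2, 3}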
Example #14
def test_sqlite_sample_using_limit(sa):
    csv_path: str = file_relative_path(
        os.path.dirname(os.path.dirname(__file__)),
        os.path.join(
            "test_sets",
            "taxi_yellow_tripdata_samples",
            "ten_trips_from_each_month",
            "yellow_tripdata_sample_10_trips_from_each_month.csv",
        ),
    )
    df: pd.DataFrame = pd.read_csv(csv_path)
    engine: SqlAlchemyExecutionEngine = build_sa_engine(df, sa)

    n: int = 10
    batch_spec: SqlAlchemyDatasourceBatchSpec = SqlAlchemyDatasourceBatchSpec(
        table_name="test",
        schema_name="main",
        sampling_method="sample_using_limit",
        sampling_kwargs={"n": n},
    )
    batch_data: SqlAlchemyBatchData = engine.get_batch_data(batch_spec=batch_spec)

    # Right number of rows?
    num_rows: int = batch_data.execution_engine.engine.execute(
        sa.select([sa.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == n

    # Right rows?
    rows = batch_data.execution_engine.engine.execute(
        sa.select([sa.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    row_dates: List[datetime.datetime] = [parse(row["pickup_datetime"]) for row in rows]
    for row_date in row_dates:
        assert row_date.month == 1
        assert row_date.year == 2018
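The sample_using_limit sampling method with {"n": 10} should amount to a plain LIMIT clause over the loaded table. A sketch of the equivalent query, assuming the table name "test" from the batch spec above; the month/year assertions then simply confirm that LIMIT keeps the first n rows of the CSV, which are the January 2018 trips:

import sqlalchemy  # the tests receive this module as the `sa` fixture

limited_query = (
    sqlalchemy.select([sqlalchemy.text("*")])
    .select_from(sqlalchemy.table("test"))
    .limit(10)
)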
Example #15
def test_resolve_metric_bundle_with_nonexistent_metric(sa):
    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )

    # Ensuring a MetricProviderError is raised if the metric does not exist
    with pytest.raises(ge_exceptions.MetricProviderError) as e:
        res = engine.resolve_metrics(
            metrics_to_resolve=(
                desired_metric_1,
                desired_metric_2,
                desired_metric_3,
                desired_metric_4,
            )
        )
    print(e)
Example #16
def in_memory_sqlite_taxi_ten_trips_per_month_execution_engine(sa):
    df: pd.DataFrame = ten_trips_per_month_df()
    convert_string_columns_to_datetime(
        df=df, column_names_to_convert=["pickup_datetime", "dropoff_datetime"]
    )
    engine: SqlAlchemyExecutionEngine = build_sa_engine(df, sa)
    return engine
Example #17
def test_map_unique_sa_column_exists(sa):
    engine = build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # This is no longer a MAP_CONDITION because mssql does not support it. Instead, it is a WINDOW_CONDITION
    #
    # aggregate_fn = MetricConfiguration(
    #     metric_name="column_values.unique.unexpected_count.aggregate_fn",
    #     metric_domain_kwargs={"column": "a"},
    #     metric_value_kwargs=dict(),
    #     metric_dependencies={"unexpected_condition": condition_metric},
    # )
    # aggregate_fn_metrics = engine.resolve_metrics(
    #     metrics_to_resolve=(aggregate_fn,), metrics=metrics
    # )

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        # metric_dependencies={"metric_partial_fn": aggregate_fn},
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,),
        metrics=metrics,  # metrics=aggregate_fn_metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "baz"), (3, "qux")]
Example #18
def test_get_domain_records_with_multicolumn_domain(sa):
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["a", "c"],
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "all_values_are_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_multicolumn_df = pd.DataFrame(
        {"a": [2, 3, 4, 5], "b": [3, 4, 5, 7], "c": [2, 3, 4, 6]}, index=[0, 1, 2, 4]
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6],
            "b": [2, 3, 4, 5, None, 6],
            "c": [1, 2, 3, 4, 5, None],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "row_condition": 'col("a")<5',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "any_value_is_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_multicolumn_df = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]}, index=[0, 1, 2, 3]
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "ignore_row_if": "never",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )
    engine = build_sa_engine(expected_multicolumn_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
Example #19
def test_get_domain_records_with_column_pair_domain(sa):
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6],
            "b": [2, 3, 4, 5, None, 6],
            "c": [1, 2, 3, 4, 5, None],
        }
    )
    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b",
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "both_values_are_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_column_pair_df = pd.DataFrame(
        {"a": [2, 3, 4, 6], "b": [3.0, 4.0, 5.0, 6.0], "c": [2.0, 3.0, 4.0, None]}
    )
    engine = build_sa_engine(expected_column_pair_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": 'col("b")>2',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "either_value_is_missing",
        }
    )
    domain_data = engine.engine.execute(
        sa.select(["*"]).select_from(data)
    ).fetchall()

    expected_column_pair_df = pd.DataFrame(
        {"a": [2, 3, 4], "b": [3, 4, 5], "c": [2, 3, 4]}
    )
    engine = build_sa_engine(expected_column_pair_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"

    engine = build_sa_engine(df, sa)
    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": 'col("a")<6',
            "condition_parser": "great_expectations__experimental__",
            "ignore_row_if": "neither",
        }
    )
    domain_data = engine.engine.execute(get_sqlalchemy_domain_data(data)).fetchall()

    expected_column_pair_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [2.0, 3.0, 4.0, 5.0, None],
            "c": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )
    engine = build_sa_engine(expected_column_pair_df, sa)
    expected_data = engine.engine.execute(
        sa.select(["*"]).select_from(engine.active_batch_data.selectable)
    ).fetchall()

    assert (
        domain_data == expected_data
    ), "Data does not match after getting full access compute domain"
Example #20
def test_sa_batch_aggregate_metrics(caplog, sa):
    import datetime

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_1,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_2,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_3,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
        metric_dependencies={
            "metric_partial_fn": desired_metric_4,
            "table.columns": table_columns_metric,
        },
    )
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert results[desired_metric_1.id] == 3
    assert results[desired_metric_2.id] == 1
    assert results[desired_metric_3.id] == 4
    assert results[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = False
    for record in caplog.records:
        if (
            record.message
            == "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()"
        ):
            found_message = True
    assert found_message
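The log message asserted above is the point of this example: all four aggregate partials share one domain, so the engine bundles them into a single round trip to the database. Conceptually, the bundled statement is one SELECT; a sketch reusing the `sa` fixture and `engine` from the test above (the actual aliasing and dialect handling are engine internals):

bundled = sa.select(
    [
        sa.func.max(sa.column("a")),
        sa.func.min(sa.column("a")),
        sa.func.max(sa.column("b")),
        sa.func.min(sa.column("b")),
    ]
).select_from(engine.active_batch_data.selectable)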
Example #21
def in_memory_sqlite_taxi_ten_trips_per_month_execution_engine(sa):
    engine: SqlAlchemyExecutionEngine = build_sa_engine(ten_trips_per_month_df(), sa)
    return engine
Example #22
def test_sqlite_split(
    taxi_test_cases: TaxiSplittingTestCasesBase,
    sa,
):
    """What does this test and why?
    Splitters should work with sqlite.
    """
    engine: SqlAlchemyExecutionEngine = build_sa_engine(taxi_test_cases.test_df, sa)

    test_cases: List[TaxiSplittingTestCase] = taxi_test_cases.test_cases()
    test_case: TaxiSplittingTestCase
    batch_spec: SqlAlchemyDatasourceBatchSpec
    for test_case in test_cases:
        if test_case.table_domain_test_case:
            batch_spec = SqlAlchemyDatasourceBatchSpec(
                table_name="test",
                schema_name="main",
                splitter_method=test_case.splitter_method_name,
                splitter_kwargs=test_case.splitter_kwargs,
                batch_identifiers={},
            )
        else:
            if taxi_test_cases.test_column_name:
                batch_spec = SqlAlchemyDatasourceBatchSpec(
                    table_name="test",
                    schema_name="main",
                    splitter_method=test_case.splitter_method_name,
                    splitter_kwargs=test_case.splitter_kwargs,
                    batch_identifiers={
                        taxi_test_cases.test_column_name: test_case.expected_column_values[0]
                    },
                )
            elif taxi_test_cases.test_column_names:
                column_name: str
                batch_spec = SqlAlchemyDatasourceBatchSpec(
                    table_name="test",
                    schema_name="main",
                    splitter_method=test_case.splitter_method_name,
                    splitter_kwargs=test_case.splitter_kwargs,
                    batch_identifiers={
                        column_name: test_case.expected_column_values[0][column_name]
                        for column_name in taxi_test_cases.test_column_names
                    },
                )
            else:
                raise ValueError(
                    "Missing test_column_name or test_column_names attribute."
                )

        batch_data: SqlAlchemyBatchData = engine.get_batch_data(batch_spec=batch_spec)

        # Right number of rows?
        num_rows: int = batch_data.execution_engine.engine.execute(
            sa.select([sa.func.count()]).select_from(batch_data.selectable)
        ).scalar()
        # noinspection PyUnresolvedReferences
        assert num_rows == test_case.num_expected_rows_in_first_batch_definition
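For a concrete sense of what one loop iteration builds, a single-column batch spec might look like the following; the splitter method and identifier value here are illustrative stand-ins, not values taken from TaxiSplittingTestCasesBase:

example_batch_spec = SqlAlchemyDatasourceBatchSpec(
    table_name="test",
    schema_name="main",
    splitter_method="split_on_column_value",
    splitter_kwargs={"column_name": "passenger_count"},
    batch_identifiers={"passenger_count": 2},
)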