Beispiel #1
0
def test_build_suite_when_suite_already_exists(mock_emit, cardinality_dataset):
    """
    What does this test do and why?
    Checks that calling build_suite() a second time on the same profiler replaces the
    suite produced by the first call rather than adding to it.
    """
    table_only_profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    # First build: the row-count expectation is excluded, so one expectation is expected.
    first_suite = table_only_profiler.build_suite()
    _, first_types = get_set_of_columns_and_expectations_from_suite(first_suite)
    assert len(first_suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in first_types

    # Second build: swap the exclusion; the suite must still hold exactly one expectation.
    table_only_profiler.excluded_expectations = [
        "expect_table_columns_to_match_ordered_list",
    ]
    second_suite = table_only_profiler.build_suite()
    _, second_types = get_set_of_columns_and_expectations_from_suite(second_suite)
    assert len(second_suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in second_types

    # Note 20211209 - The only usage_statistics-instrumented method the Profiler calls at the moment is
    # ExpectationSuite's add_expectation(), and that does not emit an event when invoked from a Profiler.
    # The expected count may change if the instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
Beispiel #2
0
def test_config_with_not_null_only(
    titanic_data_context_modular_api, nulls_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Checks the effect of the not_null_only option on which nullity expectations end up
    in the built suite.
    """
    # Exclude everything except the expectations whose name mentions "null".
    non_null_expectations = [
        name for name in possible_expectations_set if "null" not in name
    ]

    # not_null_only=False: both nullity expectations are expected.
    suite_allowing_null = UserConfigurableProfiler(
        nulls_validator, non_null_expectations, not_null_only=False
    ).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(suite_allowing_null)
    assert found_types == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    # not_null_only=True: only the not-null expectation is expected.
    suite_not_null_only = UserConfigurableProfiler(
        nulls_validator, non_null_expectations, not_null_only=True
    ).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(suite_not_null_only)
    assert found_types == {"expect_column_values_to_not_be_null"}

    # No configuration at all: the to-be-null expectation is still expected to appear.
    default_suite = UserConfigurableProfiler(nulls_validator).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(default_suite)
    assert "expect_column_values_to_be_null" in found_types
Beispiel #3
0
def test_profiler_all_expectation_types_spark(
    titanic_data_context_modular_api,
    taxi_validator_spark,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Builds a suite from the spark taxi validator and checks that every possible
    expectation type, apart from a known set of exceptions, is produced and validates.
    """
    data_context = titanic_data_context_modular_api

    spark_profiler = UserConfigurableProfiler(
        taxi_validator_spark,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert spark_profiler.column_info.get("rate_code_id")

    # parse_strings_as_datetimes is deprecated in V3
    with pytest.deprecated_call():
        built_suite = spark_profiler.build_suite()

    assert len(built_suite.expectations) == 45
    profiled_columns, profiled_types = get_set_of_columns_and_expectations_from_suite(
        built_suite
    )

    # Expectation types that this configuration is not expected to generate.
    not_generated = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert profiled_types == set(possible_expectations_set) - not_generated

    # No ignored column may have received an expectation.
    leaked_columns = [
        column for column in profiled_columns if column in taxi_data_ignored_columns
    ]
    assert not leaked_columns

    # parse_strings_as_datetimes is deprecated in V3
    with pytest.deprecated_call():
        validation_results = data_context.run_validation_operator(
            "action_list_operator", assets_to_validate=[taxi_validator_spark]
        )

    assert validation_results["success"]
Beispiel #4
0
def test_user_configurable_profiler_progress_bar_config_enabled(
    mock_tqdm, cardinality_validator
):
    """
    Builds a suite without touching the validator's progress-bar configuration and
    checks that tqdm was invoked exactly once.
    """
    bar_profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict={
            "numeric": ["col_few", "col_many", "col_very_many"],
            "value_set": ["col_two", "col_very_few"],
        },
    )
    bar_profiler.build_suite()

    assert mock_tqdm.called
    assert mock_tqdm.call_count == 1
Beispiel #5
0
def test_build_suite_with_config_and_no_semantic_types_dict(
    titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Exercises build_suite with explicit configuration options but without a
    semantic_types dict, and checks the resulting columns and expectation types.
    """
    configured_profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    built_suite = configured_profiler.build_suite()
    profiled_columns, profiled_types = get_set_of_columns_and_expectations_from_suite(
        built_suite
    )

    # Only the non-ignored columns are expected to carry expectations.
    assert profiled_columns == {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert profiled_types.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in profiled_types
    assert len(built_suite.expectations) == 29
def test_build_suite_with_config_and_no_semantic_types_dict(
    mock_emit, titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Exercises build_suite with explicit configuration options but without a
    semantic_types dict, then checks the single usage-statistics event that is emitted.
    """
    profiler_options = {
        "ignored_columns": ["Survived", "Unnamed: 0"],
        "excluded_expectations": ["expect_column_mean_to_be_between"],
        "primary_or_compound_key": ["Name"],
        "table_expectations_only": False,
        "value_set_threshold": "very_few",
    }
    configured_profiler = UserConfigurableProfiler(titanic_validator, **profiler_options)
    built_suite = configured_profiler.build_suite()
    profiled_columns, profiled_types = get_set_of_columns_and_expectations_from_suite(
        built_suite
    )

    # Only the non-ignored columns are expected to carry expectations.
    assert profiled_columns == {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert profiled_types.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in profiled_types
    assert len(built_suite.expectations) == 29

    # Exactly one event is expected, and it must not be the add_expectation event.
    assert mock_emit.call_count == 1
    first_event_name = mock_emit.call_args_list[0][0][0]["event"]
    assert first_event_name != "expectation_suite.add_expectation"

    build_suite_payload = {
        "profile_dataset_type": "Validator",
        "excluded_expectations_specified": True,
        "ignored_columns_specified": True,
        "not_null_only": False,
        "primary_or_compound_key_specified": True,
        "semantic_types_dict_specified": False,
        "table_expectations_only": False,
        "value_set_threshold_specified": True,
        "api_version": "v2",
    }
    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call] = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": build_suite_payload,
                "success": True,
            }
        ),
    ]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call] = mock_emit.call_args_list
    assert actual_events == expected_events
def test_config_with_not_null_only(
    titanic_data_context_modular_api, possible_expectations_set
):
    """
    What does this test do and why?
    Checks the effect of the not_null_only option on which nullity expectations end up
    in the built suite, using a two-column frame with complementary null patterns.
    """
    # Exclude everything except the expectations whose name mentions "null".
    non_null_expectations = [
        name for name in possible_expectations_set if "null" not in name
    ]

    # "mostly_null" keeps a value only at multiples of 3; "mostly_not_null" is the inverse.
    row_ids = range(1000)
    nulls_frame = pd.DataFrame(
        {
            "mostly_null": [n if n % 3 == 0 else None for n in row_ids],
            "mostly_not_null": [None if n % 3 == 0 else n for n in row_ids],
        }
    )
    runtime_validator = get_pandas_runtime_validator(
        titanic_data_context_modular_api, nulls_frame
    )

    # not_null_only=False: both nullity expectations are expected.
    suite_allowing_null = UserConfigurableProfiler(
        runtime_validator, non_null_expectations, not_null_only=False
    ).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(suite_allowing_null)
    assert found_types == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    # not_null_only=True: only the not-null expectation is expected.
    suite_not_null_only = UserConfigurableProfiler(
        runtime_validator, non_null_expectations, not_null_only=True
    ).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(suite_not_null_only)
    assert found_types == {"expect_column_values_to_not_be_null"}

    # No configuration at all: the to-be-null expectation is still expected to appear.
    default_suite = UserConfigurableProfiler(runtime_validator).build_suite()
    _, found_types = get_set_of_columns_and_expectations_from_suite(default_suite)
    assert "expect_column_values_to_be_null" in found_types
def test_profiler_all_expectation_types_pandas(
    titanic_data_context_modular_api,
    taxi_validator_pandas,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Builds a suite from the pandas taxi validator and checks that every possible
    expectation type, apart from a known set of exceptions, is produced and validates.
    """
    data_context = titanic_data_context_modular_api

    compound_key = [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "trip_distance",
        "pickup_location_id",
        "dropoff_location_id",
    ]
    pandas_profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        primary_or_compound_key=compound_key,
    )

    assert pandas_profiler.column_info.get("rate_code_id")

    built_suite = pandas_profiler.build_suite()
    assert len(built_suite.expectations) == 41

    profiled_columns, profiled_types = get_set_of_columns_and_expectations_from_suite(
        built_suite
    )

    # Expectation types that this configuration is not expected to generate.
    not_generated = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_column_values_to_be_between",
    }
    assert profiled_types == set(possible_expectations_set) - not_generated

    # No ignored column may have received an expectation.
    leaked_columns = [
        column for column in profiled_columns if column in taxi_data_ignored_columns
    ]
    assert not leaked_columns

    validation_results = data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_pandas]
    )
    assert validation_results["success"]
Beispiel #9
0
def test_build_suite_no_config(titanic_validator, possible_expectations_set):
    """
    What does this test do and why?
    Builds a suite from a profiler created with no configuration and checks the
    expectation types and the total expectation count.
    """
    built_suite = UserConfigurableProfiler(titanic_validator).build_suite()
    profiled_types = {
        expectation.expectation_type for expectation in built_suite.expectations
    }

    assert profiled_types.issubset(possible_expectations_set)
    assert len(built_suite.expectations) == 48
Beispiel #10
0
def test_user_configurable_profiler_progress_bar_config_disabled(
    mock_tqdm, cardinality_validator
):
    """
    Sets ProgressBarsConfig(profilers=False) on the validator's data context config and
    checks that building a suite never invokes tqdm.
    """
    context = cardinality_validator.data_context
    bars_off = ProgressBarsConfig(profilers=False)
    context.project_config_with_variables_substituted.progress_bars = bars_off

    quiet_profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict={
            "numeric": ["col_few", "col_many", "col_very_many"],
            "value_set": ["col_two", "col_very_few"],
        },
    )
    quiet_profiler.build_suite()

    assert not mock_tqdm.called
    assert mock_tqdm.call_count == 0
Beispiel #11
0
def test_build_suite_when_suite_already_exists(cardinality_dataset):
    """
    What does this test do and why?
    Checks that calling build_suite() a second time on the same profiler replaces the
    suite produced by the first call rather than adding to it.
    """
    table_only_profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    # First build: the row-count expectation is excluded, so one expectation is expected.
    first_suite = table_only_profiler.build_suite()
    _, first_types = get_set_of_columns_and_expectations_from_suite(first_suite)
    assert len(first_suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in first_types

    # Second build: swap the exclusion; the suite must still hold exactly one expectation.
    table_only_profiler.excluded_expectations = [
        "expect_table_columns_to_match_ordered_list",
    ]
    second_suite = table_only_profiler.build_suite()
    _, second_types = get_set_of_columns_and_expectations_from_suite(second_suite)
    assert len(second_suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in second_types
Beispiel #12
0
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_dataset,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict.

    Checks that the ignored column and the excluded expectation are absent from the
    suite, that only known expectation types are produced, that value-set expectations
    are limited to the columns declared as "value_set", and that no usage-statistics
    event is emitted.
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    # "col_one" is the column passed in ignored_columns, so it must not carry any expectation.
    assert "col_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 33

    value_set_columns = {
        expectation.kwargs.get("column")
        for expectation in suite.expectations
        if expectation.expectation_type == "expect_column_values_to_be_in_set"
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
Beispiel #13
0
def test_nullity_expectations_mostly_tolerance(
    nulls_dataset, possible_expectations_set
):
    """
    Builds a suite restricted to the nullity expectations and checks that each one
    carries a "mostly" kwarg of 0.66.
    """
    # Exclude everything except the expectations whose name mentions "null".
    non_null_expectations = [
        name for name in possible_expectations_set if "null" not in name
    ]

    nullity_suite = UserConfigurableProfiler(
        nulls_dataset, non_null_expectations, not_null_only=False
    ).build_suite()

    for expectation in nullity_suite.expectations:
        assert expectation["kwargs"]["mostly"] == 0.66
def test_build_suite_no_config(
    mock_emit,
    titanic_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Builds a suite from a profiler created with no configuration, checks the expectation
    types and count, then checks the single usage-statistics event that is emitted.
    """
    built_suite = UserConfigurableProfiler(titanic_validator).build_suite()
    profiled_types = {
        expectation.expectation_type for expectation in built_suite.expectations
    }

    assert profiled_types.issubset(possible_expectations_set)
    assert len(built_suite.expectations) == 48

    # Note 20211209 - The Profiler also calls ExpectationSuite's add_expectation(), but that call
    # does not send a usage_stats event when it originates from a Profiler.
    assert mock_emit.call_count == 1
    first_event_name = mock_emit.call_args_list[0][0][0]["event"]
    assert first_event_name != "expectation_suite.add_expectation"

    build_suite_payload = {
        "profile_dataset_type": "Validator",
        "excluded_expectations_specified": False,
        "ignored_columns_specified": False,
        "not_null_only": False,
        "primary_or_compound_key_specified": False,
        "semantic_types_dict_specified": False,
        "table_expectations_only": False,
        "value_set_threshold_specified": True,
        "api_version": "v2",
    }
    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call] = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": build_suite_payload,
                "success": True,
            }
        ),
    ]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call] = mock_emit.call_args_list
    assert actual_events == expected_events
Beispiel #15
0
def test_profiled_dataset_passes_own_validation(
    cardinality_validator, titanic_data_context
):
    """
    What does this test do and why?
    Checks that a suite profiled from a validator (ignoring "col_none") succeeds when
    that same validator is run through the action_list_operator.
    """
    built_suite = UserConfigurableProfiler(
        cardinality_validator, ignored_columns=["col_none"]
    ).build_suite()

    # Persist the suite before running the validation operator against the same validator.
    titanic_data_context.save_expectation_suite(built_suite)
    validation_results = titanic_data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[cardinality_validator]
    )

    assert validation_results["success"]
Beispiel #16
0
def test_build_suite_with_semantic_types_dict(
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict.

    Checks that the ignored column and the excluded expectation are absent from the
    suite, that only known expectation types are produced, and that value-set
    expectations are limited to the columns declared as "value_set".
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    # "col_one" is the column passed in ignored_columns, so it must not carry any expectation.
    assert "col_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_columns = {
        expectation.kwargs.get("column")
        for expectation in suite.expectations
        if expectation.expectation_type == "expect_column_values_to_be_in_set"
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}
# </snippet>
# Obtain a validator for the batch described by mysql_runtime_batch_request
# (`context` and the batch request are defined outside this excerpt).
# <snippet>
validator = context.get_validator(batch_request=mysql_runtime_batch_request, )
# </snippet>
# Create a profiler over that validator, passing two expectation types as excluded_expectations.
# <snippet>
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=[
        "expect_column_quantile_values_to_be_between",
        "expect_column_mean_to_be_between",
    ],
)
# </snippet>
# Build the suite and save it in the context under the name "compare_two_tables".
# <snippet>
expectation_suite_name = "compare_two_tables"
suite = profiler.build_suite()
context.save_expectation_suite(expectation_suite=suite,
                               expectation_suite_name=expectation_suite_name)
# </snippet>
# Add a SimpleCheckpoint whose YAML config references the suite name defined earlier in this script.
# NOTE(review): `yaml` is defined outside this excerpt. If it is the PyYAML module rather than
# a pre-configured loader object, `yaml.load` without an explicit Loader is unsafe - confirm.
# <snippet>
my_checkpoint_name = "comparison_checkpoint"

yaml_config = f"""
name: {my_checkpoint_name}
config_version: 1.0
class_name: SimpleCheckpoint
run_name_template: "%Y%m%d-%H%M%S-my-run-name-template"
expectation_suite_name: {expectation_suite_name}
"""

context.add_checkpoint(**yaml.load(yaml_config))
Beispiel #18
0
def test_profiler_all_expectation_types(
    titanic_data_context, possible_expectations_set
):
    """
    What does this test do and why?
    Builds a suite from a PandasDataset of the taxi sample CSV and checks that every
    possible expectation type, apart from a known set of exceptions, is produced and
    validates.
    """
    data_context = titanic_data_context

    sample_csv_path = file_relative_path(
        __file__,
        "../test_sets/taxi_yellow_trip_data_samples/yellow_trip_data_sample_2019-01.csv",
    )
    taxi_dataset = ge.dataset.PandasDataset(ge.read_csv(sample_csv_path))

    columns_to_ignore = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    # Some columns are deliberately listed under more than one semantic type.
    semantic_type_map = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }
    compound_key = [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "trip_distance",
        "pickup_location_id",
        "dropoff_location_id",
    ]

    taxi_profiler = UserConfigurableProfiler(
        taxi_dataset,
        semantic_types_dict=semantic_type_map,
        ignored_columns=columns_to_ignore,
        primary_or_compound_key=compound_key,
    )

    assert taxi_profiler.column_info.get("rate_code_id")

    built_suite = taxi_profiler.build_suite()
    assert len(built_suite.expectations) == 46

    profiled_columns, profiled_types = get_set_of_columns_and_expectations_from_suite(
        built_suite
    )

    # Expectation types that this configuration is not expected to generate.
    not_generated = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert profiled_types == set(possible_expectations_set) - not_generated

    # No ignored column may have received an expectation.
    leaked_columns = [
        column for column in profiled_columns if column in columns_to_ignore
    ]
    assert not leaked_columns

    validation_results = data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_dataset]
    )
    assert validation_results["success"]
def test_expect_compound_columns_to_be_unique(
    taxi_validator_spark, taxi_data_ignored_columns, caplog
):
    """
    Temporary test, to be kept until every V3 ExecutionEngine implements this expectation:
    1) Pass in the "taxi_validator_" fixture of an ExecutionEngine subclass that does not yet implement the
       expectation (and keep the :param annotation below in sync);
    2) Each time another ExecutionEngine gains an implementation, add the expectation to the relevant assertion
       of the matching "test_profiler_all_expectation_types_" test method;
    3) When all ExecutionEngine subclasses implement the expectation, remove this test method altogether.

    :param taxi_validator_spark:
    :param taxi_data_ignored_columns:
    :param caplog:
    :return:
    """

    validator_under_test = taxi_validator_spark

    # Ignore every remaining column so that only table-level and key expectations are produced.
    all_ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]
    compound_key = [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "trip_distance",
        "pickup_location_id",
        "dropoff_location_id",
    ]

    compound_key_profiler = UserConfigurableProfiler(
        validator_under_test,
        ignored_columns=all_ignored_columns,
        primary_or_compound_key=compound_key,
    )
    with caplog.at_level(logging.WARNING):
        compound_suite = compound_key_profiler.build_suite()

    # Building with a compound key must not log anything at WARNING level.
    warning_records = [
        record for record in caplog.records if record.levelname == "WARNING"
    ]
    assert len(warning_records) == 0
    assert len(compound_suite.expectations) == 3

    _, compound_types = get_set_of_columns_and_expectations_from_suite(compound_suite)
    assert {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_compound_columns_to_be_unique",
    } == compound_types

    # A single-column key is expected to yield the per-column uniqueness expectation instead.
    single_key_profiler = UserConfigurableProfiler(
        validator_under_test,
        ignored_columns=all_ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )
    single_key_suite = single_key_profiler.build_suite()

    assert len(single_key_suite.expectations) == 3

    _, single_key_types = get_set_of_columns_and_expectations_from_suite(
        single_key_suite
    )
    assert {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    } == single_key_types
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict.

    Checks that the ignored column and the excluded expectation are absent from the
    suite, that only known expectation types are produced, that value-set expectations
    are limited to the columns declared as "value_set", and that exactly one
    "legacy_profiler.build_suite" usage-statistics event is emitted.
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    # "col_one" is the column passed in ignored_columns, so it must not carry any expectation.
    assert "col_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_columns = {
        expectation.kwargs.get("column")
        for expectation in suite.expectations
        if expectation.expectation_type == "expect_column_values_to_be_in_set"
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will not
    # send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": True,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for sqlalchemy.

    The test is skipped when the taxi_validator_sqlalchemy fixture yields None.
    Otherwise it builds a suite, checks the expectation count and types, verifies that
    no ignored column received an expectation, and runs the validation operator.
    """
    # Identity check: `== None` would dispatch to the validator's own __eq__.
    if taxi_validator_sqlalchemy is None:
        pytest.skip("a message")

    context = titanic_data_context_modular_api

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    # Some columns are deliberately listed under more than one semantic type.
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    # Expectation types that this configuration is not expected to generate.
    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    # No ignored column may have received an expectation.
    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_sqlalchemy]
    )

    assert results["success"]
Beispiel #22
0
def test_error_handling_for_expect_compound_columns_to_be_unique(
    taxi_validator_pandas,
    taxi_data_ignored_columns,
    caplog,
):
    """
    What does this test do and why?
    Checks how the profiler reacts to primary_or_compound_key. With a multi-column
    key, build_suite() must log exactly one warning and produce only the two
    table-level expectations; with a single-column key it must additionally produce
    expect_column_values_to_be_unique.
    """
    # TODO: When this expectation is implemented for V3, remove this test and test for this expectation
    table_level_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    compound_key_warning = "expect_compound_columns_to_be_unique is not currently available in the V3 (Batch Request) API. Specifying a compound key will not add any expectations. This will be updated when that expectation becomes available."

    # Columns ignored in addition to the fixture-provided ones (list kept verbatim,
    # including its repeated entries).
    extra_ignored_columns = [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]
    ignored_columns = taxi_data_ignored_columns + extra_ignored_columns

    # Case 1: a compound (multi-column) key.
    compound_key_profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        compound_key_suite = compound_key_profiler.build_suite()

    # Exactly one message was captured, and it is the compound-key warning.
    assert caplog.messages == [compound_key_warning]

    assert len(compound_key_suite.expectations) == 2
    _, compound_key_expectations = get_set_of_columns_and_expectations_from_suite(
        compound_key_suite
    )
    assert table_level_expectations == compound_key_expectations

    # Case 2: a single-column key.
    single_key_suite = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    ).build_suite()

    assert len(single_key_suite.expectations) == 3
    _, single_key_expectations = get_set_of_columns_and_expectations_from_suite(
        single_key_suite
    )
    assert (
        table_level_expectations | {"expect_column_values_to_be_unique"}
    ) == single_key_expectations
def test_build_suite_when_suite_already_exists(
    mock_emit,
    cardinality_validator,
):
    """
    What does this test do and why?
    Confirms that calling build_suite() again on an existing profiler replaces the
    previous suite instead of adding to it, and that each of the two build_suite()
    calls is recorded on mock_emit as a "legacy_profiler.build_suite" event.
    """
    table_columns_expectation = "expect_table_columns_to_match_ordered_list"
    row_count_expectation = "expect_table_row_count_to_be_between"

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        table_expectations_only=True,
        excluded_expectations=[row_count_expectation],
    )

    # First build: the row-count expectation is excluded, so only the column-list one remains.
    first_suite = profiler.build_suite()
    _, first_expectations = get_set_of_columns_and_expectations_from_suite(first_suite)
    assert len(first_suite.expectations) == 1
    assert table_columns_expectation in first_expectations

    # Second build with the exclusion swapped: the suite must hold only the row-count expectation.
    profiler.excluded_expectations = [table_columns_expectation]
    second_suite = profiler.build_suite()
    _, second_expectations = get_set_of_columns_and_expectations_from_suite(
        second_suite
    )
    assert len(second_suite.expectations) == 1
    assert row_count_expectation in second_expectations

    assert mock_emit.call_count == 2

    def expected_build_suite_call():
        # Both build_suite() calls are expected to emit this same payload.
        return mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        )

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call] = [
        expected_build_suite_call(),
        expected_build_suite_call(),
    ]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call] = mock_emit.call_args_list
    assert actual_events == expected_events