def test_build_suite_no_config(titanic_validator, possible_expectations_set):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with no config
    """
    profiler = UserConfigurableProfiler(titanic_validator)
    suite = profiler.build_suite()
    expectations_from_suite = {i.expectation_type for i in suite.expectations}

    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 48
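
# The test above exercises the no-config path against the Titanic validator
# fixture. Below is a minimal, self-contained sketch of the same path using the
# legacy ge.dataset.PandasDataset wrapper that later examples in this listing
# also use; the DataFrame and its column names are purely illustrative.
import great_expectations as ge
import pandas as pd

from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

df = pd.DataFrame({"age": [22, 38, 26, 35], "fare": [7.25, 71.28, 7.92, 53.10]})
batch = ge.dataset.PandasDataset(df)

suite = UserConfigurableProfiler(profile_dataset=batch).build_suite()
print({expectation.expectation_type for expectation in suite.expectations})
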
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_dataset,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 33

    value_set_expectations = [
        i for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {
        i.kwargs.get("column")
        for i in value_set_expectations
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
def test__validate_config(cardinality_dataset):
    """
    What does this test do and why?
    Tests the validate config function on the profiler
    """

    with pytest.raises(AssertionError) as e:
        UserConfigurableProfiler(cardinality_dataset, ignored_columns="col_name")
    assert e.typename == "AssertionError"

    with pytest.raises(AssertionError) as e:
        UserConfigurableProfiler(cardinality_dataset, table_expectations_only="True")
    assert e.typename == "AssertionError"
def test_nullity_expectations_mostly_tolerance(
    nulls_dataset, possible_expectations_set
):
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    batch_df = nulls_dataset

    profiler = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=False
    )
    suite = profiler.build_suite()

    for i in suite.expectations:
        assert i["kwargs"]["mostly"] == 0.66
def test_build_suite_no_config(
    mock_emit,
    titanic_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with no config
    """
    profiler = UserConfigurableProfiler(titanic_validator)
    suite = profiler.build_suite()
    expectations_from_suite = {i.expectation_type for i in suite.expectations}

    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 48

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will not
    # send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1
    assert "expectation_suite.add_expectation" not in [
        mock_emit.call_args_list[0][0][0]["event"]
    ]

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": False,
                    "ignored_columns_specified": False,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
def test_profiler_init_full_config_no_semantic_types(cardinality_validator):
    """
    What does this test do and why?
    Confirms that profiler initializes properly with a full config, without a semantic_types dict
    """

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="UNIQUE",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    assert profiler.primary_or_compound_key == ["col_unique"]
    assert profiler.ignored_columns == [
        "col_one",
    ]
    assert profiler.value_set_threshold == "UNIQUE"
    assert not profiler.table_expectations_only
    assert profiler.excluded_expectations == [
        "expect_column_values_to_not_be_null"
    ]

    assert "col_one" not in profiler.column_info
def test_profiled_dataset_passes_own_validation(cardinality_validator,
                                                titanic_data_context):
    """
    What does this test do and why?
    Confirms that a suite created on a validator with no config will pass when validated against itself
    """
    context = titanic_data_context
    profiler = UserConfigurableProfiler(cardinality_validator,
                                        ignored_columns=["col_none"])
    suite = profiler.build_suite()

    context.save_expectation_suite(suite)
    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[cardinality_validator])

    assert results["success"]
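
# The same round trip can be sketched without a DataContext or validation
# operator, assuming the legacy DataAsset.validate API: profile a PandasDataset,
# then validate that dataset against the suite it just produced. The data here
# is illustrative.
import great_expectations as ge
import pandas as pd

from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

batch = ge.dataset.PandasDataset(pd.DataFrame({"amount": [1.0, 2.5, 3.75, 2.5]}))
suite = UserConfigurableProfiler(batch).build_suite()

results = batch.validate(expectation_suite=suite)
print(results.success)  # a suite profiled from the data is expected to pass on it
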
def test_user_configurable_profiler_progress_bar_config_enabled(
        mock_tqdm, cardinality_validator):
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
    )

    profiler.build_suite()

    assert mock_tqdm.called
    assert mock_tqdm.call_count == 1
def test_column_cardinality_functions(cardinality_validator):
    profiler = UserConfigurableProfiler(cardinality_validator)
    # assert profiler.column_info.get("col_none").get("cardinality") == "NONE"
    assert profiler.column_info.get("col_one").get("cardinality") == "ONE"
    assert profiler.column_info.get("col_two").get("cardinality") == "TWO"
    assert profiler.column_info.get("col_very_few").get(
        "cardinality") == "VERY_FEW"
    assert profiler.column_info.get("col_few").get("cardinality") == "FEW"
    assert profiler.column_info.get("col_many").get("cardinality") == "MANY"
    assert profiler.column_info.get("col_very_many").get(
        "cardinality") == "VERY_MANY"

    cardinality_with_ten_num_and_no_pct = (
        OrderedProfilerCardinality.get_basic_column_cardinality(num_unique=10))
    assert cardinality_with_ten_num_and_no_pct.name == "VERY_FEW"

    cardinality_with_unique_pct_and_no_num = (
        OrderedProfilerCardinality.get_basic_column_cardinality(
            pct_unique=1.0))
    assert cardinality_with_unique_pct_and_no_num.name == "UNIQUE"

    cardinality_with_no_pct_and_no_num = (
        OrderedProfilerCardinality.get_basic_column_cardinality())
    assert cardinality_with_no_pct_and_no_num.name == "NONE"

    cardinality_with_large_pct_and_no_num = (
        OrderedProfilerCardinality.get_basic_column_cardinality(
            pct_unique=0.5))
    assert cardinality_with_large_pct_and_no_num.name == "NONE"
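
# Sketch: deriving the two inputs that get_basic_column_cardinality accepts
# (num_unique and pct_unique) from an ordinary pandas Series. The import path
# for OrderedProfilerCardinality is assumed here; the thresholds that map these
# inputs onto a cardinality level live inside the enum itself.
import pandas as pd

from great_expectations.profile.base import OrderedProfilerCardinality

series = pd.Series(["a", "b", "c", "a", "b", "a"])
num_unique = series.nunique()
pct_unique = num_unique / len(series)

level = OrderedProfilerCardinality.get_basic_column_cardinality(
    num_unique=num_unique, pct_unique=pct_unique
)
print(level.name)
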
def test_init_with_semantic_types(cardinality_dataset):
    """
    What does this test do and why?
    Confirms that profiler initializes properly with a full config and a semantic_types dict
    """

    semantic_types = {
        ProfilerSemanticTypes.NUMERIC.value:
        ["col_few", "col_many", "col_very_many"],
        ProfilerSemanticTypes.VALUE_SET.value: ["col_two", "col_very_few"],
    }
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )

    assert "col_one" not in profiler.column_info

    assert profiler.column_info.get("col_none") == {
        "cardinality": "NONE",
        "type": "NUMERIC",
        "semantic_types": [],
    }
    assert profiler.column_info.get("col_two") == {
        "cardinality": "TWO",
        "type": "INT",
        "semantic_types": ["VALUE_SET"],
    }
    assert profiler.column_info.get("col_very_few") == {
        "cardinality": "VERY_FEW",
        "type": "INT",
        "semantic_types": ["VALUE_SET"],
    }
    assert profiler.column_info.get("col_few") == {
        "cardinality": "FEW",
        "type": "INT",
        "semantic_types": ["NUMERIC"],
    }
    assert profiler.column_info.get("col_many") == {
        "cardinality": "MANY",
        "type": "INT",
        "semantic_types": ["NUMERIC"],
    }
    assert profiler.column_info.get("col_very_many") == {
        "cardinality": "VERY_MANY",
        "type": "INT",
        "semantic_types": ["NUMERIC"],
    }
    assert profiler.column_info.get("col_unique") == {
        "cardinality": "UNIQUE",
        "type": "INT",
        "semantic_types": [],
    }
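
# The recognized semantic type names come from the ProfilerSemanticTypes enum
# used above; a one-liner to list them (the import path is an assumption):
from great_expectations.profile.base import ProfilerSemanticTypes

print([semantic_type.value for semantic_type in ProfilerSemanticTypes])
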
def test_build_suite_with_semantic_types_dict(
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {
        i.kwargs.get("column")
        for i in value_set_expectations
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}
def test_profiler_init_no_config(cardinality_validator):
    """
    What does this test do and why?
    Confirms that profiler can initialize with no config.
    """
    profiler = UserConfigurableProfiler(cardinality_validator)
    assert profiler.primary_or_compound_key == []
    assert profiler.ignored_columns == []
    assert profiler.value_set_threshold == "MANY"
    assert not profiler.table_expectations_only
    assert profiler.excluded_expectations == []
def test_user_configurable_profiler_progress_bar_config_disabled(
        mock_tqdm, cardinality_validator):
    data_context = cardinality_validator.data_context
    data_context.project_config_with_variables_substituted.progress_bars = (
        ProgressBarsConfig(profilers=False))

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
    )

    profiler.build_suite()

    assert not mock_tqdm.called
    assert mock_tqdm.call_count == 0
def test_build_suite_when_suite_already_exists(cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations
def configurable_profiler(dataset: PandasDataset) -> ExpectationSuite:
    from great_expectations.profile.user_configurable_profiler import (
        UserConfigurableProfiler, )

    return UserConfigurableProfiler(
        profile_dataset=dataset,
        excluded_expectations=[
            "expect_table_columns_to_match_ordered_list",
            "expect_table_row_count_to_be_between",
        ],
        value_set_threshold="few",
    ).build_suite()
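
# Hypothetical usage of the helper above on a small, hand-built PandasDataset;
# the column name and values are illustrative.
import great_expectations as ge
import pandas as pd

df = pd.DataFrame({"status": ["open", "closed", "open", "open"]})
suite = configurable_profiler(ge.dataset.PandasDataset(df))
print(len(suite.expectations))
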
def test__validate_semantic_types_dict(cardinality_validator):
    """
    What does this test do and why?
    Tests that _validate_semantic_types_dict function errors when not formatted correctly
    """

    bad_semantic_types_dict_type = {"value_set": "col_few"}
    with pytest.raises(AssertionError) as e:
        # noinspection PyTypeChecker
        UserConfigurableProfiler(
            cardinality_validator, semantic_types_dict=bad_semantic_types_dict_type
        )
    assert e.value.args[0] == (
        "Entries in semantic type dict must be lists of column names e.g. "
        "{'semantic_types': {'numeric': ['number_of_transactions']}}"
    )

    bad_semantic_types_incorrect_type = {"incorrect_type": ["col_few"]}
    with pytest.raises(ValueError) as e:
        UserConfigurableProfiler(
            cardinality_validator, semantic_types_dict=bad_semantic_types_incorrect_type
        )
    assert e.value.args[0] == (
        f"incorrect_type is not a recognized semantic_type. Please only include one of "
        f"{[semantic_type.value for semantic_type in ProfilerSemanticTypes]}"
    )

    # Error if column is specified for both semantic_types and ignored
    working_semantic_type = {"numeric": ["col_few"]}
    with pytest.raises(ValueError) as e:
        UserConfigurableProfiler(
            cardinality_validator,
            semantic_types_dict=working_semantic_type,
            ignored_columns=["col_few"],
        )
    assert e.value.args[0] == (
        "Column col_few is specified in both the semantic_types_dict and the list of ignored columns. Please remove "
        f"one of these entries to proceed."
    )
def test_config_with_not_null_only(
    titanic_data_context_modular_api, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """

    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )

    validator = get_pandas_runtime_validator(titanic_data_context_modular_api, df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations
def test_primary_or_compound_key_not_found_in_columns(mock_emit,
                                                      cardinality_dataset):
    """
    What does this test do and why?
    Confirms that an error is raised if a primary_or_compound key is specified with a column not found in the dataset
    """
    # regular case, should pass
    working_profiler = UserConfigurableProfiler(
        cardinality_dataset, primary_or_compound_key=["col_unique"])
    assert working_profiler.primary_or_compound_key == ["col_unique"]

    # key includes a non-existent column, should fail
    with pytest.raises(ValueError) as e:
        # noinspection PyUnusedLocal
        bad_key_profiler = UserConfigurableProfiler(
            cardinality_dataset,
            primary_or_compound_key=["col_unique", "col_that_does_not_exist"],
        )
    assert e.value.args[0] == (
        """Column col_that_does_not_exist not found. Please ensure that this column is in the PandasDataset if you \
would like to use it as a primary_or_compound_key.
""")

    # key includes a column that exists, but is in ignored_columns, should pass
    ignored_column_profiler = UserConfigurableProfiler(
        cardinality_dataset,
        primary_or_compound_key=["col_unique", "col_one"],
        ignored_columns=["col_none", "col_one"],
    )
    assert ignored_column_profiler.primary_or_compound_key == [
        "col_unique", "col_one"
    ]

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
def test_profiler_works_with_batch_object(cardinality_validator):
    profiler = UserConfigurableProfiler(cardinality_validator.active_batch)
    assert profiler.primary_or_compound_key == []
    assert profiler.ignored_columns == []
    assert profiler.value_set_threshold == "MANY"
    assert not profiler.table_expectations_only
    assert profiler.excluded_expectations == []

    assert profiler.all_table_columns == [
        "col_one",
        "col_two",
        "col_very_few",
        "col_few",
        "col_many",
        "col_very_many",
        "col_unique",
    ]
def test_all_table_columns_populates(taxi_validator_pandas):
    taxi_profiler = UserConfigurableProfiler(taxi_validator_pandas)

    assert taxi_profiler.all_table_columns == [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "rate_code_id",
        "store_and_fwd_flag",
        "pickup_location_id",
        "dropoff_location_id",
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "total_amount",
        "congestion_surcharge",
    ]
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will not
    # send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": True,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for sqlalchemy
    """
    if taxi_validator_sqlalchemy is None:
        pytest.skip("taxi_validator_sqlalchemy fixture is not available")

    context = titanic_data_context_modular_api

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_sqlalchemy]
    )

    assert results["success"]
def test_expect_compound_columns_to_be_unique(
    taxi_validator_spark, taxi_data_ignored_columns, caplog
):
    """
    Until all ExecutionEngine implementations for V3 are completed for this expectation:
    1) Use the "taxi_validator_" argument for this test method, corresponding to one of the ExecutionEngine subclasses,
       for which this expectation has not yet been implemented (and update the :param annotation below accordingly);
    2) With every additional ExecutionEngine implementation for this expectation, update the corresponding
       "test_profiler_all_expectation_types_" test method to include this expectation in the appropriate assertion.
    3) Once this expectation has been implemented for all ExecutionEngine subclasses, delete this test method entirely.

    :param taxi_validator_spark:
    :param taxi_data_ignored_columns:
    :param caplog:
    :return:
    """

    taxi_validator = taxi_validator_spark

    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warning_records = list(
        filter(lambda record: record.levelname == "WARNING", caplog.records)
    )
    assert len(log_warning_records) == 0
    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_compound_columns_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite
pg_runtime_batch_request = RuntimeBatchRequest(
    datasource_name="my_postgresql_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="postgres_asset",
    runtime_parameters={"query": "SELECT * from taxi_data LIMIT 10"},
    batch_identifiers={"batch_id": "default_identifier"},
)
# </snippet>
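# Note: the mysql_runtime_batch_request used in the next snippet is defined in
# an elided snippet for the other datasource in this comparison walkthrough.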
# <snippet>
validator = context.get_validator(batch_request=mysql_runtime_batch_request)
# </snippet>
# <snippet>
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=[
        "expect_column_quantile_values_to_be_between",
        "expect_column_mean_to_be_between",
    ],
)
# </snippet>
# <snippet>
expectation_suite_name = "compare_two_tables"
suite = profiler.build_suite()
context.save_expectation_suite(expectation_suite=suite,
                               expectation_suite_name=expectation_suite_name)
# </snippet>
# <snippet>
my_checkpoint_name = "comparison_checkpoint"

yaml_config = f"""
name: {my_checkpoint_name}
def test_profiler_all_expectation_types(titanic_data_context,
                                        possible_expectations_set):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    context = titanic_data_context
    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_trip_data_samples/yellow_trip_data_sample_2019-01.csv",
        ))
    batch_df = ge.dataset.PandasDataset(df)

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator("action_list_operator",
                                              assets_to_validate=[batch_df])

    assert results["success"]
def test_build_suite_when_suite_already_exists(
    mock_emit,
    cardinality_validator,
):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_validator,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    assert mock_emit.call_count == 2

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
]
# </snippet>
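
# The profiler options below are spelled out explicitly; apart from
# ignored_columns they match the defaults exercised in the tests above
# (value_set_threshold "MANY", no semantic types, no primary or compound key,
# no excluded expectations).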

profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=None,
    ignored_columns=exclude_column_names,
    not_null_only=False,
    primary_or_compound_key=None,
    semantic_types_dict=None,
    table_expectations_only=False,
    value_set_threshold="MANY",
)
suite = profiler.build_suite()
validator.expectation_suite = suite
validator.save_expectation_suite(discard_failed_expectations=False)

# Create first checkpoint on yellow_tripdata_sample_2019-01.csv
my_checkpoint_config = f"""
name: getting_started_checkpoint
config_version: 1.0
class_name: SimpleCheckpoint
run_name_template: "%Y%m%d-%H%M%S-my-run-name-template"
validations:
def test_error_handling_for_expect_compound_columns_to_be_unique(
        taxi_validator_pandas, taxi_data_ignored_columns, caplog):
    # TODO: When this expectation is implemented for V3, remove this test and test for this expectation
    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warnings = caplog.messages
    assert len(log_warnings) == 1

    assert (
        log_warnings[0] ==
        "expect_compound_columns_to_be_unique is not currently available in the V3 (Batch Request) API. Specifying a compound key will not add any expectations. This will be updated when that expectation becomes available."
    )

    assert len(suite.expectations) == 2

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite
def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for sqlalchemy
    """
    if taxi_validator_sqlalchemy is None:
        pytest.skip("a message")

    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert profiler.column_info.get("rate_code_id")
    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        suite = profiler.build_suite()
    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0
    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        results = context.run_validation_operator(
            "action_list_operator",
            assets_to_validate=[taxi_validator_sqlalchemy])

    assert results["success"]