def test_build_suite_when_suite_already_exists(mock_emit, cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for
    # usage_statistics is ExpectationSuite's add_expectation(). It will not send a usage_stats
    # event when called from a Profiler. This number can change in the future if our
    # instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []

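# get_set_of_columns_and_expectations_from_suite() is used throughout these tests but is not
# defined in this excerpt. A minimal sketch of what such a helper presumably does, assuming
# suite.expectations yields ExpectationConfiguration objects exposing `expectation_type` and
# `kwargs` (the function name and body below are illustrative, not the canonical
# implementation):
def _sketch_get_set_of_columns_and_expectations_from_suite(suite):
    """Return (columns touched by the suite, expectation types present in the suite)."""
    columns = {
        expectation.kwargs.get("column")
        for expectation in suite.expectations
        if expectation.kwargs.get("column")
    }
    expectation_types = {
        expectation.expectation_type for expectation in suite.expectations
    }
    return columns, expectation_types
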
def test_config_with_not_null_only(
    titanic_data_context_modular_api, nulls_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    validator = nulls_validator

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations

def test_profiler_all_expectation_types_spark(
    titanic_data_context_modular_api,
    taxi_validator_spark,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for spark
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_spark,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        # TODO: Add primary_or_compound_key test
        # primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        # ],
    )

    assert profiler.column_info.get("rate_code_id")

    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        suite = profiler.build_suite()
    assert len(suite.expectations) == 45

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        results = context.run_validation_operator(
            "action_list_operator", assets_to_validate=[taxi_validator_spark]
        )

    assert results["success"]

def test_user_configurable_profiler_progress_bar_config_enabled(
    mock_tqdm, cardinality_validator
):
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
    )

    profiler.build_suite()

    assert mock_tqdm.called
    assert mock_tqdm.call_count == 1

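# The mock_tqdm argument above implies a mock.patch that is not shown in this excerpt. A hedged
# sketch of how it is typically wired up; the patch target string below is an assumption (it
# depends on where the profiler module imports tqdm), not something confirmed by this excerpt:
#
# @mock.patch("great_expectations.profile.user_configurable_profiler.tqdm")
# def test_user_configurable_profiler_progress_bar_config_enabled(
#     mock_tqdm, cardinality_validator
# ):
#     ...
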
def test_build_suite_with_config_and_no_semantic_types_dict(
    titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29

def test_build_suite_with_config_and_no_semantic_types_dict(
    mock_emit, titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29

    assert mock_emit.call_count == 1
    assert "expectation_suite.add_expectation" not in [
        mock_emit.call_args_list[0][0][0]["event"]
    ]

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events

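# Tests that accept a mock_emit argument rely on patching the usage-statistics emitter, which
# is also not shown in this excerpt. A hedged sketch of the usual pattern; the patch target
# string is an assumption, not confirmed by this excerpt:
#
# @mock.patch(
#     "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
# )
# def test_build_suite_with_config_and_no_semantic_types_dict(
#     mock_emit, titanic_validator, possible_expectations_set
# ):
#     ...
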
def test_config_with_not_null_only(
    titanic_data_context_modular_api, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )

    validator = get_pandas_runtime_validator(titanic_data_context_modular_api, df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations

def test_profiler_all_expectation_types_pandas(
    titanic_data_context_modular_api,
    taxi_validator_pandas,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for pandas
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 41

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_column_values_to_be_between",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_pandas]
    )
    assert results["success"]

def test_build_suite_no_config(titanic_validator, possible_expectations_set):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with no config
    """
    profiler = UserConfigurableProfiler(titanic_validator)
    suite = profiler.build_suite()
    expectations_from_suite = {i.expectation_type for i in suite.expectations}
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 48

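# possible_expectations_set is a fixture that is not defined in this excerpt. Judging from how
# it is used (membership and issubset checks against expectation-type strings), it presumably
# returns the full set of expectation types the profiler can emit. A partial, hedged sketch
# built only from the types that actually appear in these tests:
@pytest.fixture
def _sketch_possible_expectations_set():
    return {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_in_set",
        "expect_column_values_to_be_unique",
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
        "expect_column_values_to_be_between",
        "expect_column_mean_to_be_between",
        "expect_column_quantile_values_to_be_between",
        "expect_compound_columns_to_be_unique",
        # ...plus the remaining profiler expectation types omitted from this excerpt.
    }
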
def test_user_configurable_profiler_progress_bar_config_disabled(
    mock_tqdm, cardinality_validator
):
    data_context = cardinality_validator.data_context
    data_context.project_config_with_variables_substituted.progress_bars = (
        ProgressBarsConfig(profilers=False)
    )

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
    )

    profiler.build_suite()

    assert not mock_tqdm.called
    assert mock_tqdm.call_count == 0

def test_build_suite_when_suite_already_exists(cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_dataset,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 33

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for
    # usage_statistics is ExpectationSuite's add_expectation(). It will not send a usage_stats
    # event when called from a Profiler. This number can change in the future if our
    # instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []

def test_nullity_expectations_mostly_tolerance(
    nulls_dataset, possible_expectations_set
):
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    batch_df = nulls_dataset

    profiler = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=False
    )
    suite = profiler.build_suite()

    for i in suite.expectations:
        assert i["kwargs"]["mostly"] == 0.66

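# Illustration of the mostly=0.66 tolerance asserted above (a hedged, self-contained sketch,
# not part of the original test module). With a column that is non-null for roughly two-thirds
# of its rows, expect_column_values_to_not_be_null succeeds at mostly=0.66 but fails at a
# stricter threshold.
def _sketch_mostly_tolerance_example():
    import pandas as pd

    import great_expectations as ge

    df = ge.dataset.PandasDataset(
        pd.DataFrame(
            {"mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)]}
        )
    )
    # 666 of 1000 rows are non-null, so the observed not-null ratio is ~0.666.
    assert df.expect_column_values_to_not_be_null(
        "mostly_not_null", mostly=0.66
    ).success
    assert not df.expect_column_values_to_not_be_null(
        "mostly_not_null", mostly=0.9
    ).success
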
def test_build_suite_no_config(
    mock_emit,
    titanic_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with no config
    """
    profiler = UserConfigurableProfiler(titanic_validator)
    suite = profiler.build_suite()
    expectations_from_suite = {i.expectation_type for i in suite.expectations}
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 48

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will
    # not send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1
    assert "expectation_suite.add_expectation" not in [
        mock_emit.call_args_list[0][0][0]["event"]
    ]

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": False,
                    "ignored_columns_specified": False,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events

def test_profiled_dataset_passes_own_validation(
    cardinality_validator, titanic_data_context
):
    """
    What does this test do and why?
    Confirms that a suite created on a validator with no config will pass when validated against itself
    """
    context = titanic_data_context
    profiler = UserConfigurableProfiler(
        cardinality_validator, ignored_columns=["col_none"]
    )
    suite = profiler.build_suite()

    context.save_expectation_suite(suite)
    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[cardinality_validator]
    )

    assert results["success"]

def test_build_suite_with_semantic_types_dict(
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

# </snippet>

# <snippet>
validator = context.get_validator(
    batch_request=mysql_runtime_batch_request,
)
# </snippet>

# <snippet>
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=[
        "expect_column_quantile_values_to_be_between",
        "expect_column_mean_to_be_between",
    ],
)
# </snippet>

# <snippet>
expectation_suite_name = "compare_two_tables"
suite = profiler.build_suite()
context.save_expectation_suite(
    expectation_suite=suite, expectation_suite_name=expectation_suite_name
)
# </snippet>

# <snippet>
my_checkpoint_name = "comparison_checkpoint"

yaml_config = f"""
name: {my_checkpoint_name}
config_version: 1.0
class_name: SimpleCheckpoint
run_name_template: "%Y%m%d-%H%M%S-my-run-name-template"
expectation_suite_name: {expectation_suite_name}
"""

context.add_checkpoint(**yaml.load(yaml_config))

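# The documentation snippet above ends after the checkpoint is registered. A hedged sketch of
# the likely next step: running the checkpoint against the second table and rebuilding Data
# Docs. `pg_runtime_batch_request` is an illustrative name for the other table's runtime batch
# request; it is not defined in this excerpt.
results = context.run_checkpoint(
    checkpoint_name=my_checkpoint_name,
    validations=[
        {
            "batch_request": pg_runtime_batch_request,
            "expectation_suite_name": expectation_suite_name,
        }
    ],
)
assert results["success"]
context.build_data_docs()
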
def test_profiler_all_expectation_types(
    titanic_data_context, possible_expectations_set
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    context = titanic_data_context
    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_trip_data_samples/yellow_trip_data_sample_2019-01.csv",
        )
    )
    batch_df = ge.dataset.PandasDataset(df)

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch_df]
    )
    assert results["success"]

def test_expect_compound_columns_to_be_unique(
    taxi_validator_spark, taxi_data_ignored_columns, caplog
):
    """
    Until all ExecutionEngine implementations for V3 are completed for this expectation:
    1) Use the "taxi_validator_" argument for this test method, corresponding to one of the
       ExecutionEngine subclasses, for which this expectation has not yet been implemented
       (and update the :param annotation below accordingly);
    2) With every additional ExecutionEngine implementation for this expectation, update the
       corresponding "test_profiler_all_expectation_types_" test method to include this
       expectation in the appropriate assertion.
    3) Once this expectation has been implemented for all ExecutionEngine subclasses, delete
       this test method entirely.

    :param taxi_validator_spark:
    :param taxi_data_ignored_columns:
    :param caplog:
    :return:
    """
    taxi_validator = taxi_validator_spark

    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()
    log_warning_records = list(
        filter(lambda record: record.levelname == "WARNING", caplog.records)
    )
    assert len(log_warning_records) == 0
    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_compound_columns_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite

def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will
    # not send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": True,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events

def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for sqlalchemy
    """
    if taxi_validator_sqlalchemy is None:
        pytest.skip("a message")

    context = titanic_data_context_modular_api

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        # TODO: Add primary_or_compound_key test
        # primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        # ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 45

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_sqlalchemy]
    )
    assert results["success"]

def test_error_handling_for_expect_compound_columns_to_be_unique(
    taxi_validator_pandas, taxi_data_ignored_columns, caplog
):
    # TODO: When this expectation is implemented for V3, remove this test and test for this expectation
    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warnings = caplog.messages
    assert len(log_warnings) == 1
    assert (
        log_warnings[0]
        == "expect_compound_columns_to_be_unique is not currently available in the V3 (Batch Request) API. Specifying a compound key will not add any expectations. This will be updated when that expectation becomes available."
    )

    assert len(suite.expectations) == 2

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite

def test_build_suite_when_suite_already_exists(
    mock_emit,
    cardinality_validator,
):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_validator,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    assert mock_emit.call_count == 2

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events