def test_ValidationResultsTableContentBlockRenderer_get_observed_value(evr_success):
    evr_no_result_key = ExpectationValidationResult(
        success=True,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_table_row_count_to_be_between",
            kwargs={"min_value": 0, "max_value": None, "result_format": "SUMMARY"},
        ),
    )
    evr_expect_column_values_to_not_be_null = ExpectationValidationResult(
        success=True,
        result={
            "element_count": 1313,
            "unexpected_count": 1050,
            "unexpected_percent": 79.96953541508,
            "partial_unexpected_list": [],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": "Unnamed: 0", "mostly": 0.5, "result_format": "SUMMARY"},
        ),
    )
    evr_expect_column_values_to_be_null = ExpectationValidationResult(
        success=True,
        result={
            "element_count": 1313,
            "unexpected_count": 0,
            "unexpected_percent": 0.0,
            "partial_unexpected_list": [],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_null",
            kwargs={"column": "Unnamed: 0", "mostly": 0.5, "result_format": "SUMMARY"},
        ),
    )

    # test _get_observed_value when evr.result["observed_value"] exists
    output_1 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_success
    )
    assert output_1 == "1,313"
    # test _get_observed_value when evr.result does not exist
    output_2 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_no_result_key
    )
    assert output_2 == "--"
    # test _get_observed_value for expect_column_values_to_not_be_null expectation type
    output_3 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_expect_column_values_to_not_be_null
    )
    assert output_3 == "≈20.03% not null"
    # test _get_observed_value for expect_column_values_to_be_null expectation type
    output_4 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_expect_column_values_to_be_null
    )
    assert output_4 == "100% null"

def test_ge_pandas_automatic_failure_removal():
    df = ge.dataset.PandasDataset(
        {
            "A": [1, 2, 3, 4],
            "B": [5, 6, 7, 8],
            "C": ["a", "b", "c", "d"],
            "D": ["e", "f", "g", "h"],
        }
    )

    # Put some simple expectations on the data frame
    df.profile(ge.profile.ColumnsExistProfiler)
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ["w", "x", "y", "z"])
    df.expect_column_values_to_be_in_set("D", ["e", "f", "g", "h"])

    # First check that failing expectations are NOT automatically
    # dropped when sampling.
    # For this data frame, the expectation on column "C" above fails.
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "C", "value_set": ["w", "x", "y", "z"]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    samp1 = df.sample(n=2)
    assert samp1.find_expectations() == exp1

    # Now check subsetting to verify that failing expectations are NOT
    # automatically dropped when subsetting.
    sub1 = df[["A", "D"]]
    assert sub1.find_expectations() == exp1

    # Set property/attribute so that failing expectations are
    # automatically removed when sampling or subsetting.
    df.discard_subset_failing_expectations = True

    ###
    # Note: Order matters in this test, and a validation operator may change order
    ###

    exp_samp = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]

    samp2 = df.sample(n=2)
    assert samp2.find_expectations() == exp_samp

    # Now check subsetting. In addition to the failure on column "C",
    # the expectations on column "B" now fail since column "B" doesn't
    # exist in the subset.
    sub2 = df[["A", "D"]]
    exp_sub = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    assert sub2.find_expectations() == exp_sub

def _profile(cls, dataset, configuration=None):
    logger.debug(f"Running profiler with configuration: {configuration}")
    if configuration == "demo":
        return cls._demo_profile(dataset)

    existing_columns = dataset.get_table_columns()
    selected_columns = existing_columns
    included_expectations = []
    excluded_expectations = []

    if configuration:
        if (
            "included_expectations" in configuration
            and "excluded_expectations" in configuration
        ):
            raise ProfilerError(
                "Please specify either `included_expectations` or `excluded_expectations`."
            )

        if "included_expectations" in configuration:
            included_expectations = configuration["included_expectations"]
            if included_expectations in [False, None, []]:
                included_expectations = None
            _check_that_expectations_are_available(dataset, included_expectations)

        if "excluded_expectations" in configuration:
            excluded_expectations = configuration["excluded_expectations"]
            if excluded_expectations in [False, None, []]:
                excluded_expectations = None
            _check_that_expectations_are_available(dataset, excluded_expectations)

        if (
            "included_columns" in configuration
            and "excluded_columns" in configuration
        ):
            raise ProfilerError(
                "Please specify either `excluded_columns` or `included_columns`."
            )
        elif "included_columns" in configuration:
            selected_columns = configuration["included_columns"]
            if selected_columns in [False, None, []]:
                selected_columns = []
        elif "excluded_columns" in configuration:
            excluded_columns = configuration["excluded_columns"]
            if excluded_columns in [False, None, []]:
                excluded_columns = []
            selected_columns = set(existing_columns) - set(excluded_columns)

    _check_that_columns_exist(dataset, selected_columns)
    if included_expectations is None:
        suite = cls._build_column_description_metadata(dataset)
        # remove column exist expectations
        suite.expectations = []
        return suite

    dataset.set_default_expectation_argument("catch_exceptions", False)
    dataset = cls._build_table_row_count_expectation(
        dataset,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )
    dataset.set_config_value("interactive_evaluation", True)
    dataset = cls._build_table_column_expectations(
        dataset,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )

    column_cache = {}
    if selected_columns:
        for column in selected_columns:
            cardinality = cls._get_column_cardinality_with_caching(
                dataset, column, column_cache
            )
            column_type = cls._get_column_type_with_caching(
                dataset, column, column_cache
            )

            if cardinality in [
                ProfilerCardinality.TWO,
                ProfilerCardinality.VERY_FEW,
                ProfilerCardinality.FEW,
            ]:
                cls._create_expectations_for_low_card_column(
                    dataset, column, column_cache
                )
            elif cardinality in [
                ProfilerCardinality.MANY,
                ProfilerCardinality.VERY_MANY,
                ProfilerCardinality.UNIQUE,
            ]:
                # TODO we will want to finesse the number and types of
                # expectations created here. The simple version is deny/allow list
                # and the more complex version is desired per column type and
                # cardinality. This deserves more thought on configuration.
                dataset.expect_column_values_to_be_unique(column)

                if column_type in [ProfilerDataType.INT, ProfilerDataType.FLOAT]:
                    cls._create_expectations_for_numeric_column(dataset, column)
                elif column_type in [ProfilerDataType.DATETIME]:
                    cls._create_expectations_for_datetime_column(
                        dataset,
                        column,
                        excluded_expectations=excluded_expectations,
                        included_expectations=included_expectations,
                    )
                elif column_type in [ProfilerDataType.STRING]:
                    cls._create_expectations_for_string_column(
                        dataset,
                        column,
                        excluded_expectations=excluded_expectations,
                        included_expectations=included_expectations,
                    )
                elif column_type in [ProfilerDataType.UNKNOWN]:
                    logger.debug(
                        f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                    )

    if excluded_expectations:
        # NOTE: we reach into a private member here because of an expected future
        # refactor that will make the suite directly accessible
        dataset._expectation_suite.remove_all_expectations_of_type(
            excluded_expectations
        )
    if included_expectations:
        for expectation in dataset.get_expectation_suite(
            discard_failed_expectations=False,
            suppress_logging=True,
        ).expectations:
            if expectation.expectation_type not in included_expectations:
                try:
                    dataset.remove_expectation(
                        ExpectationConfiguration(
                            expectation_type=expectation.expectation_type,
                            kwargs=expectation.kwargs,
                        ),
                        match_type="domain",
                        remove_multiple_matches=True,
                    )
                except ValueError:
                    logger.debug(
                        f"Attempted to remove {expectation}, which was not found."
                    )

    expectation_suite = cls._build_column_description_metadata(dataset)

    return expectation_suite
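
# Illustrative sketch (not part of the profiler above): how a profiler that
# implements `_profile` is typically driven through its public classmethod.
# The `BasicSuiteBuilderProfiler` import path and the `profile(...)` signature
# are assumptions; only the configuration keys ("included_columns",
# "excluded_expectations") mirror what `_profile` actually reads.
def _example_profile_invocation():
    import great_expectations as ge
    from great_expectations.profile.basic_suite_builder_profiler import (
        BasicSuiteBuilderProfiler,
    )

    df = ge.dataset.PandasDataset({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    # `profile` wraps `_profile`, validating the configuration and returning
    # the suite together with the validation results used to build it.
    suite, validation_results = BasicSuiteBuilderProfiler.profile(
        df, profiler_configuration={"included_columns": ["a"]}
    )
    return suite, validation_results
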
"action": { "class_name": "OpenLineageValidationAction", "module_name": "openlineage.common.provider.great_expectations.action" } }] } }, anonymous_usage_statistics={'enabled': False} ) TABLE_NAME = "test_data" # Common validation results table_result = ExpectationValidationResult(success=True, expectation_config=ExpectationConfiguration( expectation_type='expect_table_row_count_to_equal', kwargs={'value': 10}), result={"observed_value": 10}) column_result = ExpectationValidationResult(success=True, expectation_config=ExpectationConfiguration( expectation_type='expect_column_sum_to_be_between', kwargs={'column': 'size', 'min_value': 0, 'max_value': 100} ), result={'observed_value': 60}) result_suite = ExpectationSuiteValidationResult(success=True, meta={'batch_kwargs': {}}, results=[table_result, column_result]) @pytest.fixture(scope='session') def test_db_file():
def test_find_expectations():
    my_df = ge.dataset.PandasDataset(
        {
            "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "y": [1, 2, None, 4, None, 6, 7, 8, 9, None],
            "z": [
                "cello",
                "hello",
                "jello",
                "bellow",
                "fellow",
                "mellow",
                "wellow",
                "xello",
                "yellow",
                "zello",
            ],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )
    my_df.expect_column_values_to_be_of_type("x", "int")
    my_df.expect_column_values_to_be_of_type("y", "int")
    my_df.expect_column_values_to_be_of_type("z", "int")
    my_df.expect_column_values_to_be_increasing("x")
    my_df.expect_column_values_to_match_regex("z", "ello")

    assert my_df.find_expectations("expect_column_to_exist", "w") == []

    assert my_df.find_expectations(
        "expect_column_to_exist", "x", expectation_kwargs={}
    ) == [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"}
        )
    ]

    assert my_df.find_expectations(
        "expect_column_to_exist", expectation_kwargs={"column": "y"}
    ) == [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "y"}
        )
    ]

    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "y"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "z"}
        ),
    ]
    assert my_df.find_expectations("expect_column_to_exist") == exp1

    with pytest.raises(ValueError) as exc:
        my_df.find_expectations("expect_column_to_exist", "x", {"column": "y"})
    assert "Conflicting column names in find_expectation_indexes:" in str(exc.value)

    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_of_type",
            kwargs={"column": "x", "type_": "int"},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_increasing",
            kwargs={"column": "x"},
        ),
    ]
    assert my_df.find_expectations(column="x") == exp1

def test_discard_failing_expectations():
    df = ge.dataset.PandasDataset(
        {
            "A": [1, 2, 3, 4],
            "B": [5, 6, 7, 8],
            "C": ["a", "b", "c", "d"],
            "D": ["e", "f", "g", "h"],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )

    # Put some simple expectations on the data frame
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ["a", "b", "c", "d"])
    df.expect_column_values_to_be_in_set("D", ["e", "f", "g", "h"])

    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "C", "value_set": ["a", "b", "c", "d"]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]

    sub1 = df[:3]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[1:2]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[:-1]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[-1:]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[["A", "D"]]
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    with pytest.warns(UserWarning, match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[["A"]]
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
    ]
    with pytest.warns(UserWarning, match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df.iloc[:3, 1:4]
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "C", "value_set": ["a", "b", "c", "d"]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    with pytest.warns(UserWarning, match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df.loc[0:, "A":"B"]
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
    ]
    with pytest.warns(UserWarning, match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

def table_exp1():
    return ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_ordered_list",
        kwargs={"value": ["a", "b", "c"]},
    )

def test_validate():
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        results = my_df.validate(catch_exceptions=False)

    with open(
        file_relative_path(
            __file__, "./test_sets/titanic_expected_data_asset_validate_results.json"
        )
    ) as f:
        expected_results = expectationSuiteValidationResultSchema.loads(f.read())

    del results.meta["great_expectations.__version__"]
    assert expected_results == results

    # Now, change the results and ensure they are no longer equal
    results.results[0] = ExpectationValidationResult()
    assert expected_results != results

    # Finally, confirm that only_return_failures works
    # and does not affect the "statistics" field.
    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        validation_results = my_df.validate(only_return_failures=True)
    del validation_results.meta["great_expectations.__version__"]
    expected_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "titanic",
            "run_id": "19551105T000000.000000Z",
            "batch_kwargs": {"ge_batch_id": "1234"},
            "batch_markers": {},
            "batch_parameters": {},
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_in_set",
                    kwargs={"column": "PClass", "value_set": ["1st", "2nd", "3rd"]},
                ),
                success=False,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "partial_unexpected_index_list": [456],
                    "unexpected_count": 1,
                    "unexpected_list": ["*"],
                    "unexpected_percent": 0.07616146230007616,
                    "element_count": 1313,
                    "missing_percent": 0.0,
                    "partial_unexpected_counts": [{"count": 1, "value": "*"}],
                    "partial_unexpected_list": ["*"],
                    "unexpected_percent_nonmissing": 0.07616146230007616,
                    "missing_count": 0,
                    "unexpected_index_list": [456],
                },
            )
        ],
        success=expected_results.success,  # unaffected
        statistics=expected_results["statistics"],  # unaffected
    )
    assert expected_results == validation_results

def test_find_expectations(baseline_suite, exp1, exp2):
    # Note: most of the logic in this method is based on
    # find_expectation_indexes and _copy_and_clean_up_expectations_from_indexes
    # These tests do not thoroughly cover that logic.
    # Instead, they focus on the behavior of the discard_* methods
    assert (
        baseline_suite.find_expectations(
            column="a",
            expectation_type="expect_column_values_to_be_between",
        )
        == []
    )

    result = baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_be_in_set",
    )
    assert len(result) == 1
    assert result[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "a",
            "value_set": [1, 2, 3],
            # "result_format": "BASIC"
        },
        meta={"notes": "This is an expectation."},
    )

    exp_with_all_the_params = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "a",
            "result_format": "BASIC",
            "include_config": True,
            "catch_exceptions": True,
        },
        meta={},
    )
    baseline_suite.append_expectation(exp_with_all_the_params)

    assert baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_not_be_null",
    )[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "a"},
        meta={},
    )

    assert (
        baseline_suite.find_expectations(
            column="a",
            expectation_type="expect_column_values_to_not_be_null",
            discard_result_format_kwargs=False,
            discard_include_config_kwargs=False,
            discard_catch_exceptions_kwargs=False,
        )[0]
        == exp_with_all_the_params
    )

    assert baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_not_be_null",
        discard_result_format_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "a",
            "result_format": "BASIC",
            "catch_exceptions": True,
        },
        meta={},
    )

def test_evaluation_parameter_store_methods(data_context):
    run_id = "20191125T000000.000000Z"
    source_patient_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_patient_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_equal",
                    kwargs={"value": 1024},
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 1024,
                    "element_count": 1024,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context.store_evaluation_parameters(source_patient_data_results)

    bound_parameters = data_context.evaluation_parameter_store.get_bind_params(run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value": 1024
    }

    source_diabetes_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_diabetes_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_unique_value_count_to_be_between",
                    kwargs={"column": "patient_nbr", "min": 2048, "max": 2048},
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 2048,
                    "element_count": 5000,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )
    data_context.store_evaluation_parameters(source_diabetes_data_results)
    bound_parameters = data_context.evaluation_parameter_store.get_bind_params(run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value": 1024,
        "urn:great_expectations:validations:source_diabetes_data.default"
        ":expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr": 2048,
    }

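
# Illustrative sketch: parameters bound this way are meant to be consumed by a
# downstream batch via "$PARAMETER" URNs. `downstream_batch` is a hypothetical
# dataset/validator; the URN mirrors the first bound parameter asserted above.
def _example_evaluation_parameter_usage(downstream_batch):
    return downstream_batch.expect_table_row_count_to_equal(
        value={
            "$PARAMETER": "urn:great_expectations:validations:source_patient_data.default"
            ":expect_table_row_count_to_equal.result.observed_value"
        }
    )
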
def test_notebook_execution_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:
    - create a suite using profiling
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note: this will raise various errors, like
      CellExecutionError, if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }
    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    suite["meta"].pop("citations", None)
    assert suite.expectations == [
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_columns_to_match_ordered_list",
                "kwargs": {
                    "column_list": [
                        "Unnamed: 0",
                        "Name",
                        "PClass",
                        "Age",
                        "Sex",
                        "Survived",
                        "SexCode",
                    ]
                },
                "meta": {},
            }
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_row_count_to_be_between",
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "meta": {},
            }
        ),
    ]

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
def test_ValidationResultsTableContentBlockRenderer_get_unexpected_table(evr_success):
    evr_failed_no_result = ExpectationValidationResult(
        success=False,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )
    evr_failed_no_unexpected_list_or_counts = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )
    evr_failed_partial_unexpected_list = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
            "partial_unexpected_list": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )
    evr_failed_partial_unexpected_counts = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
            "partial_unexpected_list": [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
            ],
            "partial_unexpected_index_list": [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            ],
            # Equivalent to the twenty {"value": i, "count": 1} entries
            # written out longhand in the original fixture.
            "partial_unexpected_counts": [
                {"value": i, "count": 1} for i in range(1, 21)
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )

    # test for succeeded evr
    output_1 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_success
    )
    assert output_1 is None

    # test for failed evr with no "result" key
    output_2 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_no_result
    )
    assert output_2 is None

    # test for failed evr with no unexpected list or unexpected counts
    output_3 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_no_unexpected_list_or_counts
    )
    assert output_3 is None

    # test for failed evr with partial unexpected list
    output_4 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_partial_unexpected_list
    )
    assert output_4.to_json_dict() == {
        "content_block_type": "table",
        "table": [[i] for i in range(1, 21)],  # [[1], [2], ..., [20]]
        "header_row": ["Sampled Unexpected Values"],
        "styling": {"body": {"classes": ["table-bordered", "table-sm", "mt-3"]}},
    }

    # test for failed evr with partial unexpected counts
    output_5 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_partial_unexpected_counts
    )
    assert output_5.to_json_dict() == {
        "content_block_type": "table",
        "table": [[i] for i in range(1, 21)],  # [[1], [2], ..., [20]]
        "header_row": ["Sampled Unexpected Values"],
        "styling": {"body": {"classes": ["table-bordered", "table-sm", "mt-3"]}},
    }

def test_ValidationResultsTableContentBlockRenderer_get_unexpected_statement(
    evr_success, evr_failed
):
    evr_no_result = ExpectationValidationResult(
        success=True,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_table_row_count_to_be_between",
            kwargs={"min_value": 0, "max_value": None, "result_format": "SUMMARY"},
        ),
    )
    evr_failed_no_unexpected_count = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_percent": 0.2284843869002285,
            "unexpected_percent_nonmissing": 0.2284843869002285,
            "partial_unexpected_list": [
                "Daly, Mr Peter Denis ",
                "Barber, Ms ",
                "Geiger, Miss Emily ",
            ],
            "partial_unexpected_index_list": [77, 289, 303],
            "partial_unexpected_counts": [
                {"value": "Barber, Ms ", "count": 1},
                {"value": "Daly, Mr Peter Denis ", "count": 1},
                {"value": "Geiger, Miss Emily ", "count": 1},
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_match_regex",
            kwargs={
                "column": "Name",
                "regex": "^\\s+|\\s+$",
                "result_format": "SUMMARY",
            },
        ),
    )

    # test for succeeded evr
    output_1 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_success
    )
    assert output_1 == []

    # test for failed evr
    output_2 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed
    )
    assert output_2 == [
        RenderedStringTemplateContent(
            **{
                "content_block_type": "string_template",
                "string_template": {
                    "template": "\n\n$unexpected_count unexpected values found. $unexpected_percent of $element_count total rows.",
                    "params": {
                        "unexpected_count": "3",
                        "unexpected_percent": "≈0.2285%",
                        "element_count": "1,313",
                    },
                    "tag": "strong",
                    "styling": {"classes": ["text-danger"]},
                },
            }
        )
    ]

    # test for evr with no "result" key
    output_3 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_no_result
    )
    print(json.dumps(output_3, indent=2))
    assert output_3 == []

    # test for evr with no unexpected count
    output_4 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed_no_unexpected_count
    )
    print(output_4)
    assert output_4 == []

    # test for evr with exception
    evr_failed_exception = ExpectationValidationResult(
        success=False,
        exception_info={
            "raised_exception": True,
            "exception_message": "Unrecognized column: not_a_real_column",
            "exception_traceback": "Traceback (most recent call last):\n...more_traceback...",
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_match_regex",
            kwargs={
                "column": "Name",
                "regex": "^\\s+|\\s+$",
                "result_format": "SUMMARY",
            },
        ),
    )

    output_5 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed_exception
    )
    output_5 = [content.to_json_dict() for content in output_5]
    expected_output_5 = [
        {
            "content_block_type": "string_template",
            "string_template": {
                "template": "\n\n$expectation_type raised an exception:\n$exception_message",
                "params": {
                    "expectation_type": "expect_column_values_to_not_match_regex",
                    "exception_message": "Unrecognized column: not_a_real_column",
                },
                "tag": "strong",
                "styling": {
                    "classes": ["text-danger"],
                    "params": {
                        "exception_message": {"tag": "code"},
                        "expectation_type": {
                            "classes": ["badge", "badge-danger", "mb-2"]
                        },
                    },
                },
            },
        },
        {
            "content_block_type": "collapse",
            "collapse_toggle_link": "Show exception traceback...",
            "collapse": [
                {
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": "Traceback (most recent call last):\n...more_traceback...",
                        "tag": "code",
                    },
                }
            ],
            "inline_link": False,
        },
    ]
    assert output_5 == expected_output_5

def test_expectation_summary_in_ExpectationSuitePageRenderer_render_expectation_suite_notes():
    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(expectation_suite_name="test", meta={}, expectations=None)
    )
    # print(RenderedContent.rendered_content_list_to_json(result.text))
    assert RenderedContent.rendered_content_list_to_json(result.text) == [
        "This Expectation suite currently contains 0 total Expectations across 0 columns."
    ]

    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(
            expectation_suite_name="test",
            meta={"notes": {"format": "markdown", "content": ["hi"]}},
        )
    )
    # print(RenderedContent.rendered_content_list_to_json(result.text))
    try:
        mistune.markdown("*test*")
        assert RenderedContent.rendered_content_list_to_json(result.text) == [
            "This Expectation suite currently contains 0 total Expectations across 0 columns.",
            {
                "content_block_type": "markdown",
                "styling": {"parent": {}},
                "markdown": "hi",
            },
        ]
    except OSError:
        assert RenderedContent.rendered_content_list_to_json(result.text) == [
            "This Expectation suite currently contains 0 total Expectations across 0 columns.",
            "hi",
        ]

    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(
            expectation_suite_name="test",
            meta={},
            expectations=[
                ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_be_between",
                    kwargs={"min_value": 0, "max_value": None},
                ),
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist", kwargs={"column": "x"}
                ),
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist", kwargs={"column": "y"}
                ),
            ],
        )
    )
    # print(RenderedContent.rendered_content_list_to_json(result.text)[0])
    assert (
        RenderedContent.rendered_content_list_to_json(result.text)[0]
        == "This Expectation suite currently contains 3 total Expectations across 2 columns."
    )

def table_exp3():
    return ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_equal", kwargs={"value": 1}
    )

def bobby_columnar_table_multi_batch():
    """
    # TODO: <Alex>ALEX -- Add DocString</Alex>
    """
    verbose_profiler_config_file_path: str = file_relative_path(
        __file__, "bobby_user_workflow_verbose_profiler_config.yml"
    )
    verbose_profiler_config: str
    with open(verbose_profiler_config_file_path) as f:
        verbose_profiler_config = f.read()

    my_row_count_range_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"min_value": 7505, "max_value": 8495},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "table.row_count",
                            "domain_kwargs": {},
                        },
                        "num_batches": 2,
                    },
                },
            },
        ),
    ]

    # One expect_column_min_to_be_between and one expect_column_max_to_be_between
    # configuration per column, in this order. Each entry is:
    # (column, (min_value, max_value) for the min expectation,
    #          (min_value, max_value) for the max expectation).
    column_min_max_bounds = [
        ("VendorID", (1, 1), (4, 4)),
        ("passenger_count", (0, 1), (6, 6)),
        ("trip_distance", (0.0, 0.0), (37.62, 57.85)),
        ("RatecodeID", (1, 1), (5, 6)),
        ("PULocationID", (1, 1), (265, 265)),
        ("DOLocationID", (1, 1), (265, 265)),
        ("payment_type", (1, 1), (4, 4)),
        ("fare_amount", (-51.84, -21.16), (228.94, 2990.05)),
        ("extra", (-36.53, -1.18), (4.51, 6.99)),
        ("mta_tax", (-0.5, -0.5), (0.69, 37.32)),
        ("tip_amount", (0.0, 0.0), (46.84, 74.86)),
        ("tolls_amount", (0.0, 0.0), (26.4, 497.67)),
        ("improvement_surcharge", (-0.3, -0.3), (0.3, 0.3)),
        ("total_amount", (-52.66, -24.44), (550.18, 2992.47)),
        ("congestion_surcharge", (-2.49, -0.01), (0.01, 2.49)),
    ]
    my_column_ranges_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            expectation_type=expectation_type,
            meta={
                "profiler_details": {
                    "metric_configuration": {
                        "metric_name": metric_name,
                        "domain_kwargs": {"column": column},
                    },
                    "num_batches": 2,
                }
            },
            kwargs={
                "column": column,
                "min_value": lower,
                "max_value": upper,
                "mostly": 1.0,
            },
        )
        for column, min_bounds, max_bounds in column_min_max_bounds
        for expectation_type, metric_name, (lower, upper) in (
            ("expect_column_min_to_be_between", "column.min", min_bounds),
            ("expect_column_max_to_be_between", "column.max", max_bounds),
        )
    ]

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configurations.extend(
        my_row_count_range_rule_expectation_configurations_oneshot_sampling_method
    )
    expectation_configurations.extend(
        my_column_ranges_rule_expectation_configurations_oneshot_sampling_method
    )

    expectation_suite_name_oneshot_sampling_method: str = (
        "bobby_columnar_table_multi_batch_oneshot_sampling_method"
    )
    expected_expectation_suite_oneshot_sampling_method: ExpectationSuite = (
        ExpectationSuite(
            expectation_suite_name=expectation_suite_name_oneshot_sampling_method
        )
    )
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in expectation_configurations:
        expected_expectation_suite_oneshot_sampling_method.add_expectation(
            expectation_configuration
        )

    yaml = YAML()
    profiler_config: dict = yaml.load(verbose_profiler_config)
    expected_expectation_suite_oneshot_sampling_method.add_citation(
        comment="Suite created by Rule-Based Profiler with the configuration included.",
        profiler_config=profiler_config,
    )

    return {
        "profiler_config": verbose_profiler_config,
        "test_configuration_oneshot_sampling_method": {
            "expectation_suite_name": expectation_suite_name_oneshot_sampling_method,
            "expected_expectation_suite": expected_expectation_suite_oneshot_sampling_method,
        },
    }

def test_ValidationResultsTableContentBlockRenderer_generate_expectation_row_happy_path():
    evr = ExpectationValidationResult(
        success=True,
        result={
            "observed_value": True,
            "element_count": 162,
            "missing_count": 153,
            "missing_percent": 94.44444444444444,
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": "live",
                "min_value": None,
                "max_value": None,
                "result_format": "SUMMARY",
            },
            meta={"BasicDatasetProfiler": {"confidence": "very low"}},
        ),
    )
    result = ValidationResultsTableContentBlockRenderer.render([evr]).to_json_dict()
    print(result)

    # Note: A better approach to testing would separate out styling into a separate test.
    assert result == {
        "content_block_type": "table",
        "styling": {
            "body": {"classes": ["table"]},
            "classes": [
                "ml-2",
                "mr-2",
                "mt-0",
                "mb-0",
                "table-responsive",
                "hide-succeeded-validations-column-section-target-child",
            ],
        },
        "table": [
            [
                {
                    "content_block_type": "string_template",
                    "styling": {
                        "parent": {
                            "classes": ["hide-succeeded-validation-target-child"]
                        }
                    },
                    "string_template": {
                        "template": "$icon",
                        "params": {"icon": ""},
                        "styling": {
                            "params": {
                                "icon": {
                                    "classes": [
                                        "fas",
                                        "fa-check-circle",
                                        "text-success",
                                    ],
                                    "tag": "i",
                                }
                            }
                        },
                    },
                },
                {
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": "$column minimum value may have any numerical value.",
                        "params": {
                            "column": "live",
                            "min_value": None,
                            "max_value": None,
                            "result_format": "SUMMARY",
                            "parse_strings_as_datetimes": None,
                        },
                        "styling": {
                            "default": {"classes": ["badge", "badge-secondary"]},
                            "params": {
                                "column": {"classes": ["badge", "badge-primary"]}
                            },
                        },
                    },
                },
                "True",
            ]
        ],
        "header_row": ["Status", "Expectation", "Observed Value"],
    }

def test_get_and_save_expectation_suite(tmp_path_factory):
    directory_name = str(
        tmp_path_factory.mktemp("test_get_and_save_expectation_config")
    )
    df = ge.dataset.PandasDataset(
        {
            "x": [1, 2, 4],
            "y": [1, 2, 5],
            "z": ["hello", "jello", "mello"],
        }
    )
    df.expect_column_values_to_be_in_set("x", [1, 2, 4])
    df.expect_column_values_to_be_in_set(
        "y", [1, 2, 4], catch_exceptions=True, include_config=True
    )
    df.expect_column_values_to_match_regex("z", "ello")

    ### First test set ###
    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={"column": "x", "value_set": [1, 2, 4]},
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={"column": "z", "regex": "ello"},
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )
    assert output_config == df.get_expectation_suite()

    df.save_expectation_suite(directory_name + "/temp1.json")
    with open(directory_name + "/temp1.json") as infile:
        loaded_config = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_config

    ### Second test set ###
    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={"column": "x", "value_set": [1, 2, 4]},
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={"column": "y", "value_set": [1, 2, 4]},
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={"column": "z", "regex": "ello"},
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )
    assert output_config == df.get_expectation_suite(discard_failed_expectations=False)
    df.save_expectation_suite(
        directory_name + "/temp2.json", discard_failed_expectations=False
    )
    with open(directory_name + "/temp2.json") as infile:
        loaded_suite = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_suite

    ### Third test set ###
    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={
                    "column": "x",
                    "value_set": [1, 2, 4],
                    "result_format": "BASIC",
                },
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={"column": "z", "regex": "ello", "result_format": "BASIC"},
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )
    assert output_config == df.get_expectation_suite(
        discard_result_format_kwargs=False,
        discard_include_config_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )
    df.save_expectation_suite(
        directory_name + "/temp3.json",
        discard_result_format_kwargs=False,
        discard_include_config_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )
    with open(directory_name + "/temp3.json") as infile:
        loaded_suite = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_suite
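
# Illustrative sketch: a suite saved above can round-trip back into a fresh
# dataset. `ge.from_pandas(..., expectation_suite=...)` is assumed as the
# attach point; the frame and file name are hypothetical reuses of the
# fixtures in the test above.
def _example_reload_saved_suite(directory_name):
    import pandas as pd

    with open(directory_name + "/temp1.json") as f:
        suite = expectationSuiteSchema.loads(f.read())
    new_df = ge.from_pandas(
        pd.DataFrame(
            {"x": [1, 2, 4], "y": [1, 2, 5], "z": ["hello", "jello", "mello"]}
        ),
        expectation_suite=suite,
    )
    # Re-validate the new data against the reloaded suite.
    return new_df.validate()
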
def test_ProfilingResultsColumnSectionRenderer_render_header_with_unescaped_dollar_sign(
        titanic_profiled_name_column_evrs):
    evr_with_unescaped_dollar_sign = ExpectationValidationResult(
        success=True,
        result={"observed_value": "float64"},
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_type_list",
            kwargs={
                "column": "Car Insurance Premiums ($)",
                "type_list": [
                    "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4",
                    "FLOAT8", "FloatType", "NUMERIC", "float"
                ],
                "result_format": "SUMMARY"
            },
            meta={"BasicDatasetProfiler": {"confidence": "very low"}}))

    content_block = ProfilingResultsColumnSectionRenderer._render_header(
        [evr_with_unescaped_dollar_sign],
        column_type=[],
    ).to_json_dict()
    print(content_block)
    assert content_block == {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12', 'p-0'],
            'header': {'classes': ['alert', 'alert-secondary']}
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Car Insurance Premiums ($$)',
                'tooltip': {
                    'content': 'expect_column_to_exist',
                    'placement': 'top'
                },
                'tag': 'h5',
                'styling': {'classes': ['m-0', 'p-0']}
            }
        },
        'subheader': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Type: []',
                'tooltip': {
                    'content':
                        'expect_column_values_to_be_of_type <br>expect_column_values_to_be_in_type_list'
                },
                'tag': 'h6',
                'styling': {'classes': ['mt-1', 'mb-0']}
            }
        }
    }
def test_remove_expectation():
    my_df = ge.dataset.PandasDataset(
        {
            "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "y": [1, 2, None, 4, None, 6, 7, 8, 9, None],
            "z": [
                "cello", "hello", "jello", "bellow", "fellow",
                "mellow", "wellow", "xello", "yellow", "zello",
            ],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )
    my_df.expect_column_values_to_be_of_type("x", "int")
    my_df.expect_column_values_to_be_of_type("y", "int")
    my_df.expect_column_values_to_be_of_type(
        "z", "int", include_config=True, catch_exceptions=True)
    my_df.expect_column_values_to_be_increasing("x")
    my_df.expect_column_values_to_match_regex("z", "ello")

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation("expect_column_to_exist", "w", dry_run=True)
    assert "No matching expectation found." in str(exc.value)

    assert my_df.remove_expectation(
        "expect_column_to_exist", "x", expectation_kwargs={}, dry_run=True
    ) == ExpectationConfiguration(
        expectation_type="expect_column_to_exist", kwargs={"column": "x"})

    assert my_df.remove_expectation(
        "expect_column_to_exist", expectation_kwargs={"column": "y"}, dry_run=True
    ) == ExpectationConfiguration(
        expectation_type="expect_column_to_exist", kwargs={"column": "y"})

    assert my_df.remove_expectation(
        "expect_column_to_exist",
        expectation_kwargs={"column": "y"},
        remove_multiple_matches=True,
        dry_run=True,
    ) == [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "y"})
    ]

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation("expect_column_to_exist", dry_run=True)
    assert "Multiple expectations matched arguments. No expectations removed." in str(
        exc.value)

    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"}),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "y"}),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "z"}),
    ]
    assert (my_df.remove_expectation(
        "expect_column_to_exist", remove_multiple_matches=True,
        dry_run=True) == exp1)

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation(
            "expect_column_to_exist", "x", {"column": "y"}, dry_run=True)
    assert "Conflicting column names in find_expectation_indexes" in str(exc.value)

    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_of_type",
            kwargs={"column": "x", "type_": "int"},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_increasing",
            kwargs={"column": "x"},
        ),
    ]
    assert (my_df.remove_expectation(
        column="x", remove_multiple_matches=True, dry_run=True) == exp1)

    assert len(my_df._expectation_suite.expectations) == 8
    assert my_df.remove_expectation("expect_column_to_exist", "x") is None
    assert len(my_df._expectation_suite.expectations) == 7
    assert my_df.remove_expectation(column="x", remove_multiple_matches=True) is None
    assert len(my_df._expectation_suite.expectations) == 5
    my_df.remove_expectation(column="z", remove_multiple_matches=True)
    assert len(my_df._expectation_suite.expectations) == 2
    assert my_df.get_expectation_suite(
        discard_failed_expectations=False) == ExpectationSuite(
            expectations=[
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist",
                    kwargs={"column": "y"}),
                ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_of_type",
                    kwargs={"column": "y", "type_": "int"},
                ),
            ],
            expectation_suite_name="default",
            data_asset_type="Dataset",
            meta={"great_expectations.__version__": ge.__version__},
        )
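# Illustrative sketch (not part of the original tests): the remove_expectation
# knobs exercised above, condensed into one place. The data and names here are
# hypothetical.
def sketch_remove_expectation_matching():
    import great_expectations as ge

    df = ge.dataset.PandasDataset(
        {"a": [1], "b": [2]}, profiler=ge.profile.ColumnsExistProfiler)

    # dry_run=True reports what would be removed without mutating the suite.
    preview = df.remove_expectation(
        "expect_column_to_exist", expectation_kwargs={"column": "a"}, dry_run=True)

    # remove_multiple_matches=True returns (and, without dry_run, removes) a
    # list instead of raising when several expectations match.
    previews = df.remove_expectation(
        "expect_column_to_exist", remove_multiple_matches=True, dry_run=True)
    return preview, previews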
def test_ExpectationSuiteColumnSectionRenderer_render_header(
        titanic_profiled_name_column_expectations):
    remaining_expectations, content_blocks = (
        ExpectationSuiteColumnSectionRenderer._render_header(
            titanic_profiled_name_column_expectations,
        ))

    expected = {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12'],
            'header': {'classes': ['alert', 'alert-secondary']}
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Name',
                'tag': 'h5',
                'styling': {'classes': ['m-0']}
            }
        }
    }
    print(content_blocks.to_json_dict())
    assert content_blocks.to_json_dict() == expected

    expectation_with_unescaped_dollar_sign = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "Car Insurance Premiums ($)",
            "type_list": [
                "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4",
                "FLOAT8", "FloatType", "NUMERIC", "float"
            ],
            "result_format": "SUMMARY"
        },
        meta={"BasicDatasetProfiler": {"confidence": "very low"}})
    remaining_expectations, content_blocks = (
        ExpectationSuiteColumnSectionRenderer._render_header(
            [expectation_with_unescaped_dollar_sign],
        ))
    print(content_blocks.to_json_dict())
    expected = {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12'],
            'header': {'classes': ['alert', 'alert-secondary']}
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Car Insurance Premiums ($$)',
                'tag': 'h5',
                'styling': {'classes': ['m-0']}
            }
        }
    }
    assert content_blocks.to_json_dict() == expected
def _create_expectations_for_datetime_column(
    cls, dataset, column, excluded_expectations=None, included_expectations=None
):
    cls._create_non_nullity_expectations(
        dataset,
        column,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )

    # Default both bounds so the final range check below is well-defined even
    # when the min/max blocks are skipped via excluded/included_expectations.
    min_value = None
    max_value = None

    if (
        not excluded_expectations
        or "expect_column_min_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_min_to_be_between" in included_expectations
    ):
        min_value = dataset.expect_column_min_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]

        if min_value is not None:
            dataset.remove_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_min_to_be_between",
                    kwargs={"column": column},
                ),
                match_type="domain",
            )
            # Widen the observed minimum by one year; fall back to the
            # representable extreme on overflow, and parse strings first.
            try:
                min_value = min_value + datetime.timedelta(days=-365)
            except OverflowError:
                min_value = datetime.datetime.min
            except TypeError:
                min_value = parse(min_value) + datetime.timedelta(days=-365)

    if (
        not excluded_expectations
        or "expect_column_max_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_max_to_be_between" in included_expectations
    ):
        max_value = dataset.expect_column_max_to_be_between(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]

        if max_value is not None:
            dataset.remove_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_max_to_be_between",
                    kwargs={"column": column},
                ),
                match_type="domain",
            )
            # Symmetrically widen the observed maximum by one year.
            try:
                max_value = max_value + datetime.timedelta(days=365)
            except OverflowError:
                max_value = datetime.datetime.max
            except TypeError:
                max_value = parse(max_value) + datetime.timedelta(days=365)

    if (
        not excluded_expectations
        or "expect_column_values_to_be_between" not in excluded_expectations
    ) and (
        not included_expectations
        or "expect_column_values_to_be_between" in included_expectations
    ):
        if min_value is not None or max_value is not None:
            dataset.expect_column_values_to_be_between(
                column, min_value, max_value, parse_strings_as_datetimes=True
            )
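# Illustrative sketch (not part of the original code): a minimal, hedged
# example of driving the datetime helper above. The profiler class name
# (BasicSuiteBuilderProfiler) and the sample data are assumptions here.
def sketch_profile_datetime_column():
    import datetime

    import great_expectations as ge
    from great_expectations.profile.basic_suite_builder_profiler import (
        BasicSuiteBuilderProfiler,
    )

    dataset = ge.dataset.PandasDataset({
        "created_at": [
            datetime.datetime(2020, 1, 1),
            datetime.datetime(2020, 12, 31),
        ]
    })
    # Expected effect: the observed min/max are widened by +/- 365 days and
    # expect_column_values_to_be_between is added for the column.
    BasicSuiteBuilderProfiler._create_expectations_for_datetime_column(
        dataset, "created_at")
    return dataset.get_expectation_suite(discard_failed_expectations=False)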
def test_ExpectationSuiteColumnSectionRenderer_expectation_with_string_list_meta_notes():
    expectation_with_string_list_note = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column": "Car Insurance Premiums ($)",
            "type_list": [
                "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4",
                "FLOAT8", "FloatType", "NUMERIC", "float"
            ],
            "result_format": "SUMMARY"
        },
        meta={
            "BasicDatasetProfiler": {"confidence": "very low"},
            "notes": [
                "This is a list", "of strings", "assigned to the notes", "key."
            ]
        })
    expectations = [expectation_with_string_list_note]

    expected_result_json = {
        'content_blocks': [{
            'content_block_type': 'header',
            'styling': {
                'classes': ['col-12'],
                'header': {'classes': ['alert', 'alert-secondary']}
            },
            'header': {
                'content_block_type': 'string_template',
                'string_template': {
                    'template': 'Car Insurance Premiums ($$)',
                    'tag': 'h5',
                    'styling': {'classes': ['m-0']}
                }
            }
        }, {
            'content_block_type': 'bullet_list',
            'styling': {'classes': ['col-12']},
            'bullet_list': [[{
                'content_block_type': 'string_template',
                'string_template': {
                    'template': 'value types must belong to this set: $v__0 $v__1 '
                                '$v__2 $v__3 $v__4 $v__5 $v__6 $v__7.',
                    'params': {
                        "column": "Car Insurance Premiums ($)",
                        "type_list": [
                            "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4",
                            "FLOAT8", "FloatType", "NUMERIC", "float"
                        ],
                        "result_format": "SUMMARY",
                        "mostly": None,
                        "v__0": "DOUBLE_PRECISION",
                        "v__1": "DoubleType",
                        "v__2": "FLOAT",
                        "v__3": "FLOAT4",
                        "v__4": "FLOAT8",
                        "v__5": "FloatType",
                        "v__6": "NUMERIC",
                        "v__7": "float"
                    },
                    'styling': {
                        'default': {'classes': ['badge', 'badge-secondary']},
                        'params': {
                            'column': {'classes': ['badge', 'badge-primary']}
                        }
                    }
                }
            }, {
                'content_block_type': 'collapse',
                'styling': {
                    'body': {'classes': ['card', 'card-body', 'p-1']},
                    'parent': {'styles': {'list-style-type': 'none'}}
                },
                'collapse_toggle_link': {
                    'content_block_type': 'string_template',
                    'string_template': {
                        'template': '$icon',
                        'params': {'icon': ''},
                        'styling': {
                            'params': {
                                'icon': {
                                    'classes': ['fas', 'fa-comment', 'text-info'],
                                    'tag': 'i'
                                }
                            }
                        }
                    }
                },
                'collapse': [{
                    'content_block_type': 'text',
                    'styling': {
                        'classes': ['col-12', 'mt-2', 'mb-2'],
                        'parent': {'styles': {'list-style-type': 'none'}}
                    },
                    'subheader': 'Notes:',
                    'text': [
                        'This is a list', 'of strings',
                        'assigned to the notes', 'key.'
                    ]
                }],
                'inline_link': True
            }], {
                'content_block_type': 'string_template',
                'styling': {'parent': {'styles': {'list-style-type': 'none'}}},
                'string_template': {
                    'template': '',
                    'tag': 'hr',
                    'styling': {'classes': ['mt-1', 'mb-1']}
                }
            }]
        }],
        'section_name': 'Car Insurance Premiums ($)'
    }

    result_json = ExpectationSuiteColumnSectionRenderer().render(
        expectations).to_json_dict()
    print(result_json)
    assert result_json == expected_result_json
def test_expectation_suite_filedata_asset():
    # Load in data files
    file_path = file_relative_path(__file__, "../test_sets/toy_data_complete.csv")

    # Create FileDataAsset objects
    f_dat = ge.data_asset.FileDataAsset(file_path)

    # Set up expectations
    f_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        catch_exceptions=True,
    )
    f_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S",
        expected_max_count=2,
        skip=1,
        result_format="SUMMARY",
        include_config=True,
    )

    # Test basic config output
    complete_config = f_dat.get_expectation_suite()
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs=ExpectationKwargs(expected_count=3, regex=",\\S", skip=1),
        )
    ] == complete_config.expectations

    # Include result format kwargs
    complete_config2 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=False)
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_be_between",
            kwargs={
                "expected_max_count": 2,
                "regex": ",\\S",
                "result_format": "SUMMARY",
                "skip": 1,
            },
        ),
    ] == complete_config2.expectations

    # Discard failing expectations
    complete_config3 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=True)
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        )
    ] == complete_config3.expectations
def test_column_map_expectation_decorator():
    # Create a new CustomPandasDataset to
    # (1) demonstrate that custom subclassing works, and
    # (2) test expectation business logic without dependencies on any other functions.
    class CustomPandasDataset(PandasDataset):
        @MetaPandasDataset.column_map_expectation
        def expect_column_values_to_be_odd(self, column):
            return column.map(lambda x: x % 2)

        @MetaPandasDataset.column_map_expectation
        def expectation_that_crashes_on_sixes(self, column):
            return column.map(lambda x: (x - 6) / 0 != "duck")

    df = CustomPandasDataset({
        "all_odd": [1, 3, 5, 5, 5, 7, 9, 9, 9, 11],
        "mostly_odd": [1, 3, 5, 7, 9, 2, 4, 1, 3, 5],
        "all_even": [2, 4, 4, 6, 6, 6, 8, 8, 8, 8],
        "odd_missing": [1, 3, 5, None, None, None, None, 1, 3, None],
        "mixed_missing": [1, 3, 5, None, None, 2, 4, 1, 3, None],
        "all_missing": [None, None, None, None, None,
                        None, None, None, None, None],
    })
    df.set_default_expectation_argument("result_format", "COMPLETE")
    df.set_default_expectation_argument("include_config", False)

    assert df.expect_column_values_to_be_odd(
        "all_odd") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": 0.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "all_missing") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 10,
                "missing_percent": 100.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": None,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "odd_missing") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 5,
                "missing_percent": 50.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": 0.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "mixed_missing") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 3,
                "missing_percent": 30.0,
                "partial_unexpected_counts": [
                    {"value": 2.0, "count": 1},
                    {"value": 4.0, "count": 1},
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count": 2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent": 20.0,
                "unexpected_percent_nonmissing": (2 / 7 * 100),
            },
            success=False,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0,
                "partial_unexpected_counts": [
                    {"value": 2.0, "count": 1},
                    {"value": 4.0, "count": 1},
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count": 2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent": 20.0,
                "unexpected_percent_nonmissing": 20.0,
            },
            success=False,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd", mostly=0.6) == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0,
                "partial_unexpected_counts": [
                    {"value": 2.0, "count": 1},
                    {"value": 4.0, "count": 1},
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count": 2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent": 20.0,
                "unexpected_percent_nonmissing": 20.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd",
        result_format="BOOLEAN_ONLY") == ExpectationValidationResult(
            success=False)

    df.default_expectation_args["result_format"] = "BOOLEAN_ONLY"
    assert df.expect_column_values_to_be_odd(
        "mostly_odd") == ExpectationValidationResult(success=False)

    df.default_expectation_args["result_format"] = "BASIC"
    assert df.expect_column_values_to_be_odd(
        "mostly_odd", include_config=True) == ExpectationValidationResult(
            expectation_config=ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_odd",
                kwargs={"column": "mostly_odd", "result_format": "BASIC"},
            ),
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0,
                "partial_unexpected_list": [2, 4],
                "unexpected_count": 2,
                "unexpected_percent": 20.0,
                "unexpected_percent_nonmissing": 20.0,
            },
            success=False,
        )
def config3():
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={"column": "a", "value_set": [1, 2, 3], "result_format": "BASIC"},
        meta={"notes": "This is another expectation."},
    )
def test_ge_pandas_automatic_failure_removal():
    df = ge.dataset.PandasDataset({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': ['a', 'b', 'c', 'd'],
        'D': ['e', 'f', 'g', 'h']
    })

    # Put some simple expectations on the data frame
    df.profile(ge.profile.ColumnsExistProfiler)
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ['w', 'x', 'y', 'z'])
    df.expect_column_values_to_be_in_set("D", ['e', 'f', 'g', 'h'])

    # First check that failing expectations are NOT automatically
    # dropped when sampling.
    # For this data frame, the expectation on column "C" above fails.
    exp1 = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'B'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'C'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'B', 'value_set': [5, 6, 7, 8]}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'C', 'value_set': ['w', 'x', 'y', 'z']}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]
    samp1 = df.sample(n=2)
    assert samp1.find_expectations() == exp1

    # Now check subsetting to verify that failing expectations are NOT
    # automatically dropped when subsetting.
    sub1 = df[['A', 'D']]
    assert sub1.find_expectations() == exp1

    # Set property/attribute so that failing expectations are
    # automatically removed when sampling or subsetting.
    df.discard_subset_failing_expectations = True

    ###
    # Note: Order matters in this test, and a validation operator may change order
    ###

    exp_samp = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'B'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'C'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'B', 'value_set': [5, 6, 7, 8]}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]
    samp2 = df.sample(n=2)
    assert samp2.find_expectations() == exp_samp

    # Now check subsetting. In addition to the failure on column "C",
    # the expectations on column "B" now fail since column "B" doesn't
    # exist in the subset.
    sub2 = df[['A', 'D']]
    exp_sub = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(
            expectation_type='expect_column_values_to_be_in_set',
            kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]
    assert sub2.find_expectations() == exp_sub
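# Illustrative sketch (not part of the original tests): the
# discard_subset_failing_expectations flag in miniature. Column names and
# values here are hypothetical.
def sketch_discard_subset_failing_expectations():
    import great_expectations as ge

    df = ge.dataset.PandasDataset({"A": [1, 2], "B": ["x", "y"]})
    df.expect_column_values_to_be_in_set("A", [1, 2])
    df.expect_column_values_to_be_in_set("B", ["x", "y"])

    df.discard_subset_failing_expectations = True
    sub = df[["A"]]  # the expectation on "B" now fails and is discarded
    return sub.find_expectations()  # only the expectation on "A" remains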