def test_profile_excludes_citations(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`,
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=alice_columnar_table_single_batch[
            "expected_expectation_suite_name"
        ],
        include_citation=False,
    )

    assert expectation_suite.meta.get("citations") is None
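# A minimal sketch of a hypothetical helper (not part of the Great Expectations
# API and not used by the tests in this module) that factors out the
# "schema roundtrip, then pop, then **kwargs" pattern repeated in every
# workflow test below:
def _instantiate_profiler_from_yaml_sketch(
    yaml_config: str, data_context: DataContext
) -> RuleBasedProfiler:
    # Roundtrip through schema validation to remove illegal fields and/or
    # restore missing fields.
    config: dict = ruleBasedProfilerConfigSchema.dump(
        ruleBasedProfilerConfigSchema.load(yaml.load(yaml_config))
    )
    # `class_name`/`module_name` are consumed by `instantiate_class_from_config`
    # and are not accepted by the RuleBasedProfiler constructor.
    config.pop("class_name")
    config.pop("module_name")
    return RuleBasedProfiler(**config, data_context=data_context)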
def test_resolve_config_using_acceptable_arguments(
    profiler_with_placeholder_args: RuleBasedProfiler,
) -> None:
    old_config: RuleBasedProfilerConfig = profiler_with_placeholder_args.config

    # Roundtrip through schema validation to add and/or restore any missing fields.
    old_deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        old_config.to_json_dict()
    )
    old_deserialized_config.pop("class_name")
    old_deserialized_config.pop("module_name")

    old_config = RuleBasedProfilerConfig(**old_deserialized_config)

    # A brand-new config is created, but existing attributes are unchanged.
    new_config: RuleBasedProfilerConfig = (
        RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
            profiler=profiler_with_placeholder_args,
        )
    )

    # Serialize to a JSON dict and strip the fields consumed by
    # `instantiate_class_from_config` so that the **kwargs instantiation pattern can be used.
    new_deserialized_config: dict = new_config.to_json_dict()
    new_deserialized_config.pop("class_name")
    new_deserialized_config.pop("module_name")

    new_config = RuleBasedProfilerConfig(**new_deserialized_config)

    assert id(old_config) != id(new_config)
    assert all(
        old_config[attr] == new_config[attr] for attr in ("config_version", "name")
    )
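# Note: the id() comparison above distinguishes "distinct objects" from "equal
# values": resolve_config_using_acceptable_arguments() must return a fresh
# RuleBasedProfilerConfig rather than a mutated alias of the old one, even
# though core attributes such as "config_version" and "name" compare equal.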
def test_bobster_profiler_user_workflow_multi_batch_row_count_range_rule_bootstrap_sampling_method(
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context,
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000,
):
    # Load data context
    data_context: DataContext = (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context
    )

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "profiler_config"
    ]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`,
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"
        ]["expectation_suite_name"],
    )

    expect_table_row_count_to_be_between_expectation_configuration_kwargs: dict = (
        expectation_suite.to_json_dict()["expectations"][0]["kwargs"]
    )
    min_value: int = expect_table_row_count_to_be_between_expectation_configuration_kwargs[
        "min_value"
    ]
    max_value: int = expect_table_row_count_to_be_between_expectation_configuration_kwargs[
        "max_value"
    ]

    # The bootstrapped bounds should straddle the mean row count: min_value falls
    # between the lower reference value and the mean; max_value falls between the
    # mean and the upper reference value.
    assert (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"
        ]["expect_table_row_count_to_be_between_min_value_mean_value"]
        < min_value
        < bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"
        ]["expect_table_row_count_to_be_between_mean_value"]
    )
    assert (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"
        ]["expect_table_row_count_to_be_between_mean_value"]
        < max_value
        < bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"
        ]["expect_table_row_count_to_be_between_max_value_mean_value"]
    )
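# A minimal sketch, using plain numpy, of what a "bootstrap"-style estimate of a
# row-count range looks like. Assumptions: this is illustrative only and does
# not mirror the library's internal estimator; the function and its parameters
# are hypothetical and unused by the tests.
def _bootstrap_row_count_range_sketch(
    row_counts: List[int],
    n_resamples: int = 9999,
    lower_quantile: float = 0.025,
    upper_quantile: float = 0.975,
    seed: int = 0,
):
    """Resample observed per-batch row counts with replacement and return the
    empirical (lower_quantile, upper_quantile) bounds of the resampled means.
    """
    rng = np.random.default_rng(seed)
    resampled_means = np.array(
        [
            rng.choice(row_counts, size=len(row_counts), replace=True).mean()
            for _ in range(n_resamples)
        ]
    )
    return (
        float(np.quantile(resampled_means, lower_quantile)),
        float(np.quantile(resampled_means, upper_quantile)),
    )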
def test_bobby_profiler_user_workflow_multi_batch_row_count_range_rule_and_column_ranges_rule_oneshot_sampling_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
    bobby_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`,
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=bobby_columnar_table_multi_batch[
            "test_configuration_oneshot_sampling_method"
        ]["expectation_suite_name"],
        include_citation=True,
    )

    assert (
        expectation_suite
        == bobby_columnar_table_multi_batch["test_configuration_oneshot_sampling_method"][
            "expected_expectation_suite"
        ]
    )
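# Complementary to test_profile_excludes_citations above: with
# include_citation=True, the resulting suite's meta should carry a "citations"
# entry recording the profiler config that built it. A sketch of an explicit
# check (not present in the original test, where this is presumably covered by
# the full-suite comparison):
#
#     assert expectation_suite.meta.get("citations") is not None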
def test_notebook_execution_rule_based_profiler_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
    bobby_columnar_table_multi_batch,
):
    """
    To set this test up, we:
    - create a suite using the Rule-Based Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private CLI method

    We then:
    - execute that notebook (note: this will raise errors such as
      CellExecutionError if any cell in the notebook fails; a minimal execution
      sketch follows this test)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`,
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=context,
    )

    profiler_name: str = "bobby_user_workflow"
    context.save_profiler(
        profiler=profiler,
        name=profiler_name,
    )

    # Create notebook.
    # We do not want to actually send the usage message, since the function call
    # is not the result of actual usage.
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=profiler_name,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 13,
        "successful_expectations": 13,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "expectation_type": "expect_table_row_count_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Unnamed: 0"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Unnamed: 0"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1313,
                    "min_value": 1313,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Age"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 0.17,
                    "min_value": 0.17,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Age"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 71.0,
                    "min_value": 71.0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Survived"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Survived"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "SexCode"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "SexCode"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "PClass",
                    "value_set": ["*", "1st", "2nd", "3rd"],
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "Sex", "value_set": ["female", "male"]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "Survived", "value_set": [0, 1]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "SexCode", "value_set": [0, 1]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        # Sort any list-valued kwargs (e.g., "value_set") so that the comparison
        # below is insensitive to element order.
        kwargs = {
            key: sorted(value) if isinstance(value, (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations
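# A minimal sketch of the notebook-execution step referenced in the docstring of
# the test above. Assumption: the run_notebook test helper wraps machinery
# similar to nbconvert's ExecutePreprocessor; this is illustrative, not the
# helper's actual implementation.
def _execute_notebook_sketch(notebook_path: str, notebook_dir: str) -> None:
    import nbformat
    from nbconvert.preprocessors import ExecutePreprocessor

    nb = nbformat.read(notebook_path, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    # preprocess() raises CellExecutionError if any cell in the notebook fails.
    ep.preprocess(nb, {"metadata": {"path": notebook_dir}})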
def test_quentin_profiler_user_workflow_multi_batch_quantiles_value_ranges_rule(
    quentin_columnar_table_multi_batch_data_context,
    quentin_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = quentin_columnar_table_multi_batch_data_context

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = quentin_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`,
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=quentin_columnar_table_multi_batch["test_configuration"][
            "expectation_suite_name"
        ],
    )

    expectation_configuration_dict: dict
    column_name: str
    expectation_kwargs: dict
    expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict: Dict[
        str, dict
    ] = {
        expectation_configuration_dict["kwargs"]["column"]: expectation_configuration_dict[
            "kwargs"
        ]
        for expectation_configuration_dict in expectation_suite.to_json_dict()[
            "expectations"
        ]
    }
    expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column: Dict[
        str, List[List[Number]]
    ] = {
        column_name: expectation_kwargs["value_ranges"]
        for column_name, expectation_kwargs in expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict.items()
    }

    assert (
        expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column[
            "tolls_amount"
        ]
        == quentin_columnar_table_multi_batch["test_configuration"][
            "expect_column_quantile_values_to_be_between_quantile_ranges_by_column"
        ]["tolls_amount"]
    )

    # The measure of "closeness" between "actual" and "desired" is computed as:
    # atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-7
    atol: float = 5.0e-2

    value_range: List[Number]
    paired_quantiles: zip
    column_quantiles: List[List[Number]]
    idx: int
    for (
        column_name,
        column_quantiles,
    ) in expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column.items():
        paired_quantiles = zip(
            column_quantiles,
            quentin_columnar_table_multi_batch["test_configuration"][
                "expect_column_quantile_values_to_be_between_quantile_ranges_by_column"
            ][column_name],
        )
        for value_range in list(paired_quantiles):
            for idx in range(2):
                np.testing.assert_allclose(
                    actual=value_range[0][idx],
                    desired=value_range[1][idx],
                    rtol=rtol,
                    atol=atol,
                    err_msg=f"Actual value of {value_range[0][idx]} differs from expected value of {value_range[1][idx]} by more than {atol + rtol * abs(value_range[1][idx])} tolerance.",
                )
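# Worked example of the closeness formula used above, with the same constants
# (rtol=1.0e-7, atol=5.0e-2): a desired quantile of 12.0 tolerates an absolute
# difference of up to atol + rtol * abs(12.0) == 0.0500012, so:
#
#     np.testing.assert_allclose(actual=12.05, desired=12.0, rtol=1.0e-7, atol=5.0e-2)  # passes
#     np.testing.assert_allclose(actual=12.06, desired=12.0, rtol=1.0e-7, atol=5.0e-2)  # raises AssertionError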