def __init__(self, expectation_suite_identifier, run_id, batch_identifier):
    """Constructs a ValidationResultIdentifier

    Args:
        expectation_suite_identifier (ExpectationSuiteIdentifier, list, tuple, or dict):
            identifying information for the fully qualified expectation suite used to validate
        run_id (RunIdentifier): The run_id for which validation occurred
        batch_identifier (str): The identifier of the batch that was validated
    """
    super().__init__()
    self._expectation_suite_identifier = expectation_suite_identifier
    if isinstance(run_id, str):
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional).",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            run_time = None
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif run_id is None:
        run_id = RunIdentifier()
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=str(run_id))
    self._run_id = run_id
    self._batch_identifier = batch_identifier
def __init__(
    self,
    run_id,
    data_asset_name,
    expectation_suite_identifier,
    metric_name,
    metric_kwargs,
    metric_value,
):
    super().__init__(metric_name, metric_kwargs, metric_value)
    if not isinstance(expectation_suite_identifier, ExpectationSuiteIdentifier):
        expectation_suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_identifier
        )
    if isinstance(run_id, str):
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional).",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            run_time = None
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif run_id is None:
        run_id = RunIdentifier()
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=str(run_id))
    self._run_id = run_id
    self._data_asset_name = data_asset_name
    self._expectation_suite_identifier = expectation_suite_identifier
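# The two constructors above coerce legacy run_id values into a RunIdentifier. A minimal
# usage sketch; the import paths below are assumed to match the Great Expectations version
# these snippets come from and may differ in other releases.
from great_expectations.core.run_identifier import RunIdentifier
from great_expectations.data_context.types.resource_identifiers import (
    ExpectationSuiteIdentifier,
    ValidationResultIdentifier,
)

suite_id = ExpectationSuiteIdentifier(expectation_suite_name="asset.default")

# Preferred: pass an explicit RunIdentifier (a dict of run_name/run_time also works).
vri = ValidationResultIdentifier(
    expectation_suite_identifier=suite_id,
    run_id=RunIdentifier(run_name="nightly", run_time="2019-09-26T13:42:41Z"),
    batch_identifier="1234",
)

# Legacy: a bare string still works but emits a DeprecationWarning; if the string parses
# as a timestamp it also supplies run_time.
legacy = ValidationResultIdentifier(
    expectation_suite_identifier=suite_id,
    run_id="20190926T134241.000000Z",
    batch_identifier="1234",
)
assert isinstance(legacy.run_id, RunIdentifier)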
def profile(
    cls,
    data_asset,
    run_id=None,
    profiler_configuration=None,
    run_name=None,
    run_time=None,
):
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."
    if isinstance(run_id, str) and not run_name:
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        run_name = run_name or "profiling"
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    if not cls.validate(data_asset):
        raise GreatExpectationsError("Invalid data_asset for profiler; aborting")

    expectation_suite = cls._profile(data_asset, configuration=profiler_configuration)

    batch_kwargs = data_asset.batch_kwargs
    expectation_suite = cls.add_meta(expectation_suite, batch_kwargs)
    validation_results = data_asset.validate(
        expectation_suite, run_id=run_id, result_format="SUMMARY"
    )
    expectation_suite.add_citation(
        comment=str(cls.__name__) + " added a citation based on the current batch.",
        batch_kwargs=data_asset.batch_kwargs,
        batch_markers=data_asset.batch_markers,
        batch_parameters=data_asset.batch_parameters,
    )
    return expectation_suite, validation_results
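# The classmethod above returns an (expectation_suite, validation_results) pair. A hedged
# usage sketch with BasicDatasetProfiler; `context` and `batch_kwargs` are hypothetical (a
# configured DataContext and valid batch kwargs for one of its datasources).
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

batch = context.get_batch(batch_kwargs, expectation_suite_name="titanic.profiled")
suite, validation_results = BasicDatasetProfiler.profile(
    batch,
    run_name="profiling",  # becomes RunIdentifier.run_name since no run_id is given
)
context.save_expectation_suite(suite)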
def validation_result_suite_id():
    return ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier("asset.default"),
        run_id=RunIdentifier(run_name="test_100"),
        batch_identifier="1234",
    )
def test_TupleGCSStoreBackend_base_public_path(): """ What does this test and why? the base_public_path parameter allows users to point to a custom DNS when hosting Data docs. This test will exercise the get_url_for_key method twice to see that we are getting the expected url, with or without base_public_path """ bucket = "leakybucket" prefix = "this_is_a_test_prefix" project = "dummy-project" base_public_path = "http://www.test.com/" my_store_with_base_public_path = TupleGCSStoreBackend( filepath_template=None, bucket=bucket, prefix=prefix, project=project, base_public_path=base_public_path, ) with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_client = mock_gcs_client.return_value mock_bucket = mock_client.get_bucket.return_value mock_blob = mock_bucket.blob.return_value my_store_with_base_public_path.set(("BBB", ), b"bbb", content_encoding=None, content_type="image/png") run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow()) key = ValidationResultIdentifier( ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"), run_id, "my_batch_id", ) run_time_string = run_id.to_tuple()[1] url = my_store_with_base_public_path.get_public_url_for_key(key.to_tuple()) assert ( url == "http://www.test.com/leakybucket" + f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id" )
def validation_result_suite_extended_id():
    return ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier("asset.default"),
        run_id=RunIdentifier(
            run_name="test_100", run_time="Tue May 08 15:14:45 +0800 2012"
        ),
        batch_identifier=BatchIdentifier(
            batch_identifier="1234", data_asset_name="asset"
        ),
    )
def test_StoreAction(): fake_in_memory_store = ValidationsStore( store_backend={"class_name": "InMemoryStoreBackend",} ) stores = {"fake_in_memory_store": fake_in_memory_store} class Object: pass data_context = Object() data_context.stores = stores action = StoreValidationResultAction( data_context=data_context, target_store_name="fake_in_memory_store", ) assert fake_in_memory_store.list_keys() == [] action.run( validation_result_suite_identifier=ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name="default_expectations" ), run_id="prod_20190801", batch_identifier="1234", ), validation_result_suite=ExpectationSuiteValidationResult( success=False, results=[] ), data_asset=None, ) expected_run_id = RunIdentifier( run_name="prod_20190801", run_time="20190926T134241.000000Z" ) assert len(fake_in_memory_store.list_keys()) == 1 stored_identifier = fake_in_memory_store.list_keys()[0] assert stored_identifier.batch_identifier == "1234" assert ( stored_identifier.expectation_suite_identifier.expectation_suite_name == "default_expectations" ) assert stored_identifier.run_id == expected_run_id assert fake_in_memory_store.get( ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name="default_expectations" ), run_id=expected_run_id, batch_identifier="1234", ) ) == ExpectationSuiteValidationResult(success=False, results=[])
def test_database_evaluation_parameter_store_get_bind_params(param_store): # Bind params must be expressed as a string-keyed dictionary. # Verify that the param_store supports that run_id = RunIdentifier(run_name=datetime.datetime.now( datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")) metric_identifier = ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier="asset.warning", metric_name= "expect_column_values_to_match_regex.result.unexpected_percent", metric_kwargs_id="column=mycol", ) metric_value = 12.3456789 param_store.set(metric_identifier, metric_value) metric_identifier = ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier="asset.warning", metric_name= "expect_table_row_count_to_be_between.result.observed_value", metric_kwargs_id=None, ) metric_value = 512 param_store.set(metric_identifier, metric_value) metric_identifier = ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier="asset2.warning", metric_name= "expect_column_values_to_match_regex.result.unexpected_percent", metric_kwargs_id="column=mycol", ) metric_value = 12.3456789 param_store.set(metric_identifier, metric_value) params = param_store.get_bind_params(run_id) assert params == { "urn:great_expectations:validations:asset.warning:" "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol": 12.3456789, "urn:great_expectations:validations:asset.warning:" "expect_table_row_count_to_be_between.result.observed_value": 512, "urn:great_expectations:validations:asset2.warning:" "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol": 12.3456789, }
def test_database_evaluation_parameter_store_basics(param_store):
    run_id = RunIdentifier(
        run_name=datetime.datetime.now(datetime.timezone.utc).strftime(
            "%Y%m%dT%H%M%S.%fZ"
        )
    )
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name="expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    value = param_store.get(metric_identifier)
    assert value == metric_value
def from_tuple(cls, tuple_):
    if len(tuple_) < 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier tuple must have at least six components."
        )
    if tuple_[2] == "__":
        tuple_data_asset_name = None
    else:
        tuple_data_asset_name = tuple_[2]
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
    return cls(
        run_id=RunIdentifier.from_tuple((tuple_[0], tuple_[1])),
        data_asset_name=tuple_data_asset_name,
        expectation_suite_identifier=ExpectationSuiteIdentifier.from_tuple(tuple_[3:-2]),
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
def from_fixed_length_tuple(cls, tuple_):
    if len(tuple_) != 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier fixed length tuple must have exactly six components."
        )
    if tuple_[2] == "__":
        tuple_data_asset_name = None
    else:
        tuple_data_asset_name = tuple_[2]
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
    return cls(
        run_id=RunIdentifier.from_fixed_length_tuple((tuple_[0], tuple_[1])),
        data_asset_name=tuple_data_asset_name,
        expectation_suite_identifier=ExpectationSuiteIdentifier.from_fixed_length_tuple(
            tuple((tuple_[3],))
        ),
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
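# The two constructors above imply the tuple layout:
#   (run_name, run_time, data_asset_name or "__", *suite_name_parts, metric_name, metric_kwargs_id)
# A round-trip sketch, assuming the matching to_tuple()/to_fixed_length_tuple() methods
# serialize in the same order; the module paths below are assumptions and may differ by version.
from great_expectations.core.metric import ValidationMetricIdentifier
from great_expectations.core.run_identifier import RunIdentifier

metric_id = ValidationMetricIdentifier(
    run_id=RunIdentifier(run_name="nightly", run_time="2019-09-26T13:42:41Z"),
    data_asset_name=None,  # serialized as the "__" placeholder
    expectation_suite_identifier="asset.warning",
    metric_name="expect_column_values_to_match_regex.result.unexpected_percent",
    metric_kwargs_id="column=mycol",
)
assert ValidationMetricIdentifier.from_tuple(metric_id.to_tuple()) == metric_id
assert (
    ValidationMetricIdentifier.from_fixed_length_tuple(metric_id.to_fixed_length_tuple())
    == metric_id
)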
def test_StoreMetricsAction_column_metric( basic_in_memory_data_context_for_validation_operator, ): action = StoreMetricsAction( data_context=basic_in_memory_data_context_for_validation_operator, requested_metrics={ "*": [ { "column": { "provider_id": [ "expect_column_values_to_be_unique.result.unexpected_count" ] } }, "statistics.evaluated_expectations", "statistics.successful_expectations", ] }, target_store_name="metrics_store", ) run_id = RunIdentifier(run_name="bar") validation_result = ExpectationSuiteValidationResult( success=False, meta={ "expectation_suite_name": "foo", "run_id": run_id }, results=[ ExpectationValidationResult( meta={}, result={ "element_count": 10, "missing_count": 0, "missing_percent": 0.0, "unexpected_count": 7, "unexpected_percent": 0.0, "unexpected_percent_nonmissing": 0.0, "partial_unexpected_list": [], }, success=True, expectation_config=ExpectationConfiguration( expectation_type="expect_column_values_to_be_unique", kwargs={ "column": "provider_id", "result_format": "BASIC" }, ), exception_info=None, ) ], statistics={ "evaluated_expectations": 5, "successful_expectations": 3 }, ) action.run( validation_result, ValidationResultIdentifier.from_object(validation_result), data_asset=None, ) assert ( basic_in_memory_data_context_for_validation_operator. stores["metrics_store"].get( ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier=ExpectationSuiteIdentifier("foo"), metric_name= "expect_column_values_to_be_unique.result.unexpected_count", metric_kwargs_id="column=provider_id", )) == 7)
def run( self, template_name: Optional[str] = None, run_name_template: Optional[str] = None, expectation_suite_name: Optional[str] = None, batch_request: Optional[Union[BatchRequest, dict]] = None, action_list: Optional[List[dict]] = None, evaluation_parameters: Optional[dict] = None, runtime_configuration: Optional[dict] = None, validations: Optional[List[dict]] = None, profilers: Optional[List[dict]] = None, run_id=None, run_name=None, run_time=None, result_format=None, **kwargs, ) -> CheckpointResult: assert not (run_id and run_name) and not ( run_id and run_time ), "Please provide either a run_id or run_name and/or run_time." run_time = run_time or datetime.now() runtime_configuration: dict = runtime_configuration or {} result_format: Optional[ dict] = result_format or runtime_configuration.get("result_format") if result_format is None: result_format = {"result_format": "SUMMARY"} runtime_kwargs = { "template_name": template_name, "run_name_template": run_name_template, "expectation_suite_name": expectation_suite_name, "batch_request": batch_request, "action_list": action_list, "evaluation_parameters": evaluation_parameters, "runtime_configuration": runtime_configuration, "validations": validations, "profilers": profilers, } substituted_runtime_config: CheckpointConfig = self.get_substituted_config( runtime_kwargs=runtime_kwargs) run_name_template: Optional[ str] = substituted_runtime_config.run_name_template validations: list = substituted_runtime_config.validations run_results = {} if run_name is None and run_name_template is not None: run_name: str = get_datetime_string_from_strftime_format( format_str=run_name_template, datetime_obj=run_time) run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time) for idx, validation_dict in enumerate(validations): try: substituted_validation_dict: dict = get_substituted_validation_dict( substituted_runtime_config=substituted_runtime_config, validation_dict=validation_dict, ) batch_request: BatchRequest = substituted_validation_dict.get( "batch_request") expectation_suite_name: str = substituted_validation_dict.get( "expectation_suite_name") action_list: list = substituted_validation_dict.get( "action_list") validator: Validator = self.data_context.get_validator( batch_request=batch_request, expectation_suite_name=expectation_suite_name, ) action_list_validation_operator: ActionListValidationOperator = ( ActionListValidationOperator( data_context=self.data_context, action_list=action_list, result_format=result_format, name=f"{self.name}-checkpoint-validation[{idx}]", )) val_op_run_result: ValidationOperatorResult = ( action_list_validation_operator.run( assets_to_validate=[validator], run_id=run_id, evaluation_parameters=substituted_validation_dict.get( "evaluation_parameters"), result_format=result_format, )) run_results.update(val_op_run_result.run_results) except CheckpointError as e: raise CheckpointError( f"Exception occurred while running validation[{idx}] of checkpoint '{self.name}': {e.message}" ) return CheckpointResult(run_id=run_id, run_results=run_results, checkpoint_config=self.config)
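# In the run() method above, an explicit run_id wins; otherwise run_name (possibly rendered
# from run_name_template against run_time) and run_time are wrapped into a RunIdentifier.
# A hedged usage sketch; `context` is a hypothetical configured DataContext and
# "my_checkpoint" a hypothetical checkpoint name.
import datetime

checkpoint = context.get_checkpoint("my_checkpoint")
result = checkpoint.run(
    run_name="nightly",
    run_time=datetime.datetime(2019, 9, 26, 13, 42, 41, tzinfo=datetime.timezone.utc),
)
# result.run_id carries the RunIdentifier built above; result.success aggregates the validations.
assert result.success in (True, False)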
def test_configuration_driven_site_builder( site_builder_data_context_with_html_store_titanic_random, ): context = site_builder_data_context_with_html_store_titanic_random context.add_validation_operator( "validate_and_store", { "class_name": "ActionListValidationOperator", "action_list": [ { "name": "store_validation_result", "action": { "class_name": "StoreValidationResultAction", "target_store_name": "validations_store", }, }, { "name": "extract_and_store_eval_parameters", "action": { "class_name": "StoreEvaluationParametersAction", "target_store_name": "evaluation_parameter_store", }, }, ], }, ) # profiling the Titanic datasource will generate one expectation suite and one validation # that is a profiling result datasource_name = "titanic" data_asset_name = "Titanic" profiler_name = "BasicDatasetProfiler" generator_name = "subdir_reader" context.profile_datasource(datasource_name) # creating another validation result using the profiler's suite (no need to use a new expectation suite # for this test). having two validation results - one with run id "profiling" - allows us to test # the logic of run_name_filter that helps filtering validation results to be included in # the profiling and the validation sections. batch_kwargs = context.build_batch_kwargs( datasource=datasource_name, batch_kwargs_generator=generator_name, name=data_asset_name, ) expectation_suite_name = "{}.{}.{}.{}".format(datasource_name, generator_name, data_asset_name, profiler_name) batch = context.get_batch( batch_kwargs=batch_kwargs, expectation_suite_name=expectation_suite_name, ) run_id = RunIdentifier(run_name="test_run_id_12345") context.run_validation_operator( assets_to_validate=[batch], run_id=run_id, validation_operator_name="validate_and_store", ) data_docs_config = context._project_config.data_docs_sites local_site_config = data_docs_config["local_site"] validations_set = set(context.stores["validations_store"].list_keys()) assert len(validations_set) == 6 assert (ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name=expectation_suite_name), run_id="test_run_id_12345", batch_identifier=batch.batch_id, ) in validations_set) assert (ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name=expectation_suite_name), run_id="profiling", batch_identifier=batch.batch_id, ) in validations_set) assert (ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name=expectation_suite_name), run_id="profiling", batch_identifier=batch.batch_id, ) in validations_set) assert (ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( expectation_suite_name=expectation_suite_name), run_id="profiling", batch_identifier=batch.batch_id, ) in validations_set) site_builder = SiteBuilder( data_context=context, runtime_environment={"root_directory": context.root_directory}, **local_site_config) res = site_builder.build() index_page_locator_info = res[0] index_links_dict = res[1] # assert that how-to buttons and related elements are rendered (default behavior) assert_how_to_buttons(context, index_page_locator_info, index_links_dict) # print(json.dumps(index_page_locator_info, indent=2)) assert (index_page_locator_info == "file://" + context.root_directory + "/uncommitted/data_docs/local_site/index.html") # print(json.dumps(index_links_dict, indent=2)) assert "site_name" in index_links_dict assert "expectations_links" in index_links_dict assert 
len(index_links_dict["expectations_links"]) == 5 assert "validations_links" in index_links_dict assert (len(index_links_dict["validations_links"]) == 1), """ The only rendered validation should be the one not generated by the profiler """ assert "profiling_links" in index_links_dict assert len(index_links_dict["profiling_links"]) == 5 # save documentation locally os.makedirs("./tests/render/output", exist_ok=True) os.makedirs("./tests/render/output/documentation", exist_ok=True) if os.path.isdir("./tests/render/output/documentation"): shutil.rmtree("./tests/render/output/documentation") shutil.copytree( os.path.join( site_builder_data_context_with_html_store_titanic_random. root_directory, "uncommitted/data_docs/", ), "./tests/render/output/documentation", ) # let's create another validation result and run the site builder to add it # to the data docs # the operator does not have an StoreValidationResultAction action configured, so the site # will not be updated without our call to site builder expectation_suite_path_component = expectation_suite_name.replace(".", "/") validation_result_page_path = os.path.join( site_builder.site_index_builder.target_store. store_backends[ValidationResultIdentifier].full_base_directory, "validations", expectation_suite_path_component, run_id.run_name, run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"), batch.batch_id + ".html", ) ts_last_mod_0 = os.path.getmtime(validation_result_page_path) run_id = RunIdentifier(run_name="test_run_id_12346") operator_result = context.run_validation_operator( assets_to_validate=[batch], run_id=run_id, validation_operator_name="validate_and_store", ) validation_result_id = operator_result.list_validation_result_identifiers( )[0] res = site_builder.build(resource_identifiers=[validation_result_id]) index_links_dict = res[1] # verify that an additional validation result HTML file was generated assert len(index_links_dict["validations_links"]) == 2 site_builder.site_index_builder.target_store.store_backends[ ValidationResultIdentifier].full_base_directory # verify that the validation result HTML file rendered in the previous run was NOT updated ts_last_mod_1 = os.path.getmtime(validation_result_page_path) assert ts_last_mod_0 == ts_last_mod_1 # verify that the new method of the site builder that returns the URL of the HTML file that renders # a resource new_validation_result_page_path = os.path.join( site_builder.site_index_builder.target_store. store_backends[ValidationResultIdentifier].full_base_directory, "validations", expectation_suite_path_component, run_id.run_name, run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"), batch.batch_id + ".html", ) html_url = site_builder.get_resource_url( resource_identifier=validation_result_id) assert "file://" + new_validation_result_page_path == html_url html_url = site_builder.get_resource_url() assert ("file://" + os.path.join( site_builder.site_index_builder.target_store. 
store_backends[ValidationResultIdentifier].full_base_directory, "index.html", ) == html_url) team_site_config = data_docs_config["team_site"] team_site_builder = SiteBuilder( data_context=context, runtime_environment={"root_directory": context.root_directory}, **team_site_config) team_site_builder.clean_site() obs = [ url_dict for url_dict in context.get_docs_sites_urls(site_name="team_site") if url_dict.get("site_url") ] assert len(obs) == 0 # exercise clean_site site_builder.clean_site() obs = [ url_dict for url_dict in context.get_docs_sites_urls() if url_dict.get("site_url") ] assert len(obs) == 0 # restore site context = site_builder_data_context_with_html_store_titanic_random site_builder = SiteBuilder( data_context=context, runtime_environment={"root_directory": context.root_directory}, **local_site_config) res = site_builder.build()
def test_TupleGCSStoreBackend(): # pytest.importorskip("google-cloud-storage") """ What does this test test and why? Since no package like moto exists for GCP services, we mock the GCS client and assert that the store backend makes the right calls for set, get, and list. TODO : One option may be to have a GCS Store in Docker, which can be use to "actually" run these tests. """ bucket = "leakybucket" prefix = "this_is_a_test_prefix" project = "dummy-project" my_store = TupleGCSStoreBackend( filepath_template="my_file_{0}", bucket=bucket, prefix=prefix, project=project ) my_store_with_no_filepath_template = TupleGCSStoreBackend( filepath_template=None, bucket=bucket, prefix=prefix, project=project ) with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_client = mock_gcs_client.return_value mock_bucket = mock_client.get_bucket.return_value mock_blob = mock_bucket.blob.return_value my_store.set(("AAA",), "aaa", content_type="text/html") mock_gcs_client.assert_called_once_with("dummy-project") mock_client.get_bucket.assert_called_once_with("leakybucket") mock_bucket.blob.assert_called_once_with("this_is_a_test_prefix/my_file_AAA") mock_blob.upload_from_string.assert_called_once_with( b"aaa", content_type="text/html" ) with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_client = mock_gcs_client.return_value mock_bucket = mock_client.get_bucket.return_value mock_blob = mock_bucket.blob.return_value my_store_with_no_filepath_template.set( ("AAA",), b"aaa", content_encoding=None, content_type="image/png" ) mock_gcs_client.assert_called_once_with("dummy-project") mock_client.get_bucket.assert_called_once_with("leakybucket") mock_bucket.blob.assert_called_once_with("this_is_a_test_prefix/AAA") mock_blob.upload_from_string.assert_called_once_with( b"aaa", content_type="image/png" ) with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_client = mock_gcs_client.return_value mock_bucket = mock_client.get_bucket.return_value mock_blob = mock_bucket.get_blob.return_value mock_str = mock_blob.download_as_string.return_value my_store.get(("BBB",)) mock_gcs_client.assert_called_once_with("dummy-project") mock_client.get_bucket.assert_called_once_with("leakybucket") mock_bucket.get_blob.assert_called_once_with( "this_is_a_test_prefix/my_file_BBB" ) mock_blob.download_as_string.assert_called_once() mock_str.decode.assert_called_once_with("utf-8") with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_client = mock_gcs_client.return_value my_store.list_keys() mock_client.list_blobs.assert_called_once_with( "leakybucket", prefix="this_is_a_test_prefix" ) my_store.remove_key("leakybucket") from google.cloud.exceptions import NotFound try: mock_client.get_bucket.assert_called_once_with("leakybucket") except NotFound: pass with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client: mock_gcs_client.side_effect = InvalidKeyError("Hi I am an InvalidKeyError") with pytest.raises(InvalidKeyError): my_store.get(("non_existent_key",)) run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow()) key = ValidationResultIdentifier( ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"), run_id, "my_batch_id", ) run_time_string = run_id.to_tuple()[1] url = my_store_with_no_filepath_template.get_url_for_key(key.to_tuple()) assert ( url == "https://storage.googleapis.com/leakybucket" + f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id" )
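# The URL assertions above depend on how a ValidationResultIdentifier tuple is joined into
# an object key under the configured prefix: suite name, run_name, run_time string, and
# batch id each become a path segment. An illustration that mirrors the expected URLs in
# the tests; it does not reproduce the store backend's internal code.
import datetime

from great_expectations.core.run_identifier import RunIdentifier
from great_expectations.data_context.types.resource_identifiers import (
    ExpectationSuiteIdentifier,
    ValidationResultIdentifier,
)

run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
key = ValidationResultIdentifier(
    ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
    run_id,
    "my_batch_id",
)
# key.to_tuple() -> ("my_suite_name", "my_run_id", "<run_time string>", "my_batch_id")
object_key = "this_is_a_test_prefix/" + "/".join(key.to_tuple())
print(object_key)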
def validation_operator_run(name, run_name, validation_config_file, suite, directory): # Note though the long lines here aren't pythonic, they look best if Click does the line wraps. """ Run a validation operator against some data. There are two modes to run this command: 1. Interactive (good for development): Specify the name of the validation operator using the --name argument and the name of the expectation suite using the --suite argument. The cli will help you specify the batch of data that you want to validate interactively. 2. Non-interactive (good for production): Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch. Learn how to create a validation config file here: https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1. To learn more about validation operators, go here: https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators """ try: context = DataContext(directory) except ge_exceptions.ConfigNotFoundError as err: cli_message("Failed to process <red>{}</red>".format(err.message)) sys.exit(1) try: if validation_config_file is not None: try: with open(validation_config_file) as f: validation_config = json.load(f) except (IOError, json_parse_exception) as e: cli_message( f"Failed to process the --validation_config_file argument: <red>{e}</red>" ) send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) validation_config_error_message = _validate_valdiation_config( validation_config) if validation_config_error_message is not None: cli_message( "<red>The validation config in {0:s} is misconfigured: {1:s}</red>" .format(validation_config_file, validation_config_error_message)) send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) else: if suite is None: cli_message(""" Please use --suite argument to specify the name of the expectation suite. Call `great_expectation suite list` command to list the expectation suites in your project. """) send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(0) suite = toolkit.load_expectation_suite( context, suite, "cli.validation_operator.run") if name is None: cli_message(""" Please use --name argument to specify the name of the validation operator. Call `great_expectation validation-operator list` command to list the operators in your project. """) send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) else: if name not in context.list_validation_operator_names(): cli_message(f""" Could not find a validation operator {name}. Call `great_expectation validation-operator list` command to list the operators in your project. 
""") send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) batch_kwargs = None cli_message(""" Let us help you specify the batch of data your want the validation operator to validate.""" ) try: data_source = toolkit.select_datasource(context) except ValueError as ve: cli_message("<red>{}</red>".format(ve)) send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) if not data_source: cli_message("<red>No datasources found in the context.</red>") send_usage_message( data_context=context, event="cli.validation_operator.run", success=False, ) sys.exit(1) if batch_kwargs is None: ( datasource_name, batch_kwargs_generator, data_asset, batch_kwargs, ) = get_batch_kwargs( context, datasource_name=data_source.name, batch_kwargs_generator_name=None, data_asset_name=None, additional_batch_kwargs=None, ) validation_config = { "validation_operator_name": name, "batches": [{ "batch_kwargs": batch_kwargs, "expectation_suite_names": [suite.expectation_suite_name], }], } try: validation_operator_name = validation_config[ "validation_operator_name"] batches_to_validate = [] for entry in validation_config["batches"]: for expectation_suite_name in entry["expectation_suite_names"]: batch = context.get_batch(entry["batch_kwargs"], expectation_suite_name) batches_to_validate.append(batch) if run_name is None: run_name = datetime.datetime.now( datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ") run_id = RunIdentifier(run_name=run_name) if suite is None: results = context.run_validation_operator( validation_operator_name, assets_to_validate=batches_to_validate, run_id=run_id, ) else: if suite.evaluation_parameters is None: results = context.run_validation_operator( validation_operator_name, assets_to_validate=batches_to_validate, run_id=run_id, ) else: results = context.run_validation_operator( validation_operator_name, assets_to_validate=batches_to_validate, run_id=run_id, evaluation_parameters=suite.evaluation_parameters, ) except ( ge_exceptions.DataContextError, IOError, SQLAlchemyError, ) as e: cli_message("<red>{}</red>".format(e)) send_usage_message(data_context=context, event="cli.validation_operator.run", success=False) sys.exit(1) if not results["success"]: cli_message("Validation Failed!") send_usage_message(data_context=context, event="cli.validation_operator.run", success=True) sys.exit(1) else: cli_message("Validation Succeeded!") send_usage_message(data_context=context, event="cli.validation_operator.run", success=True) sys.exit(0) except Exception as e: send_usage_message(data_context=context, event="cli.validation_operator.run", success=False) raise e
def test_configuration_driven_site_builder_skip_and_clean_missing( site_builder_data_context_with_html_store_titanic_random, ): # tests auto-cleaning functionality of DefaultSiteIndexBuilder # when index page is built, if an HTML page is present without corresponding suite or validation result, # the HTML page should be removed and not appear on index page context = site_builder_data_context_with_html_store_titanic_random context.add_validation_operator( "validate_and_store", { "class_name": "ActionListValidationOperator", "action_list": [ { "name": "store_validation_result", "action": { "class_name": "StoreValidationResultAction", "target_store_name": "validations_store", }, }, { "name": "extract_and_store_eval_parameters", "action": { "class_name": "StoreEvaluationParametersAction", "target_store_name": "evaluation_parameter_store", }, }, ], }, ) # profiling the Titanic datasource will generate one expectation suite and one validation # that is a profiling result datasource_name = "titanic" data_asset_name = "Titanic" profiler_name = "BasicDatasetProfiler" generator_name = "subdir_reader" context.profile_datasource(datasource_name) # creating another validation result using the profiler's suite (no need to use a new expectation suite # for this test). having two validation results - one with run id "profiling" - allows us to test # the logic of run_name_filter that helps filtering validation results to be included in # the profiling and the validation sections. batch_kwargs = context.build_batch_kwargs( datasource=datasource_name, batch_kwargs_generator=generator_name, data_asset_name=data_asset_name, ) expectation_suite_name = "{}.{}.{}.{}".format(datasource_name, generator_name, data_asset_name, profiler_name) batch = context.get_batch( batch_kwargs=batch_kwargs, expectation_suite_name=expectation_suite_name, ) run_id = RunIdentifier(run_name="test_run_id_12345") context.run_validation_operator( assets_to_validate=[batch], run_id=run_id, validation_operator_name="validate_and_store", ) data_docs_config = context._project_config.data_docs_sites local_site_config = data_docs_config["local_site"] validations_set = set(context.stores["validations_store"].list_keys()) assert len(validations_set) == 6 expectation_suite_set = set( context.stores["expectations_store"].list_keys()) assert len(expectation_suite_set) == 5 site_builder = SiteBuilder( data_context=context, runtime_environment={"root_directory": context.root_directory}, **local_site_config) site_builder.build() # test expectation suite pages expectation_suite_html_pages = { ExpectationSuiteIdentifier.from_tuple(suite_tuple) for suite_tuple in site_builder.target_store. store_backends[ExpectationSuiteIdentifier].list_keys() } # suites in expectations store should match html pages assert expectation_suite_set == expectation_suite_html_pages # remove suites from expectations store for i in range(2): context.stores["expectations_store"].remove_key( list(expectation_suite_set)[i]) # re-build data docs, which should remove suite HTML pages that no longer have corresponding suite in # expectations store site_builder.build() expectation_suite_set = set( context.stores["expectations_store"].list_keys()) expectation_suite_html_pages = { ExpectationSuiteIdentifier.from_tuple(suite_tuple) for suite_tuple in site_builder.target_store. 
store_backends[ExpectationSuiteIdentifier].list_keys() } assert expectation_suite_set == expectation_suite_html_pages # test validation result pages validation_html_pages = { ValidationResultIdentifier.from_tuple(result_tuple) for result_tuple in site_builder.target_store. store_backends[ValidationResultIdentifier].list_keys() } # validations in store should match html pages assert validations_set == validation_html_pages # remove validations from store for i in range(2): context.stores["validations_store"].store_backend.remove_key( list(validations_set)[i]) # re-build data docs, which should remove validation HTML pages that no longer have corresponding validation in # validations store site_builder.build() validations_set = set(context.stores["validations_store"].list_keys()) validation_html_pages = { ValidationResultIdentifier.from_tuple(result_tuple) for result_tuple in site_builder.target_store. store_backends[ValidationResultIdentifier].list_keys() } assert validations_set == validation_html_pages
def test_resource_key_passes_run_name_filter(): resource_key = ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier("test_suite"), run_id=RunIdentifier(run_name="foofooprofilingfoo"), batch_identifier="f14c3d2f6e8028c2db0c25edabdb0d61", ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"equals": "profiling"} ) is False ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"equals": "foofooprofilingfoo"} ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"not_equals": "profiling"} ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"not_equals": "foofooprofilingfoo"} ) is False ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"includes": "profiling"} ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"includes": "foobar"} ) is False ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"not_includes": "foobar"} ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"not_includes": "profiling"} ) is False ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"matches_regex": "(foo){2}profiling(" "foo)+"}, ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"matches_regex": "(foo){3}profiling(" "foo)+"}, ) is False ) with pytest.warns(DeprecationWarning): assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"eq": "profiling"} ) is False ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"eq": "foofooprofilingfoo"} ) is True ) with pytest.warns(DeprecationWarning): assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"ne": "profiling"} ) is True ) assert ( resource_key_passes_run_name_filter( resource_key, run_name_filter={"ne": "foofooprofilingfoo"} ) is False )
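# The test above pins down the filter semantics. For illustration only (this is not the
# Great Expectations implementation), the supported keys behave roughly as follows, with
# the deprecated "eq"/"ne" aliases mapping to "equals"/"not_equals":
import re


def run_name_matches(run_name: str, run_name_filter: dict) -> bool:
    """Illustrative restatement of the run_name_filter semantics exercised above."""
    if "equals" in run_name_filter or "eq" in run_name_filter:
        target = run_name_filter.get("equals", run_name_filter.get("eq"))
        return run_name == target
    if "not_equals" in run_name_filter or "ne" in run_name_filter:
        target = run_name_filter.get("not_equals", run_name_filter.get("ne"))
        return run_name != target
    if "includes" in run_name_filter:
        return run_name_filter["includes"] in run_name
    if "not_includes" in run_name_filter:
        return run_name_filter["not_includes"] not in run_name
    if "matches_regex" in run_name_filter:
        return re.search(run_name_filter["matches_regex"], run_name) is not None
    return False


assert run_name_matches("foofooprofilingfoo", {"includes": "profiling"}) is True
assert run_name_matches("foofooprofilingfoo", {"equals": "profiling"}) is False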
def from_fixed_length_tuple(cls, tuple_):
    return cls(
        ExpectationSuiteIdentifier(tuple_[0]),
        RunIdentifier.from_tuple((tuple_[1], tuple_[2])),
        tuple_[3],
    )
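# The constructor above implies a four-element fixed-length layout:
#   (expectation_suite_name, run_name, run_time, batch_identifier)
# A round-trip sketch, assuming the matching to_fixed_length_tuple() counterpart exists
# in this version (the other snippets suggest it does).
from great_expectations.core.run_identifier import RunIdentifier
from great_expectations.data_context.types.resource_identifiers import (
    ExpectationSuiteIdentifier,
    ValidationResultIdentifier,
)

vri = ValidationResultIdentifier(
    expectation_suite_identifier=ExpectationSuiteIdentifier("asset.default"),
    run_id=RunIdentifier(run_name="test_100"),
    batch_identifier="1234",
)
restored = ValidationResultIdentifier.from_fixed_length_tuple(vri.to_fixed_length_tuple())
assert restored.batch_identifier == "1234"
assert restored.run_id.run_name == "test_100"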
def run( self, assets_to_validate, run_id=None, base_expectation_suite_name=None, evaluation_parameters=None, run_name=None, run_time=None, result_format=None, ): assert not (run_id and run_name) and not ( run_id and run_time ), "Please provide either a run_id or run_name and/or run_time." if isinstance(run_id, str) and not run_name: warnings.warn( "String run_ids will be deprecated in the future. Please provide a run_id of type " "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name " "and run_time (both optional). Instead of providing a run_id, you may also provide" "run_name and run_time separately.", DeprecationWarning, ) try: run_time = parse(run_id) except (ParserError, TypeError): pass run_id = RunIdentifier(run_name=run_id, run_time=run_time) elif isinstance(run_id, dict): run_id = RunIdentifier(**run_id) elif not isinstance(run_id, RunIdentifier): run_id = RunIdentifier(run_name=run_name, run_time=run_time) if base_expectation_suite_name is None: if self.base_expectation_suite_name is None: raise ValueError( "base_expectation_suite_name must be configured in the validation operator or passed at runtime" ) base_expectation_suite_name = self.base_expectation_suite_name run_results = {} for item in assets_to_validate: batch = self._build_batch_from_item(item) batch_id = batch.batch_id run_id = run_id assert not batch_id is None assert not run_id is None failure_expectation_suite_identifier = ExpectationSuiteIdentifier( expectation_suite_name=base_expectation_suite_name + self.expectation_suite_name_suffixes[0] ) failure_validation_result_id = ValidationResultIdentifier( expectation_suite_identifier=failure_expectation_suite_identifier, run_id=run_id, batch_identifier=batch_id, ) failure_expectation_suite = None try: failure_expectation_suite = self.data_context.stores[ self.data_context.expectations_store_name ].get(failure_expectation_suite_identifier) # NOTE : Abe 2019/09/17 : I'm concerned that this may be too permissive, since # it will catch any error in the Store, not just KeyErrors. In the longer term, a better # solution will be to have the Stores catch other known errors and raise KeyErrors, # so that methods like this can catch and handle a single error type. 
except Exception: logger.debug( "Failure expectation suite not found: {}".format( failure_expectation_suite_identifier ) ) if failure_expectation_suite: failure_run_result_obj = {"expectation_suite_severity_level": "failure"} failure_validation_result = batch.validate( failure_expectation_suite, result_format=result_format if result_format else self.result_format, evaluation_parameters=evaluation_parameters, ) failure_run_result_obj["validation_result"] = failure_validation_result failure_actions_results = self._run_actions( batch, failure_expectation_suite_identifier, failure_expectation_suite, failure_validation_result, run_id, ) failure_run_result_obj["actions_results"] = failure_actions_results run_results[failure_validation_result_id] = failure_run_result_obj if not failure_validation_result.success and self.stop_on_first_error: break warning_expectation_suite_identifier = ExpectationSuiteIdentifier( expectation_suite_name=base_expectation_suite_name + self.expectation_suite_name_suffixes[1] ) warning_validation_result_id = ValidationResultIdentifier( expectation_suite_identifier=warning_expectation_suite_identifier, run_id=run_id, batch_identifier=batch.batch_id, ) warning_expectation_suite = None try: warning_expectation_suite = self.data_context.stores[ self.data_context.expectations_store_name ].get(warning_expectation_suite_identifier) except Exception: logger.debug( "Warning expectation suite not found: {}".format( warning_expectation_suite_identifier ) ) if warning_expectation_suite: warning_run_result_obj = {"expectation_suite_severity_level": "warning"} warning_validation_result = batch.validate( warning_expectation_suite, result_format=result_format if result_format else self.result_format, evaluation_parameters=evaluation_parameters, ) warning_run_result_obj["validation_result"] = warning_validation_result warning_actions_results = self._run_actions( batch, warning_expectation_suite_identifier, warning_expectation_suite, warning_validation_result, run_id, ) warning_run_result_obj["actions_results"] = warning_actions_results run_results[warning_validation_result_id] = warning_run_result_obj validation_operator_result = ValidationOperatorResult( run_id=run_id, run_results=run_results, validation_operator_config=self.validation_operator_config, evaluation_parameters=evaluation_parameters, success=all( [ run_result_obj["validation_result"].success for run_result_obj in run_results.values() ] ), ) if self.slack_webhook: if ( self.notify_on == "all" or self.notify_on == "success" and validation_operator_result.success or self.notify_on == "failure" and not validation_operator_result.success ): slack_query = self._build_slack_query( validation_operator_result=validation_operator_result ) send_slack_notification( query=slack_query, slack_webhook=self.slack_webhook ) return validation_operator_result
def run( self, template_name: Optional[str] = None, run_name_template: Optional[str] = None, expectation_suite_name: Optional[str] = None, batch_request: Optional[Union[BatchRequestBase, dict]] = None, action_list: Optional[List[dict]] = None, evaluation_parameters: Optional[dict] = None, runtime_configuration: Optional[dict] = None, validations: Optional[List[dict]] = None, profilers: Optional[List[dict]] = None, run_id: Optional[Union[str, RunIdentifier]] = None, run_name: Optional[str] = None, run_time: Optional[Union[str, datetime.datetime]] = None, result_format: Optional[Union[str, dict]] = None, expectation_suite_ge_cloud_id: Optional[str] = None, ) -> CheckpointResult: assert not (run_id and run_name) and not ( run_id and run_time ), "Please provide either a run_id or run_name and/or run_time." run_time = run_time or datetime.datetime.now() runtime_configuration = runtime_configuration or {} result_format = result_format or runtime_configuration.get( "result_format") batch_request = get_batch_request_as_dict(batch_request=batch_request) validations = get_validations_with_batch_request_as_dict( validations=validations) runtime_kwargs: dict = { "template_name": template_name, "run_name_template": run_name_template, "expectation_suite_name": expectation_suite_name, "batch_request": batch_request or {}, "action_list": action_list or [], "evaluation_parameters": evaluation_parameters or {}, "runtime_configuration": runtime_configuration or {}, "validations": validations or [], "profilers": profilers or [], "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id, } substituted_runtime_config: dict = self.get_substituted_config( runtime_kwargs=runtime_kwargs) run_name_template = substituted_runtime_config.get("run_name_template") batch_request = substituted_runtime_config.get("batch_request") validations = substituted_runtime_config.get("validations") or [] if len(validations) == 0 and not batch_request: raise ge_exceptions.CheckpointError( f'Checkpoint "{self.name}" must contain either a batch_request or validations.' ) if run_name is None and run_name_template is not None: run_name = get_datetime_string_from_strftime_format( format_str=run_name_template, datetime_obj=run_time) run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time) # Use AsyncExecutor to speed up I/O bound validations by running them in parallel with multithreading (if # concurrency is enabled in the data context configuration) -- please see the below arguments used to initialize # AsyncExecutor and the corresponding AsyncExecutor docstring for more details on when multiple threads are # used. 
with AsyncExecutor(self.data_context.concurrency, max_workers=len(validations)) as async_executor: # noinspection PyUnresolvedReferences async_validation_operator_results: List[ AsyncResult[ValidationOperatorResult]] = [] if len(validations) > 0: for idx, validation_dict in enumerate(validations): self._run_validation( substituted_runtime_config=substituted_runtime_config, async_validation_operator_results= async_validation_operator_results, async_executor=async_executor, result_format=result_format, run_id=run_id, idx=idx, validation_dict=validation_dict, ) else: self._run_validation( substituted_runtime_config=substituted_runtime_config, async_validation_operator_results= async_validation_operator_results, async_executor=async_executor, result_format=result_format, run_id=run_id, ) run_results: dict = {} for async_validation_operator_result in async_validation_operator_results: run_results.update( async_validation_operator_result.result().run_results) return CheckpointResult( run_id=run_id, run_results=run_results, checkpoint_config=self.config, )
def test_run_identifier_parses_datetime_run_name():
    time = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    run_id = RunIdentifier(run_name=time)
    assert run_id.run_name == run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ")
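# The test above relies on RunIdentifier inferring run_time from a timestamp-shaped
# run_name. A small demonstration with a fixed timestamp; the exact parsing rules are
# those of the installed Great Expectations version, and the import path is an assumption.
from great_expectations.core.run_identifier import RunIdentifier

run_id = RunIdentifier(run_name="20190926T134241.000000Z")
assert run_id.run_time is not None
assert run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ") == "20190926T134241.000000Z"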
def run( self, checkpoint_name: str = None, ge_checkpoint: Checkpoint = None, checkpoint_kwargs: dict = None, context: ge.DataContext = None, assets_to_validate: list = None, batch_kwargs: dict = None, expectation_suite_name: str = None, context_root_dir: str = None, runtime_environment: Optional[dict] = None, run_name: str = None, run_info_at_end: bool = True, disable_markdown_artifact: bool = False, validation_operator: str = "action_list_operator", evaluation_parameters: Optional[dict] = None, ): """ Task run method. Args: - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the filename of the checkpoint without the extension. Either checkpoint_name or checkpoint_config is required when using the Great Expectations v3 API. - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform validation. If not provided then `checkpoint_name` will be used to load the specified checkpoint. - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of `CheckpointConfig` which can be used to update and populate the task's Checkpoint at runtime. - context (DataContext, optional): an in-memory GE `DataContext` object. e.g. `ge.data_context.DataContext()` If not provided then `context_root_dir` will be used to look for one. - assets_to_validate (list, optional): A list of assets to validate when running the validation operator. Only used in the Great Expectations v2 API - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating assets. Only used in the Great Expectations v2 API - expectation_suite_name (str, optional): the name of an expectation suite to be used when validating assets. Only used in the Great Expectations v2 API - context_root_dir (str, optional): the absolute or relative path to the directory holding your `great_expectations.yml` - runtime_environment (dict, optional): a dictionary of great expectation config key-value pairs to overwrite your config in `great_expectations.yml` - run_name (str, optional): the name of this Great Expectation validation run; defaults to the task slug - run_info_at_end (bool, optional): add run info to the end of the artifact generated by this task. Defaults to `True`. - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from this tasks. Defaults to `False`. - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when running validation. For more information, see [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation) and [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html). - validation_operator (str, optional): configure the actions to be executed after running validation. Defaults to `action_list_operator`. Raises: - 'signals.FAIL' if the validation was not a success Returns: - result ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'): The Great Expectations metadata returned from the validation if the v2 (batch_kwargs) API is used. ('great_expectations.checkpoint.checkpoint.CheckpointResult'): The Great Expectations metadata returned from running the provided checkpoint if a checkpoint name is provided. """ if version.parse(ge.__version__) < version.parse("0.13.8"): self.logger.warning( f"You are using great_expectations version {ge.__version__} which may cause" "errors in this task. 
Please upgrade great_expections to 0.13.8 or later." ) runtime_environment = runtime_environment or dict() checkpoint_kwargs = checkpoint_kwargs or dict() # Load context if not provided directly if not context: context = ge.DataContext( context_root_dir=context_root_dir, runtime_environment=runtime_environment, ) # Check that the parameters are mutually exclusive if (sum( bool(x) for x in [ (expectation_suite_name and batch_kwargs), assets_to_validate, checkpoint_name, ge_checkpoint, ]) != 1): raise ValueError( "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, " "checkpoint_name, or ge_checkpoint is required to run validation." ) results = None # If there is a checkpoint or checkpoint name provided, run the checkpoint. # Checkpoints are the preferred deployment of validation configuration. if ge_checkpoint or checkpoint_name: ge_checkpoint = ge_checkpoint or context.get_checkpoint( checkpoint_name) results = ge_checkpoint.run( evaluation_parameters=evaluation_parameters, run_id=RunIdentifier(run_name or prefect.context.get("task_slug")), **checkpoint_kwargs, ) else: # If assets are not provided directly through `assets_to_validate` then they need be loaded # get batch from `batch_kwargs` and `expectation_suite_name` if not assets_to_validate: assets_to_validate = [ context.get_batch(batch_kwargs, expectation_suite_name) ] # Run validation operator results = context.run_validation_operator( validation_operator, assets_to_validate=assets_to_validate, run_id=RunIdentifier(run_name or prefect.context.get("task_slug")), evaluation_parameters=evaluation_parameters, ) # Generate artifact markdown if not disable_markdown_artifact: validation_results_page_renderer = ( ge.render.renderer.ValidationResultsPageRenderer( run_info_at_end=run_info_at_end)) rendered_content_list = validation_results_page_renderer.render_validation_operator_result( # This also works with a CheckpointResult because of duck typing. # The passed in object needs a list_validation_results method that # returns a list of ExpectationSuiteValidationResult. validation_operator_result=results) markdown_artifact = " ".join( ge.render.view.DefaultMarkdownPageView().render( rendered_content_list)) create_markdown_artifact(markdown_artifact) if results.success is False: raise signals.FAIL(result=results) return results
def test_StoreMetricsAction( basic_in_memory_data_context_for_validation_operator): action = StoreMetricsAction( data_context=basic_in_memory_data_context_for_validation_operator, requested_metrics={ "*": [ "statistics.evaluated_expectations", "statistics.successful_expectations", ] }, target_store_name="metrics_store", ) run_id = RunIdentifier(run_name="bar") validation_result = ExpectationSuiteValidationResult( success=False, meta={ "expectation_suite_name": "foo", "run_id": run_id }, statistics={ "evaluated_expectations": 5, "successful_expectations": 3 }, ) # Run the action and store our metrics action.run( validation_result, ValidationResultIdentifier.from_object(validation_result), data_asset=None, ) validation_result = ExpectationSuiteValidationResult( success=False, meta={ "expectation_suite_name": "foo.warning", "run_id": run_id }, statistics={ "evaluated_expectations": 8, "successful_expectations": 4 }, ) action.run( validation_result, ValidationResultIdentifier.from_object(validation_result), data_asset=None, ) assert ( basic_in_memory_data_context_for_validation_operator. stores["metrics_store"].get( ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier=ExpectationSuiteIdentifier("foo"), metric_name="statistics.evaluated_expectations", metric_kwargs_id=None, )) == 5) assert ( basic_in_memory_data_context_for_validation_operator. stores["metrics_store"].get( ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier=ExpectationSuiteIdentifier("foo"), metric_name="statistics.successful_expectations", metric_kwargs_id=None, )) == 3) assert (basic_in_memory_data_context_for_validation_operator. stores["metrics_store"].get( ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier=ExpectationSuiteIdentifier( "foo.warning"), metric_name="statistics.evaluated_expectations", metric_kwargs_id=None, )) == 8) assert (basic_in_memory_data_context_for_validation_operator. stores["metrics_store"].get( ValidationMetricIdentifier( run_id=run_id, data_asset_name=None, expectation_suite_identifier=ExpectationSuiteIdentifier( "foo.warning"), metric_name="statistics.successful_expectations", metric_kwargs_id=None, )) == 4)
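# The requested_metrics mappings used in the two StoreMetricsAction tests above combine a
# "*" wildcard, plain statistics paths, and column-scoped expectation result paths. A hedged
# configuration sketch expressed as the Python dict the action receives; keying by a specific
# suite name (here "foo.warning") alongside "*" is an assumption, and the column name
# "provider_id" is taken from the earlier test.
requested_metrics = {
    "*": [  # applied to every expectation suite
        "statistics.evaluated_expectations",
        "statistics.successful_expectations",
    ],
    "foo.warning": [  # assumed per-suite override, applied only to this suite
        {
            "column": {
                "provider_id": [
                    "expect_column_values_to_be_unique.result.unexpected_count"
                ]
            }
        }
    ],
}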
def test_evaluation_parameter_store_methods( data_context_parameterized_expectation_suite, ): run_id = RunIdentifier(run_name="20191125T000000.000000Z") source_patient_data_results = ExpectationSuiteValidationResult( meta={ "expectation_suite_name": "source_patient_data.default", "run_id": run_id, }, results=[ ExpectationValidationResult( expectation_config=ExpectationConfiguration( expectation_type="expect_table_row_count_to_equal", kwargs={ "value": 1024, }, ), success=True, exception_info={ "exception_message": None, "exception_traceback": None, "raised_exception": False, }, result={ "observed_value": 1024, "element_count": 1024, "missing_percent": 0.0, "missing_count": 0, }, ) ], success=True, ) data_context_parameterized_expectation_suite.store_evaluation_parameters( source_patient_data_results) bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params( run_id) assert bound_parameters == { "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result" ".observed_value": 1024 } source_diabetes_data_results = ExpectationSuiteValidationResult( meta={ "expectation_suite_name": "source_diabetes_data.default", "run_id": run_id, }, results=[ ExpectationValidationResult( expectation_config=ExpectationConfiguration( expectation_type= "expect_column_unique_value_count_to_be_between", kwargs={ "column": "patient_nbr", "min": 2048, "max": 2048 }, ), success=True, exception_info={ "exception_message": None, "exception_traceback": None, "raised_exception": False, }, result={ "observed_value": 2048, "element_count": 5000, "missing_percent": 0.0, "missing_count": 0, }, ) ], success=True, ) data_context_parameterized_expectation_suite.store_evaluation_parameters( source_diabetes_data_results) bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params( run_id) assert bound_parameters == { "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result" ".observed_value": 1024, "urn:great_expectations:validations:source_diabetes_data.default" ":expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr": 2048, }
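# The bound-parameter URNs asserted above are what downstream suites reference through
# $PARAMETER-style evaluation parameters. A hedged sketch of an expectation configuration
# that consumes one of the stored metrics; the suite name comes from the test above.
from great_expectations.core import ExpectationConfiguration

downstream_expectation = ExpectationConfiguration(
    expectation_type="expect_table_row_count_to_equal",
    kwargs={
        "value": {
            "$PARAMETER": "urn:great_expectations:validations:source_patient_data.default:"
            "expect_table_row_count_to_equal.result.observed_value"
        }
    },
)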
def run(
    self,
    assets_to_validate,
    run_id=None,
    evaluation_parameters=None,
    run_name=None,
    run_time=None,
    result_format=None,
):
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."
    if isinstance(run_id, str) and not run_name:
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ParserError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    run_results = {}

    for item in assets_to_validate:
        run_result_obj = {}
        batch = self._build_batch_from_item(item)

        expectation_suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=batch._expectation_suite.expectation_suite_name
        )
        validation_result_id = ValidationResultIdentifier(
            batch_identifier=batch.batch_id,
            expectation_suite_identifier=expectation_suite_identifier,
            run_id=run_id,
        )
        batch_validation_result = batch.validate(
            run_id=run_id,
            result_format=result_format if result_format else self.result_format,
            evaluation_parameters=evaluation_parameters,
        )
        run_result_obj["validation_result"] = batch_validation_result
        batch_actions_results = self._run_actions(
            batch,
            expectation_suite_identifier,
            batch._expectation_suite,
            batch_validation_result,
            run_id,
        )

        run_result_obj["actions_results"] = batch_actions_results
        run_results[validation_result_id] = run_result_obj

    return ValidationOperatorResult(
        run_id=run_id,
        run_results=run_results,
        validation_operator_config=self.validation_operator_config,
        evaluation_parameters=evaluation_parameters,
    )
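# A hedged usage sketch (setup is assumed, not shown in the source): run() accepts a list of assets
# to validate (batches, or items that _build_batch_from_item can turn into batches), resolves a
# RunIdentifier from run_name/run_time when no explicit run_id is given, runs actions for each
# batch, and returns a ValidationOperatorResult keyed by ValidationResultIdentifier.
#
# import datetime
#
# results = validation_operator.run(
#     assets_to_validate=[batch],                    # "batch" is a hypothetical, already-loaded batch
#     run_name="nightly_validation",                 # hypothetical run name
#     run_time=datetime.datetime.now(datetime.timezone.utc),
# )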
def resolve_config_using_acceptable_arguments(
    checkpoint: "Checkpoint",  # noqa: F821
    template_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    run_id: Optional[Union[str, RunIdentifier]] = None,
    run_name: Optional[str] = None,
    run_time: Optional[Union[str, datetime.datetime]] = None,
    result_format: Optional[Union[str, dict]] = None,
    expectation_suite_ge_cloud_id: Optional[str] = None,
) -> dict:
    """
    This method reconciles the Checkpoint configuration (e.g., obtained from the Checkpoint store) with
    dynamically supplied arguments in order to obtain the Checkpoint specification that is ready to be run.

    This procedure is necessitated by the fact that the Checkpoint configuration is hierarchical in its form,
    a structure chosen to make it easy to specify the different Checkpoint capabilities. In particular,
    entities such as BatchRequest, expectation_suite_name, and action_list can be specified at the top
    Checkpoint level, with suitable overrides provided at lower levels (e.g., in the validations section).
    Reconciling and normalizing the Checkpoint configuration is essential for usage statistics, because the
    exact values of the entities in their formally validated form (e.g., BatchRequest) are the required
    level of detail.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."
    run_time = run_time or datetime.datetime.now()
    runtime_configuration = runtime_configuration or {}

    batch_request = get_batch_request_as_dict(batch_request=batch_request)
    validations = get_validations_with_batch_request_as_dict(
        validations=validations)

    runtime_kwargs: dict = {
        "template_name": template_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request,
        "action_list": action_list,
        "evaluation_parameters": evaluation_parameters,
        "runtime_configuration": runtime_configuration,
        "validations": validations,
        "profilers": profilers,
        "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
    }
    substituted_runtime_config: dict = checkpoint.get_substituted_config(
        runtime_kwargs=runtime_kwargs)
    run_name_template = substituted_runtime_config.get("run_name_template")
    validations = substituted_runtime_config.get("validations") or []
    batch_request = substituted_runtime_config.get("batch_request")
    if len(validations) == 0 and not batch_request:
        raise ge_exceptions.CheckpointError(
            f'Checkpoint "{checkpoint.name}" must contain either a batch_request or validations.'
) if run_name is None and run_name_template is not None: run_name = get_datetime_string_from_strftime_format( format_str=run_name_template, datetime_obj=run_time) run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time) validation_dict: dict for validation_dict in validations: substituted_validation_dict: dict = get_substituted_validation_dict( substituted_runtime_config=substituted_runtime_config, validation_dict=validation_dict, ) validation_batch_request: Union[ BatchRequest, RuntimeBatchRequest] = substituted_validation_dict.get( "batch_request") validation_dict["batch_request"] = validation_batch_request validation_expectation_suite_name: str = substituted_validation_dict.get( "expectation_suite_name") validation_dict[ "expectation_suite_name"] = validation_expectation_suite_name validation_expectation_suite_ge_cloud_id: str = ( substituted_validation_dict.get( "expectation_suite_ge_cloud_id")) validation_dict[ "expectation_suite_ge_cloud_id"] = validation_expectation_suite_ge_cloud_id validation_action_list: list = substituted_validation_dict.get( "action_list") validation_dict["action_list"] = validation_action_list return substituted_runtime_config
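# A minimal sketch (not from the source) of the run-identifier reconciliation enforced by the
# assertion in resolve_config_using_acceptable_arguments: callers supply either an explicit run_id
# or a run_name/run_time pair, never both, and a missing run_time defaults to "now". The import
# path for RunIdentifier and the helper name resolve_run_id_sketch are assumptions.
import datetime

from great_expectations.core.run_identifier import RunIdentifier


def resolve_run_id_sketch(run_id=None, run_name=None, run_time=None):
    # Mirror the mutual-exclusion rule asserted above.
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."
    if isinstance(run_id, RunIdentifier):
        return run_id
    # Fall back to run_name/run_time, defaulting run_time to the current timestamp.
    run_time = run_time or datetime.datetime.now()
    return RunIdentifier(run_name=run_name, run_time=run_time)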
def test_MicrosoftTeams_validation_results_with_datadocs(): validation_result_suite = ExpectationSuiteValidationResult( results=[], success=True, statistics={ "evaluated_expectations": 0, "successful_expectations": 0, "unsuccessful_expectations": 0, "success_percent": None, }, meta={ "great_expectations_version": "v0.8.0__develop", "expectation_suite_name": "asset.default", "run_id": "test_100", }, ) validation_result_suite_identifier = ValidationResultIdentifier( expectation_suite_identifier=ExpectationSuiteIdentifier( "asset.default"), run_id=RunIdentifier(run_name="test_100", run_time="Tue May 08 15:14:45 +0800 2012"), batch_identifier=BatchIdentifier(batch_identifier="1234", data_asset_name="asset"), ) data_docs_pages = {"local_site": "file:///localsite/index.html"} rendered_output = MicrosoftTeamsRenderer().render( validation_result_suite, validation_result_suite_identifier, data_docs_pages) expected_output = { "attachments": [{ "content": { "$schema": "http://adaptivecards.io/schemas/adaptive-card.json", "actions": [{ "title": "Open data docs", "type": "Action.OpenUrl", "url": "file:///localsite/index.html", }], "body": [ { "height": "auto", "items": [{ "columns": [{ "items": [ { "size": "large", "text": "Validation " "results", "type": "TextBlock", "weight": "bolder", "wrap": True, }, { "isSubtle": True, "spacing": "none", "text": "May " "08 " "2012 " "07:14:45", "type": "TextBlock", "wrap": True, }, ], "type": "Column", "width": "stretch", }], "type": "ColumnSet", }], "separator": True, "type": "Container", }, { "height": "auto", "items": [ { "color": "good", "horizontalAlignment": "left", "text": "**Batch validation " "status:** Success " "!!!", "type": "TextBlock", }, { "horizontalAlignment": "left", "text": "**Data asset " "name:** asset", "type": "TextBlock", }, { "horizontalAlignment": "left", "text": "**Expectation " "suite name:** " "asset.default", "type": "TextBlock", }, { "horizontalAlignment": "left", "text": "**Run name:** " "test_100", "type": "TextBlock", }, { "horizontalAlignment": "left", "text": "**Batch ID:** 1234", "type": "TextBlock", }, { "horizontalAlignment": "left", "text": "**Summary:** *0* " "of *0* " "expectations were " "met", "type": "TextBlock", }, ], "separator": True, "type": "Container", }, ], "type": "AdaptiveCard", "version": "1.0", }, "contentType": "application/vnd.microsoft.card.adaptive", }], "type": "message", } assert rendered_output == expected_output
def from_tuple(cls, tuple_): return cls( ExpectationSuiteIdentifier.from_tuple(tuple_[0:-3]), RunIdentifier.from_tuple((tuple_[-3], tuple_[-2])), tuple_[-1], )
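# Hedged illustration (not part of the source): from_tuple above assumes the flattened key is laid
# out as (*expectation_suite_identifier_parts, run_name, run_time, batch_identifier). The last three
# elements are peeled off for the RunIdentifier and batch_identifier, and everything before them is
# handed to ExpectationSuiteIdentifier.from_tuple. For example, a hypothetical tuple
#   ("asset", "default", "profiling", "20191125T000000.000000Z", "batch_1234")
# would rebuild the run as RunIdentifier.from_tuple(("profiling", "20191125T000000.000000Z")) and
# use "batch_1234" as the batch_identifier.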