def __init__(self, expectation_suite_identifier, run_id, batch_identifier):
    """Constructs a ValidationResultIdentifier

    Args:
        expectation_suite_identifier (ExpectationSuiteIdentifier, list, tuple, or dict): identifying
            information for the fully qualified expectation suite used to validate
        run_id (RunIdentifier): The run_id for which validation occurred
        batch_identifier (str): identifier for the batch that was validated
    """
    super().__init__()
    self._expectation_suite_identifier = expectation_suite_identifier
    # Legacy support: a plain-string run_id is coerced into a RunIdentifier.
    if isinstance(run_id, str):
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional).",
            DeprecationWarning,
        )
        # If the string parses as a datetime, reuse it as the run_time as well;
        # otherwise only the run_name carries the original string.
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            run_time = None
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        # Dict form: forwarded as keyword arguments (run_name/run_time expected).
        run_id = RunIdentifier(**run_id)
    elif run_id is None:
        run_id = RunIdentifier()
    elif not isinstance(run_id, RunIdentifier):
        # Any other object: stringified and used as the run_name.
        run_id = RunIdentifier(run_name=str(run_id))
    self._run_id = run_id
    self._batch_identifier = batch_identifier
def test_StoreAction():
    """StoreValidationResultAction writes a validation result into its target store
    under a key derived from the suite name, run_id and batch identifier."""
    fake_in_memory_store = ValidationsStore(
        store_backend={
            "class_name": "InMemoryStoreBackend",
        }
    )
    stores = {"fake_in_memory_store": fake_in_memory_store}

    # Minimal stand-in for a DataContext: the action only reads .stores and
    # .ge_cloud_mode from it.
    class Object:
        ge_cloud_mode = False

    data_context = Object()
    data_context.stores = stores

    action = StoreValidationResultAction(
        data_context=data_context,
        target_store_name="fake_in_memory_store",
    )
    # Store starts empty.
    assert fake_in_memory_store.list_keys() == []

    action.run(
        validation_result_suite_identifier=ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id=RunIdentifier(run_name="prod_20190801"),
            batch_identifier="1234",
        ),
        validation_result_suite=ExpectationSuiteValidationResult(
            success=False, results=[]
        ),
        data_asset=None,
    )

    # NOTE(review): expected_run_id carries an explicit run_time even though the
    # action above ran with run_name only — presumably RunIdentifier equality or
    # a frozen-time fixture makes this match; confirm against RunIdentifier.__eq__.
    expected_run_id = RunIdentifier(
        run_name="prod_20190801", run_time="20190926T134241.000000Z"
    )

    # Exactly one result stored, under the expected composite key.
    assert len(fake_in_memory_store.list_keys()) == 1
    stored_identifier = fake_in_memory_store.list_keys()[0]
    assert stored_identifier.batch_identifier == "1234"
    assert (
        stored_identifier.expectation_suite_identifier.expectation_suite_name
        == "default_expectations"
    )
    assert stored_identifier.run_id == expected_run_id

    # The stored payload round-trips via get().
    assert fake_in_memory_store.get(
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id=expected_run_id,
            batch_identifier="1234",
        )
    ) == ExpectationSuiteValidationResult(success=False, results=[])
def profile(
    cls,
    data_asset,
    run_id=None,
    profiler_configuration=None,
    run_name=None,
    run_time=None,
):
    """Profile ``data_asset`` and validate it against the generated suite.

    Args:
        data_asset: The data asset to profile; must pass ``cls.validate``.
        run_id: Optional run identifier (RunIdentifier, dict, or legacy string).
            Mutually exclusive with run_name/run_time.
        profiler_configuration: Optional configuration forwarded to ``cls._profile``.
        run_name: Optional run name used when no run_id is given (defaults to "profiling").
        run_time: Optional run time used when no run_id is given.

    Returns:
        Tuple of (expectation_suite, validation_results).

    Raises:
        GreatExpectationsError: If ``data_asset`` fails ``cls.validate``.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    if isinstance(run_id, str) and not run_name:
        # deprecated-v0.11.0 — legacy string run_id support.
        # BUGFIX: the implicit string concatenation previously read
        # "...you may also provide" "run_name..." which rendered as
        # "providerun_name"; a trailing space restores the intended message.
        warnings.warn(
            "String run_ids are deprecated as of v0.11.0 and support will be removed in v0.16. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        # If the string parses as a datetime, use it as run_time too;
        # on failure, run_time keeps its passed-in value (None here).
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        # No usable run_id: build one from run_name/run_time, defaulting the name.
        run_name = run_name or "profiling"
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    if not cls.validate(data_asset):
        raise GreatExpectationsError("Invalid data_asset for profiler; aborting")

    expectation_suite = cls._profile(
        data_asset, configuration=profiler_configuration
    )

    # Record the batch that produced this suite, then validate against it.
    batch_kwargs = data_asset.batch_kwargs
    expectation_suite = cls.add_meta(expectation_suite, batch_kwargs)
    validation_results = data_asset.validate(
        expectation_suite, run_id=run_id, result_format="SUMMARY"
    )
    expectation_suite.add_citation(
        comment=f"{str(cls.__name__)} added a citation based on the current batch.",
        batch_kwargs=data_asset.batch_kwargs,
        batch_markers=data_asset.batch_markers,
        batch_parameters=data_asset.batch_parameters,
    )
    return expectation_suite, validation_results
def test_warning_and_failure_validation_operator(validation_operators_data_context):
    """Running the errors-and-warnings operator on one batch stores validation
    results for both the ``<suite>.warning`` and ``<suite>.failure`` suites."""
    data_context = validation_operators_data_context
    validator_batch_kwargs = data_context.build_batch_kwargs(
        "my_datasource", "subdir_reader", "f1"
    )
    batch = data_context.get_batch(
        expectation_suite_name="f1.warning", batch_kwargs=validator_batch_kwargs
    )

    # NOTE: 20200130 - JPC - currently the warning and failure validation operator ignores the batch-provided suite and
    # fetches its own
    assert data_context.validations_store.list_keys() == []

    # We want to demonstrate running the validation operator with both a pre-built batch (DataAsset) and with
    # a tuple of parameters for get_batch
    results = data_context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=RunIdentifier(run_name="test-100"),
        validation_operator_name="errors_and_warnings_validation_operator",
        base_expectation_suite_name="f1",
    )

    validations_keys = data_context.validations_store.list_keys()
    assert (
        len(validations_keys) == 2
    )  # we should have run two suites even though there was only one batch
    suite_names = [
        key.expectation_suite_identifier.expectation_suite_name
        for key in validations_keys
    ]
    assert "f1.warning" in suite_names
    assert "f1.failure" in suite_names
def test_TupleGCSStoreBackend_base_public_path():
    """
    What does this test and why?

    the base_public_path parameter allows users to point to a custom DNS when hosting Data docs.

    This test will exercise the get_url_for_key method twice to see that we are getting the
    expected url, with or without base_public_path
    """
    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"
    base_public_path = "http://www.test.com/"

    # GCS has no moto-style emulator, so the client is mocked and only the
    # resulting public URL is asserted.
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_base_public_path = TupleGCSStoreBackend(
            filepath_template=None,
            bucket=bucket,
            prefix=prefix,
            project=project,
            base_public_path=base_public_path,
        )

        my_store_with_base_public_path.set(
            ("BBB",), b"bbb", content_encoding=None, content_type="image/png"
        )

        run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
        key = ValidationResultIdentifier(
            ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
            run_id,
            "my_batch_id",
        )
        # to_tuple()[1] is the serialized run_time component of the key.
        run_time_string = run_id.to_tuple()[1]

        url = my_store_with_base_public_path.get_public_url_for_key(key.to_tuple())
        # The public URL must be rooted at base_public_path, not the GCS domain.
        assert (
            url
            == "http://www.test.com/leakybucket"
            + f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
        )
def test_database_evaluation_parameter_store_get_bind_params(param_store):
    """get_bind_params returns all metrics stored under a run_id as a flat,
    string-keyed (urn -> value) dictionary, across multiple suites."""
    # Bind params must be expressed as a string-keyed dictionary.
    # Verify that the param_store supports that
    run_id = RunIdentifier(
        run_name=datetime.datetime.now(datetime.timezone.utc).strftime(
            "%Y%m%dT%H%M%S.%fZ"
        )
    )

    # Metric 1: asset.warning suite, with metric kwargs (column=mycol).
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name="expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    # Metric 2: same suite, no metric kwargs.
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name="expect_table_row_count_to_be_between.result.observed_value",
        metric_kwargs_id=None,
    )
    metric_value = 512
    param_store.set(metric_identifier, metric_value)

    # Metric 3: a different suite under the same run_id.
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset2.warning",
        metric_name="expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    params = param_store.get_bind_params(run_id)
    assert params == {
        "urn:great_expectations:validations:asset.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol": 12.3456789,
        "urn:great_expectations:validations:asset.warning:"
        "expect_table_row_count_to_be_between.result.observed_value": 512,
        "urn:great_expectations:validations:asset2.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol": 12.3456789,
    }
def ge_validation_result_suite_id() -> ValidationResultIdentifier:
    """Return a deterministic ValidationResultIdentifier fixture with a fixed
    suite name, run name/time, and batch identifier."""
    suite_identifier = ExpectationSuiteIdentifier("asset.default")
    run_identifier = RunIdentifier(
        run_name="test_100",
        # Fixed epoch timestamp keeps the fixture reproducible across runs.
        run_time=datetime.fromtimestamp(1640701702, tz=timezone.utc),
    )
    return ValidationResultIdentifier(
        expectation_suite_identifier=suite_identifier,
        run_id=run_identifier,
        batch_identifier="010ef8c1cd417910b971f4468f024ec5",
    )
def test_resource_key_passes_run_name_filter():
    """Exercises every run_name_filter operator (equals/not_equals, includes/
    not_includes, matches_regex) plus the deprecated eq/ne aliases."""
    resource_key = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier("test_suite"),
        run_id=RunIdentifier(run_name="foofooprofilingfoo"),
        batch_identifier="f14c3d2f6e8028c2db0c25edabdb0d61",
    )
    # equals: exact match only, not substring.
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"equals": "profiling"}
        )
        is False
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"equals": "foofooprofilingfoo"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_equals": "profiling"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_equals": "foofooprofilingfoo"}
        )
        is False
    )
    # includes/not_includes: substring semantics.
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"includes": "profiling"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"includes": "foobar"}
        )
        is False
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_includes": "foobar"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_includes": "profiling"}
        )
        is False
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key,
            run_name_filter={"matches_regex": "(foo){2}profiling(" "foo)+"},
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key,
            run_name_filter={"matches_regex": "(foo){3}profiling(" "foo)+"},
        )
        is False
    )
    # Deprecated aliases still work but must emit a DeprecationWarning.
    with pytest.warns(DeprecationWarning):
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"eq": "profiling"}
            )
            is False
        )
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"eq": "foofooprofilingfoo"}
            )
            is True
        )
    with pytest.warns(DeprecationWarning):
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"ne": "profiling"}
            )
            is True
        )
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"ne": "foofooprofilingfoo"}
            )
            is False
        )
def __init__(
    self,
    run_id,
    data_asset_name,
    expectation_suite_identifier,
    metric_name,
    metric_kwargs_id,
) -> None:
    """Constructs a ValidationMetricIdentifier.

    Args:
        run_id (RunIdentifier, dict, str, or None): run in which the metric was produced;
            legacy forms are coerced to RunIdentifier below.
        data_asset_name: name of the data asset the metric pertains to (may be None).
        expectation_suite_identifier (ExpectationSuiteIdentifier or str): suite that
            produced the metric; a bare name is wrapped in ExpectationSuiteIdentifier.
        metric_name: name of the metric.
        metric_kwargs_id: identifier for the metric's kwargs (may be None).
    """
    super().__init__(metric_name, metric_kwargs_id)
    # Accept a bare suite name for convenience.
    if not isinstance(expectation_suite_identifier, ExpectationSuiteIdentifier):
        expectation_suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_identifier
        )

    if isinstance(run_id, str):
        # deprecated-v0.11.0 — legacy string run_id support.
        warnings.warn(
            "String run_ids are deprecated as of v0.11.0 and support will be removed in v0.16. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            "and run_time (both optional).",
            DeprecationWarning,
        )
        # Use the string as run_time too when it parses as a datetime.
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            run_time = None
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif run_id is None:
        run_id = RunIdentifier()
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=str(run_id))

    self._run_id = run_id
    self._data_asset_name = data_asset_name
    self._expectation_suite_identifier = expectation_suite_identifier
def test_database_evaluation_parameter_store_basics(param_store):
    """A metric value stored under a ValidationMetricIdentifier round-trips
    unchanged through set()/get()."""
    run_name = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y%m%dT%H%M%S.%fZ"
    )
    key = ValidationMetricIdentifier(
        run_id=RunIdentifier(run_name=run_name),
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name="expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    stored_value = 12.3456789
    param_store.set(key, stored_value)
    assert param_store.get(key) == stored_value
def from_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from its variable-length tuple form.

    Layout: (run_name, run_time, data_asset_name, *suite_parts, metric_name,
    metric_kwargs_id); the suite may span several components.
    """
    if len(tuple_) < 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier tuple must have at least six components."
        )

    # "__" is the serialized placeholder for a missing data_asset_name.
    tuple_data_asset_name = None if tuple_[2] == "__" else tuple_[2]

    run_identifier = RunIdentifier.from_tuple((tuple_[0], tuple_[1]))
    suite_identifier = ExpectationSuiteIdentifier.from_tuple(tuple_[3:-2])
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])

    return cls(
        run_id=run_identifier,
        data_asset_name=tuple_data_asset_name,
        expectation_suite_identifier=suite_identifier,
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
def from_fixed_length_tuple(cls, tuple_):
    """Rebuild a ValidationMetricIdentifier from its fixed-length (6-element) tuple.

    Layout: (run_name, run_time, data_asset_name, expectation_suite_name,
    metric_name, metric_kwargs_id).

    Raises:
        GreatExpectationsError: If the tuple does not have exactly six components.
    """
    if len(tuple_) != 6:
        raise GreatExpectationsError(
            "ValidationMetricIdentifier fixed length tuple must have exactly six "
            "components."
        )
    # "__" is the serialized placeholder for a missing data_asset_name.
    if tuple_[2] == "__":
        tuple_data_asset_name = None
    else:
        tuple_data_asset_name = tuple_[2]
    metric_id = MetricIdentifier.from_tuple(tuple_[-2:])

    return cls(
        run_id=RunIdentifier.from_fixed_length_tuple((tuple_[0], tuple_[1])),
        data_asset_name=tuple_data_asset_name,
        expectation_suite_identifier=ExpectationSuiteIdentifier.from_fixed_length_tuple(
            # FIX: (tuple_[3],) is already a tuple; the former tuple((tuple_[3],))
            # wrapper was a redundant no-op conversion.
            (tuple_[3],)
        ),
        metric_name=metric_id.metric_name,
        metric_kwargs_id=metric_id.metric_kwargs_id,
    )
def test_evaluation_parameter_store_calls_proper_gcs_tuple_store_methods(
    mock_parent_list_keys,
    mock_gcs_list_keys,
):
    """
    What does this test and why?

    Demonstrate that EvaluationParameterStore works as expected with
    TupleGCSStoreBackend and that the store backend adheres to the Liskov
    substitution principle.
    """
    # NOTE(review): mock_parent_list_keys / mock_gcs_list_keys are presumably
    # injected by @mock.patch decorators on this test (not visible here) —
    # patching the parent class's and TupleGCSStoreBackend's list_keys.
    evaluation_parameter_store = EvaluationParameterStore()
    run_id = RunIdentifier()
    gcs_store = TupleGCSStoreBackend(bucket="my_bucket", project="my_project")
    # Swap in the GCS backend behind the store's back.
    evaluation_parameter_store._store_backend = gcs_store

    # Sanity check to ensure neither parent nor child method has been called
    assert not mock_gcs_list_keys.called
    assert not mock_parent_list_keys.called

    # `get_bind_params` calls the child method due to proper polymorphism
    evaluation_parameter_store.get_bind_params(run_id=run_id)

    assert mock_gcs_list_keys.called
    assert not mock_parent_list_keys.called
def test_errors_warnings_validation_operator_run_slack_query(
    warning_failure_validation_operator_data_context, assets_to_validate
):
    """The WarningAndFailure operator builds the expected Slack Block Kit
    payload, modulo the locale-dependent run-time string."""
    data_context = warning_failure_validation_operator_data_context
    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        name="test",
        slack_webhook="https://hooks.slack.com/services/test/slack/webhook",
    )
    return_obj = vo.run(
        assets_to_validate=assets_to_validate,
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    slack_query = vo._build_slack_query(return_obj)
    expected_slack_query = {
        "blocks": [
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*FailureVsWarning Validation Operator Completed.*",
                },
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Status*: Failed :x:"},
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*Batch Id List:* ['ge_batch_id=82a8de83-e063-11e9-8133-acde48001122', "
                    "'ge_batch_id=82a8de83-e063-11e9-8226-acde48001122', "
                    "'ge_batch_id=82a8de83-e063-11e9-a53d-acde48001122']",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*Failed Batches:* ['f1.failure-ge_batch_id=82a8de83-e063-11e9-8133-acde48001122']",
                },
            },
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Run Name:* test_100"},
            },
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Run Time:* LOCALEDATE"},
            },
            {"type": "divider"},
            {
                "type": "context",
                "elements": [
                    {
                        "type": "mrkdwn",
                        "text": "Learn about FailureVsWarning Validation Operators at https://docs.greatexpectations.i"
                        "o/en/latest/reference/validation_operators/warning_and_failure_expectation_suites_val"
                        "idation_operator.html",
                    }
                ],
            },
        ]
    }

    # We're okay with system variation in locales (OS X likes 24 hour, but not Travis)
    # Block index 7 is the "*Run Time:*" section; both 24-hour and AM/PM renderings
    # are normalized to the LOCALEDATE placeholder on both sides before comparing.
    slack_query["blocks"][7]["text"]["text"] = slack_query["blocks"][7]["text"][
        "text"
    ].replace("09/26/2019 13:42:41", "LOCALEDATE")
    slack_query["blocks"][7]["text"]["text"] = slack_query["blocks"][7]["text"][
        "text"
    ].replace("09/26/2019 01:42:41 PM", "LOCALEDATE")
    expected_slack_query["blocks"][7]["text"]["text"] = expected_slack_query["blocks"][
        7
    ]["text"]["text"].replace("09/26/2019 13:42:41", "LOCALEDATE")
    expected_slack_query["blocks"][7]["text"]["text"] = expected_slack_query["blocks"][
        7
    ]["text"]["text"].replace("09/26/2019 01:42:41 PM", "LOCALEDATE")

    # NOTE(review): leftover debug output — candidate for removal.
    import json

    print(json.dumps(slack_query, indent=2))
    print(json.dumps(expected_slack_query, indent=2))
    assert slack_query == expected_slack_query
def test_evaluation_parameter_store_methods(
    data_context_parameterized_expectation_suite: DataContext,
):
    """store_evaluation_parameters extracts observed values from validation
    results into the evaluation parameter store; get_bind_params accumulates
    urn->value entries across successive validations of the same run."""
    run_id = RunIdentifier(run_name="20191125T000000.000000Z")
    source_patient_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_patient_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_equal",
                    kwargs={
                        "value": 1024,
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 1024,
                    "element_count": 1024,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_patient_data_results
    )

    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id
    )
    # First suite's observed_value is exposed under its validation urn.
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value": 1024
    }
    source_diabetes_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_diabetes_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_unique_value_count_to_be_between",
                    kwargs={
                        "column": "patient_nbr",
                        "min": 2048,
                        "max": 2048
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 2048,
                    "element_count": 5000,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_diabetes_data_results
    )
    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id
    )
    # Storing a second suite's results under the same run_id accumulates;
    # the first entry (column-less urn) and the second (with column kwargs) coexist.
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value": 1024,
        "urn:great_expectations:validations:source_diabetes_data.default"
        ":expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr": 2048,
    }
def test_StoreMetricsAction(basic_in_memory_data_context_for_validation_operator):
    """StoreMetricsAction extracts the requested statistics from validation
    results and stores them per (run_id, suite, metric_name) in the metrics store."""
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        # "*" applies the requested metric list to every expectation suite.
        requested_metrics={
            "*": [
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )

    run_id = RunIdentifier(run_name="bar")

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo", "run_id": run_id},
        statistics={"evaluated_expectations": 5, "successful_expectations": 3},
    )

    # Run the action and store our metrics
    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    # Second result for a different suite ("foo.warning") under the same run.
    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo.warning", "run_id": run_id},
        statistics={"evaluated_expectations": 8, "successful_expectations": 4},
    )

    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    # Each suite's statistics are retrievable under its own metric identifier.
    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.evaluated_expectations",
                metric_kwargs_id=None,
            )
        )
        == 5
    )
    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.successful_expectations",
                metric_kwargs_id=None,
            )
        )
        == 3
    )
    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo.warning"),
                metric_name="statistics.evaluated_expectations",
                metric_kwargs_id=None,
            )
        )
        == 8
    )
    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo.warning"),
                metric_name="statistics.successful_expectations",
                metric_kwargs_id=None,
            )
        )
        == 4
    )
def to_python_value(
    self,
    ctx: FlyteContext,
    lv: Literal,
    expected_python_type: Type[GreatExpectationsType],
) -> GreatExpectationsType:
    """Convert a Flyte literal to a Python value, running Great Expectations
    validation on it along the way.

    Accepts a literal string (query or file path), FlyteFile, FlyteSchema, or
    structured dataset. Builds a (Runtime)BatchRequest from the configuration
    attached to ``expected_python_type``, runs a temporary SimpleCheckpoint,
    and raises ValidationError if any expectation fails. On success, returns
    the dataset cast to the expected type.
    """
    # Only string / schema / blob / structured-dataset literals are supported.
    if not (lv and lv.scalar and
            ((lv.scalar.primitive and lv.scalar.primitive.string_value)
             or lv.scalar.schema or lv.scalar.blob
             or lv.scalar.structured_dataset)):
        raise AssertionError(
            "Can only validate a literal string/FlyteFile/FlyteSchema value"
        )

    # fetch the configuration
    type_conf = GreatExpectationsTypeTransformer.get_config(expected_python_type)
    conf_dict = type_conf[1].to_dict()  # type: ignore
    ge_conf = GreatExpectationsFlyteConfig(**conf_dict)

    # fetch the data context
    context = ge.data_context.DataContext(ge_conf.context_root_dir)  # type: ignore

    # determine the type of data connector
    selected_datasource = list(
        filter(lambda x: x["name"] == ge_conf.datasource_name,
               context.list_datasources()))

    if not selected_datasource:
        raise ValueError("Datasource doesn't exist!")

    # Map each data connector of the selected datasource to its class name.
    data_connector_class_lookup = {
        data_connector_name: data_connector_class["class_name"]
        for data_connector_name, data_connector_class in
        selected_datasource[0]["data_connectors"].items()
    }

    specified_data_connector_class = data_connector_class_lookup[
        ge_conf.data_connector_name]

    # RuntimeDataConnector means we must build a RuntimeBatchRequest, which
    # requires an explicit data_asset_name.
    is_runtime = False
    if specified_data_connector_class == "RuntimeDataConnector":
        is_runtime = True
        if not ge_conf.data_asset_name:
            raise ValueError(
                "data_asset_name has to be given in a RuntimeBatchRequest")

    # file path for FlyteSchema and FlyteFile
    temp_dataset = ""

    # return value
    return_dataset = ""

    # FlyteSchema
    if lv.scalar.schema or lv.scalar.structured_dataset:
        return_dataset, temp_dataset = self._flyte_schema(
            is_runtime=is_runtime,
            ctx=ctx,
            ge_conf=ge_conf,
            lv=lv,
            expected_python_type=type_conf[0])

    # FlyteFile
    if lv.scalar.blob:
        return_dataset, temp_dataset = self._flyte_file(
            ctx=ctx, ge_conf=ge_conf, lv=lv, expected_python_type=type_conf[0])

    # For a plain string literal the value itself is both the dataset reference
    # and the return value; otherwise validate against the materialized temp path.
    if lv.scalar.primitive:
        dataset = return_dataset = lv.scalar.primitive.string_value
    else:
        dataset = temp_dataset

    batch_request_conf = ge_conf.batch_request_config

    # minimalistic batch request
    final_batch_request = {
        "data_asset_name": ge_conf.data_asset_name if is_runtime else dataset,
        "datasource_name": ge_conf.datasource_name,
        "data_connector_name": ge_conf.data_connector_name,
    }

    # Great Expectations' RuntimeBatchRequest
    if batch_request_conf and (batch_request_conf["runtime_parameters"]
                               or is_runtime):
        final_batch_request.update({
            "runtime_parameters":
            batch_request_conf["runtime_parameters"]
            if batch_request_conf["runtime_parameters"] else {},
            "batch_identifiers":
            batch_request_conf["batch_identifiers"],
            "batch_spec_passthrough":
            batch_request_conf["batch_spec_passthrough"],
        })

        # Runtime parameters carry either a SQL query (string literal) or
        # in-memory batch data (schema/structured dataset).
        if is_runtime and lv.scalar.primitive:
            final_batch_request["runtime_parameters"]["query"] = dataset
        elif is_runtime and (lv.scalar.schema or lv.scalar.structured_dataset):
            final_batch_request["runtime_parameters"][
                "batch_data"] = return_dataset.open().all()
        else:
            raise AssertionError(
                "Can only use runtime_parameters for query(str)/schema data")

    # Great Expectations' BatchRequest
    elif batch_request_conf:
        final_batch_request.update({
            "data_connector_query":
            batch_request_conf["data_connector_query"],
            "batch_spec_passthrough":
            batch_request_conf["batch_spec_passthrough"],
        })

    # Temporary checkpoint, optionally parameterized from the config.
    if ge_conf.checkpoint_params:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{ge_conf.expectation_suite_name}",
            context,
            **ge_conf.checkpoint_params,
        )
    else:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{ge_conf.expectation_suite_name}", context)

    # identify every run uniquely
    run_id = RunIdentifier(
        **{
            "run_name": ge_conf.datasource_name + "_run",
            "run_time": datetime.datetime.utcnow(),
        })

    checkpoint_result = checkpoint.run(
        run_id=run_id,
        validations=[{
            "batch_request": final_batch_request,
            "expectation_suite_name": ge_conf.expectation_suite_name,
        }],
    )
    final_result = convert_to_json_serializable(
        checkpoint_result.list_validation_results())[0]

    result_string = ""
    if final_result["success"] is False:
        # Collect every failed expectation as "column -> expectation_type".
        for every_result in final_result["results"]:
            if every_result["success"] is False:
                result_string += (
                    every_result["expectation_config"]["kwargs"]["column"]
                    + " -> "
                    + every_result["expectation_config"]["expectation_type"]
                    + "\n")

        # raise a Great Expectations' exception
        raise ValidationError(
            "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" + result_string)

    logger.info("Validation succeeded!")

    return typing.cast(GreatExpectationsType, return_dataset)
def test_run_identifier_parses_datetime_run_name():
    """A run_name formatted as a timestamp is parsed into an equal run_time."""
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    timestamp_name = now_utc.strftime("%Y%m%dT%H%M%S.%fZ")
    identifier = RunIdentifier(run_name=timestamp_name)
    # Re-formatting the parsed run_time must reproduce the original run_name.
    assert identifier.run_time.strftime("%Y%m%dT%H%M%S.%fZ") == identifier.run_name
def test_TupleGCSStoreBackend():
    # pytest.importorskip("google-cloud-storage")
    """
    What does this test test and why?

    Since no package like moto exists for GCP services, we mock the GCS client
    and assert that the store backend makes the right calls for set, get, and list.

    TODO : One option may be to have a GCS Store in Docker, which can be use to "actually" run these tests.
    """
    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"
    base_public_path = "http://www.test.com/"

    # --- set() with a filepath_template ---
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store = TupleGCSStoreBackend(
            filepath_template="my_file_{0}",
            bucket=bucket,
            prefix=prefix,
            project=project,
        )

        my_store.set(("AAA",), "aaa", content_type="text/html")

        mock_gcs_client.assert_called_with("dummy-project")
        mock_client.get_bucket.assert_called_with("leakybucket")
        mock_bucket.blob.assert_called_with("this_is_a_test_prefix/my_file_AAA")
        # mock_bucket.blob.assert_any_call("this_is_a_test_prefix/.ge_store_backend_id")
        # str payloads are uploaded encoded as bytes.
        mock_blob.upload_from_string.assert_called_with(
            b"aaa", content_type="text/html"
        )

    # --- set() without a filepath_template (bytes payload) ---
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_no_filepath_template = TupleGCSStoreBackend(
            filepath_template=None, bucket=bucket, prefix=prefix, project=project
        )

        my_store_with_no_filepath_template.set(
            ("AAA",), b"aaa", content_encoding=None, content_type="image/png"
        )

        mock_gcs_client.assert_called_with("dummy-project")
        mock_client.get_bucket.assert_called_with("leakybucket")
        mock_bucket.blob.assert_called_with("this_is_a_test_prefix/AAA")
        # mock_bucket.blob.assert_any_call("this_is_a_test_prefix/.ge_store_backend_id")
        mock_blob.upload_from_string.assert_called_with(
            b"aaa", content_type="image/png"
        )

    # --- get() downloads and decodes the blob ---
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.get_blob.return_value
        mock_str = mock_blob.download_as_string.return_value

        my_store.get(("BBB",))

        mock_gcs_client.assert_called_once_with("dummy-project")
        mock_client.get_bucket.assert_called_once_with("leakybucket")
        mock_bucket.get_blob.assert_called_once_with(
            "this_is_a_test_prefix/my_file_BBB"
        )
        mock_blob.download_as_string.assert_called_once()
        mock_str.decode.assert_called_once_with("utf-8")

    # --- list_keys() / remove_key() ---
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value

        my_store.list_keys()
        mock_client.list_blobs.assert_called_once_with(
            "leakybucket", prefix="this_is_a_test_prefix"
        )

        my_store.remove_key("leakybucket")

        from google.cloud.exceptions import NotFound

        # NOTE(review): assert_* does not raise NotFound, so this try/except is
        # effectively a plain assert — looks like it was meant to tolerate a
        # NotFound from remove_key; confirm intent.
        try:
            mock_client.get_bucket.assert_called_once_with("leakybucket")
        except NotFound:
            pass

    # --- get() surfaces backend errors; public URL formatting ---
    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_gcs_client.side_effect = InvalidKeyError("Hi I am an InvalidKeyError")
        with pytest.raises(InvalidKeyError):
            my_store.get(("non_existent_key",))

    run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
    key = ValidationResultIdentifier(
        ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
        run_id,
        "my_batch_id",
    )
    # to_tuple()[1] is the serialized run_time component of the key.
    run_time_string = run_id.to_tuple()[1]

    url = my_store_with_no_filepath_template.get_url_for_key(key.to_tuple())
    assert (
        url
        == "https://storage.googleapis.com/leakybucket"
        + f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
    )
def from_fixed_length_tuple(cls, tuple_):
    """Rebuild a ValidationResultIdentifier from its fixed-length tuple form.

    Layout: (expectation_suite_name, run_name, run_time, batch_identifier).
    """
    suite_identifier = ExpectationSuiteIdentifier(tuple_[0])
    run_identifier = RunIdentifier.from_tuple((tuple_[1], tuple_[2]))
    return cls(suite_identifier, run_identifier, tuple_[3])
def from_tuple(cls, tuple_):
    """Rebuild a ValidationResultIdentifier from its variable-length tuple form.

    Layout: (*suite_parts, run_name, run_time, batch_identifier); the suite
    name may span several leading components.
    """
    suite_identifier = ExpectationSuiteIdentifier.from_tuple(tuple_[0:-3])
    run_identifier = RunIdentifier.from_tuple((tuple_[-3], tuple_[-2]))
    return cls(suite_identifier, run_identifier, tuple_[-1])
def run(
    self,
    assets_to_validate,
    run_id=None,
    base_expectation_suite_name=None,
    evaluation_parameters=None,
    run_name=None,
    run_time=None,
    result_format=None,
):
    """Validate each asset against the "failure"- and "warning"-severity
    expectation suites derived from ``base_expectation_suite_name`` and
    return a ValidationOperatorResult.

    Args:
        assets_to_validate: items convertible to batches via
            ``self._build_batch_from_item``.
        run_id: RunIdentifier (or deprecated str / dict form); mutually
            exclusive with run_name / run_time.
        base_expectation_suite_name: base suite name to which the operator's
            severity suffixes are appended; falls back to the operator's
            configured value.
        evaluation_parameters: forwarded to ``batch.validate``.
        run_name: optional run name, used when run_id is not given.
        run_time: optional run time, used when run_id is not given.
        result_format: overrides the operator's default result format.

    Returns:
        ValidationOperatorResult whose ``success`` reflects only the
        failure-severity validations.

    Raises:
        ValueError: if no base expectation suite name is available.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    # Normalize the deprecated run_id forms (str / dict) into a RunIdentifier.
    if isinstance(run_id, str) and not run_name:
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            # NOTE: trailing space added — the adjacent-literal concatenation
            # previously produced "you may also providerun_name".
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    if base_expectation_suite_name is None:
        if self.base_expectation_suite_name is None:
            raise ValueError(
                "base_expectation_suite_name must be configured in the validation operator or passed at runtime"
            )
        base_expectation_suite_name = self.base_expectation_suite_name

    run_results = {}

    for item in assets_to_validate:
        batch = self._build_batch_from_item(item)
        batch_id = batch.batch_id

        assert batch_id is not None
        assert run_id is not None

        # --- failure-severity suite ------------------------------------
        failure_expectation_suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=base_expectation_suite_name
            + self.expectation_suite_name_suffixes[0]
        )
        failure_validation_result_id = ValidationResultIdentifier(
            expectation_suite_identifier=failure_expectation_suite_identifier,
            run_id=run_id,
            batch_identifier=batch_id,
        )
        failure_expectation_suite = None
        try:
            failure_expectation_suite = self.data_context.stores[
                self.data_context.expectations_store_name
            ].get(failure_expectation_suite_identifier)
        # NOTE : Abe 2019/09/17 : I'm concerned that this may be too permissive, since
        # it will catch any error in the Store, not just KeyErrors. In the longer term, a better
        # solution will be to have the Stores catch other known errors and raise KeyErrors,
        # so that methods like this can catch and handle a single error type.
        except Exception:
            logger.debug(
                "Failure expectation suite not found: {}".format(
                    failure_expectation_suite_identifier
                )
            )

        if failure_expectation_suite:
            failure_run_result_obj = {"expectation_suite_severity_level": "failure"}
            failure_validation_result = batch.validate(
                failure_expectation_suite,
                run_id,
                result_format=result_format
                if result_format
                else self.result_format,
                evaluation_parameters=evaluation_parameters,
            )
            failure_run_result_obj["validation_result"] = failure_validation_result
            failure_actions_results = self._run_actions(
                batch,
                failure_expectation_suite_identifier,
                failure_expectation_suite,
                failure_validation_result,
                run_id,
            )
            failure_run_result_obj["actions_results"] = failure_actions_results
            run_results[failure_validation_result_id] = failure_run_result_obj

            # Only failure-severity failures can abort the whole run.
            if not failure_validation_result.success and self.stop_on_first_error:
                break

        # --- warning-severity suite ------------------------------------
        warning_expectation_suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=base_expectation_suite_name
            + self.expectation_suite_name_suffixes[1]
        )
        warning_validation_result_id = ValidationResultIdentifier(
            expectation_suite_identifier=warning_expectation_suite_identifier,
            run_id=run_id,
            batch_identifier=batch.batch_id,
        )
        warning_expectation_suite = None
        try:
            warning_expectation_suite = self.data_context.stores[
                self.data_context.expectations_store_name
            ].get(warning_expectation_suite_identifier)
        except Exception:
            logger.debug(
                "Warning expectation suite not found: {}".format(
                    warning_expectation_suite_identifier
                )
            )

        if warning_expectation_suite:
            warning_run_result_obj = {"expectation_suite_severity_level": "warning"}
            warning_validation_result = batch.validate(
                warning_expectation_suite,
                run_id,
                result_format=result_format
                if result_format
                else self.result_format,
                evaluation_parameters=evaluation_parameters,
            )
            warning_run_result_obj["validation_result"] = warning_validation_result
            warning_actions_results = self._run_actions(
                batch,
                warning_expectation_suite_identifier,
                warning_expectation_suite,
                warning_validation_result,
                run_id,
            )
            warning_run_result_obj["actions_results"] = warning_actions_results
            run_results[warning_validation_result_id] = warning_run_result_obj

    # Overall success considers only failure-severity results; warning-level
    # failures never fail the run.
    validation_operator_result = ValidationOperatorResult(
        run_id=run_id,
        run_results=run_results,
        validation_operator_config=self.validation_operator_config,
        evaluation_parameters=evaluation_parameters,
        success=all(
            run_result_obj["validation_result"].success
            for run_result_obj in run_results.values()
            if run_result_obj["expectation_suite_severity_level"] == "failure"
        ),
    )

    if self.slack_webhook:
        # "and" binds tighter than "or": notify on all, or on matching outcome.
        if (
            self.notify_on == "all"
            or self.notify_on == "success"
            and validation_operator_result.success
            or self.notify_on == "failure"
            and not validation_operator_result.success
        ):
            slack_query = self._build_slack_query(
                validation_operator_result=validation_operator_result
            )
            send_slack_notification(
                query=slack_query, slack_webhook=self.slack_webhook
            )

    return validation_operator_result
def test_configuration_driven_site_builder_without_how_to_buttons(
    site_builder_data_context_with_html_store_titanic_random,
):
    """Site builder should omit how-to buttons when show_how_to_buttons is False."""
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filtering validation results to be included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        # use data_asset_name, matching the sibling site-builder tests, rather
        # than the deprecated "name" keyword
        data_asset_name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name
    )

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = "test_run_id_12345"
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=RunIdentifier(run_name=run_id),
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    # set this flag to false in config to hide how-to buttons and related elements
    local_site_config["show_how_to_buttons"] = False

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    assert_how_to_buttons(
        context, index_page_locator_info, index_links_dict, show_how_to_buttons=False
    )
def test_errors_warnings_validation_operator_succeeded_vo_result_with_only_failed_warning_suite(
    warning_failure_validation_operator_data_context, assets_to_validate
):
    # this tests that the WarningAndFailureExpectationSuitesValidationOperator returns a
    # SUCCESSFUL ValidationOperatorResult when the only failed validations have a suite
    # severity level of "warning" (warning-level failures must not fail the run)
    data_context = warning_failure_validation_operator_data_context
    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        name="test",
    )

    # only pass asset that yields succeeded "failure-level" suite and failed "warning-level" suite
    return_obj = vo.run(
        assets_to_validate=[assets_to_validate[0]],
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    run_results = list(return_obj.run_results.values())

    # make sure there are no failed validations with suite severity of failure
    assert not any(
        run_result
        for run_result in run_results
        if run_result["expectation_suite_severity_level"] == "failure"
        and not run_result["validation_result"].success
    )
    # make sure there is at least one failed validation with suite severity of warning
    assert any(
        run_result
        for run_result in run_results
        if run_result["expectation_suite_severity_level"] == "warning"
        and not run_result["validation_result"].success
    )
    assert return_obj.success

    # only pass asset that yields succeeded "failure-level" suite and succeeded "warning-level" suite
    return_obj_2 = vo.run(
        assets_to_validate=[assets_to_validate[2]],
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    run_results_2 = list(return_obj_2.run_results.values())

    # make sure there are no failed validations with suite severity of failure
    assert not any(
        run_result
        for run_result in run_results_2
        if run_result["expectation_suite_severity_level"] == "failure"
        and not run_result["validation_result"].success
    )
    # make sure there are no failed validation with suite severity of warning
    assert not any(
        run_result
        for run_result in run_results_2
        if run_result["expectation_suite_severity_level"] == "warning"
        and not run_result["validation_result"].success
    )
    assert return_obj_2.success
def run(
    self,
    assets_to_validate,
    run_id=None,
    evaluation_parameters=None,
    run_name=None,
    run_time=None,
    catch_exceptions=None,
    result_format=None,
    checkpoint_identifier=None,
):
    """Validate all assets (possibly concurrently) and run the configured
    action list on each validation result.

    Args:
        assets_to_validate: items convertible to batches via
            ``self._build_batch_from_item``.
        run_id: RunIdentifier (or deprecated str / dict form); mutually
            exclusive with run_name / run_time.
        evaluation_parameters: forwarded to ``batch.validate``.
        run_name: optional run name, used when run_id is not given.
        run_time: optional run time, used when run_id is not given.
        catch_exceptions: forwarded to ``batch.validate`` when not None.
        result_format: overrides the operator's default result format.
        checkpoint_identifier: forwarded to ``self._run_actions``.

    Returns:
        ValidationOperatorResult keyed by validation result identifier.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    # Normalize the deprecated run_id forms (str / dict) into a RunIdentifier.
    if isinstance(run_id, str) and not run_name:
        warnings.warn(
            "String run_ids will be deprecated in the future. Please provide a run_id of type "
            "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
            # NOTE: trailing space added — the adjacent-literal concatenation
            # previously produced "you may also providerun_name".
            "and run_time (both optional). Instead of providing a run_id, you may also provide "
            "run_name and run_time separately.",
            DeprecationWarning,
        )
        try:
            run_time = parse(run_id)
        except (ValueError, TypeError):
            pass
        run_id = RunIdentifier(run_name=run_id, run_time=run_time)
    elif isinstance(run_id, dict):
        run_id = RunIdentifier(**run_id)
    elif not isinstance(run_id, RunIdentifier):
        run_id = RunIdentifier(run_name=run_name, run_time=run_time)

    ###
    # NOTE: 20211010 - jdimatteo: This method is called by both Checkpoint.run and LegacyCheckpoint.run and below
    # usage of AsyncExecutor may speed up I/O bound validations by running them in parallel with multithreading
    # (if concurrency is enabled in the data context configuration).
    #
    # When this method is called by LegacyCheckpoint.run, len(assets_to_validate) may be greater than 1. If
    # concurrency is enabled in the configuration AND len(assets_to_validate) > 1, then execution is run in multiple
    # threads with AsyncExecutor -- otherwise AsyncExecutor only uses the current single thread to execute the work.
    # Please see the below arguments used to initialize AsyncExecutor and the corresponding AsyncExecutor docstring
    # for more details on when multiple threads are used.
    #
    # When this method is called by Checkpoint.run, len(assets_to_validate) may be 1 even if there are multiple
    # validations, because Checkpoint.run calls this method in a loop for each validation. AsyncExecutor is also
    # used in the Checkpoint.run loop to optionally run each validation in parallel with multithreading, so this
    # method's AsyncExecutor is nested within the Checkpoint.run AsyncExecutor. The AsyncExecutor logic to only use
    # multithreading when max_workers > 1 ensures that no nested multithreading is ever used when
    # len(assets_to_validate) is equal to 1. So no unnecessary multithreading is ever used here even though it may
    # be nested inside another AsyncExecutor (and this is a good thing because it avoids extra overhead associated
    # with each thread and minimizes the total number of threads to simplify debugging).
    with AsyncExecutor(
        self.data_context.concurrency, max_workers=len(assets_to_validate)
    ) as async_executor:
        batch_and_async_result_tuples = []
        for item in assets_to_validate:
            batch = self._build_batch_from_item(item)

            if hasattr(batch, "active_batch_id"):
                batch_identifier = batch.active_batch_id
            else:
                batch_identifier = batch.batch_id

            if result_format is None:
                result_format = self.result_format

            batch_validate_arguments = {
                "run_id": run_id,
                "result_format": result_format,
                "evaluation_parameters": evaluation_parameters,
            }
            if catch_exceptions is not None:
                batch_validate_arguments["catch_exceptions"] = catch_exceptions

            # BUG FIX: carry batch_identifier alongside each (batch, future)
            # pair. Previously the results loop below read the loop variable
            # left over from THIS loop, so with more than one asset every
            # non-cloud ValidationResultIdentifier received the LAST batch's
            # identifier.
            batch_and_async_result_tuples.append(
                (
                    batch,
                    batch_identifier,
                    async_executor.submit(
                        batch.validate,
                        **batch_validate_arguments,
                    ),
                )
            )

        run_results = {}
        for (
            batch,
            batch_identifier,
            async_batch_validation_result,
        ) in batch_and_async_result_tuples:
            if self.data_context.ge_cloud_mode:
                expectation_suite_identifier = GeCloudIdentifier(
                    resource_type="expectation_suite",
                    ge_cloud_id=batch._expectation_suite.ge_cloud_id,
                )
                validation_result_id = GeCloudIdentifier(
                    resource_type="suite_validation_result"
                )
            else:
                expectation_suite_identifier = ExpectationSuiteIdentifier(
                    expectation_suite_name=batch._expectation_suite.expectation_suite_name
                )
                validation_result_id = ValidationResultIdentifier(
                    batch_identifier=batch_identifier,
                    expectation_suite_identifier=expectation_suite_identifier,
                    run_id=run_id,
                )

            # .result() blocks until the async validation completes.
            batch_actions_results = self._run_actions(
                batch=batch,
                expectation_suite_identifier=expectation_suite_identifier,
                expectation_suite=batch._expectation_suite,
                batch_validation_result=async_batch_validation_result.result(),
                run_id=run_id,
                validation_result_id=validation_result_id,
                checkpoint_identifier=checkpoint_identifier,
            )

            run_result_obj = {
                "validation_result": async_batch_validation_result.result(),
                "actions_results": batch_actions_results,
            }
            run_results[validation_result_id] = run_result_obj

    return ValidationOperatorResult(
        run_id=run_id,
        run_results=run_results,
        validation_operator_config=self.validation_operator_config,
        evaluation_parameters=evaluation_parameters,
    )
def test_StoreMetricsAction_column_metric(
    basic_in_memory_data_context_for_validation_operator,
):
    # StoreMetricsAction should extract both a column-scoped metric (requested
    # via the nested {"column": {<column_name>: [...]}} syntax) and suite-level
    # statistics from a validation result, and persist them to the metrics
    # store keyed by ValidationMetricIdentifier.
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        requested_metrics={
            # "*" applies this metric configuration to every expectation suite.
            "*": [
                {
                    "column": {
                        "provider_id": [
                            "expect_column_values_to_be_unique.result.unexpected_count"
                        ]
                    }
                },
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )
    run_id = RunIdentifier(run_name="bar")
    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo", "run_id": run_id},
        results=[
            ExpectationValidationResult(
                meta={},
                result={
                    "element_count": 10,
                    "missing_count": 0,
                    "missing_percent": 0.0,
                    # the requested column metric should resolve to this value
                    "unexpected_count": 7,
                    "unexpected_percent": 0.0,
                    "unexpected_percent_nonmissing": 0.0,
                    "partial_unexpected_list": [],
                },
                success=True,
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_unique",
                    kwargs={"column": "provider_id", "result_format": "BASIC"},
                ),
                exception_info=None,
            )
        ],
        statistics={"evaluated_expectations": 5, "successful_expectations": 3},
    )
    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )
    # The stored metric must be retrievable via a fully-specified
    # ValidationMetricIdentifier and equal unexpected_count above (7).
    # metric_kwargs_id encodes the column scope as "column=provider_id".
    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="expect_column_values_to_be_unique.result.unexpected_count",
                metric_kwargs_id="column=provider_id",
            )
        )
        == 7
    )
def test_configuration_driven_site_builder_skip_and_clean_missing(
    site_builder_data_context_with_html_store_titanic_random,
):
    # tests auto-cleaning functionality of DefaultSiteIndexBuilder
    # when index page is built, if an HTML page is present without corresponding suite or validation result,
    # the HTML page should be removed and not appear on index page
    context = site_builder_data_context_with_html_store_titanic_random
    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filtering validation results to be included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        data_asset_name=data_asset_name,
    )
    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name
    )
    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )
    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    # counts presumably come from the fixture's profiled assets plus the run
    # above — TODO confirm against the fixture if they drift
    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6

    expectation_suite_set = set(context.stores["expectations_store"].list_keys())
    assert len(expectation_suite_set) == 5

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    site_builder.build()

    # test expectation suite pages
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.store_backends[
            ExpectationSuiteIdentifier
        ].list_keys()
    }
    # suites in expectations store should match html pages
    assert expectation_suite_set == expectation_suite_html_pages

    # remove suites from expectations store
    # NOTE(review): removes two arbitrary keys — set iteration order is stable
    # within a single run for the same (unmodified) set object
    for i in range(2):
        context.stores["expectations_store"].remove_key(list(expectation_suite_set)[i])

    # re-build data docs, which should remove suite HTML pages that no longer have corresponding suite in
    # expectations store
    site_builder.build()

    expectation_suite_set = set(context.stores["expectations_store"].list_keys())
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.store_backends[
            ExpectationSuiteIdentifier
        ].list_keys()
    }
    assert expectation_suite_set == expectation_suite_html_pages

    # test validation result pages
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].list_keys()
    }
    # validations in store should match html pages
    assert validations_set == validation_html_pages

    # remove validations from store
    for i in range(2):
        context.stores["validations_store"].store_backend.remove_key(
            list(validations_set)[i]
        )

    # re-build data docs, which should remove validation HTML pages that no longer have corresponding validation in
    # validations store
    site_builder.build()

    validations_set = set(context.stores["validations_store"].list_keys())
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].list_keys()
    }
    assert validations_set == validation_html_pages
def get_bind_params(self, run_id: RunIdentifier) -> dict:
    """Collect all stored values for a run, keyed by evaluation-parameter URN.

    Lists the backend keys under the run's tuple prefix, converts each raw
    tuple back into a key object, and maps its URN to the stored value.
    """
    keys = (
        self.tuple_to_key(raw_key)
        for raw_key in self._store_backend.list_keys(run_id.to_tuple())
    )
    return {key.to_evaluation_parameter_urn(): self.get(key) for key in keys}
def validation_operator_run(name, run_name, validation_config_file, suite, directory):
    # Note though the long lines here aren't pythonic, they look best if Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development): Specify the name of the validation
    operator using the --name argument and the name of the expectation suite
    using the --suite argument.

    The cli will help you specify the batch of data that you want to validate
    interactively.


    2. Non-interactive (good for production): Use the `--validation_config_file`
    argument to specify the path of the validation configuration JSON file. This file
    can be used to instruct a validation operator to validate multiple batches of data and
    use multiple expectation suites to validate each batch.

    Learn how to create a validation config file here:
    https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

    This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True.
    Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """
    # Load the project context first; without it nothing else can run.
    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("Failed to process <red>{}</red>".format(err.message))
        sys.exit(1)

    # Everything below is wrapped so unexpected errors still emit a usage
    # event before re-raising. sys.exit raises SystemExit, which is NOT
    # caught by `except Exception`, so the exit paths below are unaffected.
    try:
        if validation_config_file is not None:
            # Non-interactive mode: read and validate the JSON config file.
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            # NOTE(review): "_validate_valdiation_config" is misspelled, but it
            # is defined elsewhere — renaming must happen at the definition.
            validation_config_error_message = _validate_valdiation_config(
                validation_config
            )
            if validation_config_error_message is not None:
                cli_message(
                    "<red>The validation config in {:s} is misconfigured: {:s}</red>".format(
                        validation_config_file, validation_config_error_message
                    )
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

        else:
            # Interactive mode: require --suite and --name, then build a
            # single-batch validation config from user selections.
            if suite is None:
                cli_message(
                    """
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
"""
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                # exits 0 (not 1) when --suite is simply missing
                sys.exit(0)
            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run"
            )

            if name is None:
                cli_message(
                    """
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
"""
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
            else:
                if name not in context.list_validation_operator_names():
                    cli_message(
                        f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
"""
                    )
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            # NOTE(review): "your want" is a typo in a user-facing message;
            # left unchanged here because this edit only adds comments.
            cli_message(
                """
Let us help you specify the batch of data your want the validation operator to validate."""
            )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            # Synthesize the same config shape the non-interactive path reads.
            validation_config = {
                "validation_operator_name": name,
                "batches": [
                    {
                        "batch_kwargs": batch_kwargs,
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            }

        # From here both modes share the same execution path.
        try:
            validation_operator_name = validation_config["validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(
                        entry["batch_kwargs"], expectation_suite_name
                    )
                    batches_to_validate.append(batch)

            if run_name is None:
                # default run_name: current UTC time in a sortable format
                run_name = datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )

            run_id = RunIdentifier(run_name=run_name)

            if suite is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                if suite.evaluation_parameters is None:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                    )
                else:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                        evaluation_parameters=suite.evaluation_parameters,
                    )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message("<red>{}</red>".format(e))
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                success=False,
            )
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            # success=True: the CLI command itself ran to completion even
            # though the validation result was a failure
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                success=True,
            )
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                success=True,
            )
            sys.exit(0)
    except Exception as e:
        send_usage_message(
            data_context=context,
            event="cli.validation_operator.run",
            success=False,
        )
        raise e
def test_configuration_driven_site_builder(
    site_builder_data_context_v013_with_html_store_titanic_random,
):
    """End-to-end test of a SiteBuilder configured via the project's data_docs config.

    Exercises, in order:
      * profiling a datasource plus one explicit validation run,
      * a full site build and the contents of its index,
      * an incremental rebuild scoped to a single validation resource
        (and that previously rendered pages are NOT touched),
      * ``get_resource_url`` for both a specific resource and the index,
      * ``clean_site`` for the team site and the local site,
      * a final rebuild to restore the local site.
    """
    context = site_builder_data_context_v013_with_html_store_titanic_random
    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filtering validation results to be included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        data_asset_name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name
    )

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6
    assert (
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name=expectation_suite_name
            ),
            run_id="test_run_id_12345",
            batch_identifier=batch.batch_id,
        )
        in validations_set
    )
    # NOTE: this assertion was previously repeated four times verbatim;
    # one occurrence is sufficient.
    assert (
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name=expectation_suite_name
            ),
            run_id="profiling",
            batch_identifier=batch.batch_id,
        )
        in validations_set
    )

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    # assert that how-to buttons and related elements are rendered (default behavior)
    assert_how_to_buttons(context, index_page_locator_info, index_links_dict)
    assert (
        index_page_locator_info
        == "file://"
        + context.root_directory
        + "/uncommitted/data_docs/local_site/index.html"
    )

    assert "site_name" in index_links_dict

    assert "expectations_links" in index_links_dict
    assert len(index_links_dict["expectations_links"]) == 5

    assert "validations_links" in index_links_dict
    assert (
        len(index_links_dict["validations_links"]) == 1
    ), """
    The only rendered validation should be the one not generated by the profiler
    """

    assert "profiling_links" in index_links_dict
    assert len(index_links_dict["profiling_links"]) == 5

    # save documentation locally; the target of copytree must not exist, so any
    # leftover from a previous run is removed first (the previous version also
    # created ./tests/render/output/documentation only to delete it immediately)
    os.makedirs("./tests/render/output", exist_ok=True)
    if os.path.isdir("./tests/render/output/documentation"):
        shutil.rmtree("./tests/render/output/documentation")
    shutil.copytree(
        os.path.join(
            site_builder_data_context_v013_with_html_store_titanic_random.root_directory,
            "uncommitted/data_docs/",
        ),
        "./tests/render/output/documentation",
    )

    # let's create another validation result and run the site builder to add it
    # to the data docs
    # the operator does not have an StoreValidationResultAction action configured, so the site
    # will not be updated without our call to site builder
    expectation_suite_path_component = expectation_suite_name.replace(".", "/")
    validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    ts_last_mod_0 = os.path.getmtime(validation_result_page_path)

    run_id = RunIdentifier(run_name="test_run_id_12346")
    operator_result = context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    validation_result_id = operator_result.list_validation_result_identifiers()[0]
    res = site_builder.build(resource_identifiers=[validation_result_id])

    index_links_dict = res[1]

    # verify that an additional validation result HTML file was generated
    assert len(index_links_dict["validations_links"]) == 2

    # verify that the validation result HTML file rendered in the previous run was NOT updated
    ts_last_mod_1 = os.path.getmtime(validation_result_page_path)

    assert ts_last_mod_0 == ts_last_mod_1

    # verify that get_resource_url returns the URL of the HTML file that renders
    # a given resource identifier
    new_validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    html_url = site_builder.get_resource_url(resource_identifier=validation_result_id)
    assert "file://" + new_validation_result_page_path == html_url

    # without a resource identifier, get_resource_url points at the site index
    html_url = site_builder.get_resource_url()
    assert (
        "file://"
        + os.path.join(
            site_builder.site_index_builder.target_store.store_backends[
                ValidationResultIdentifier
            ].full_base_directory,
            "index.html",
        )
        == html_url
    )

    team_site_config = data_docs_config["team_site"]
    team_site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **team_site_config
    )
    team_site_builder.clean_site()
    obs = [
        url_dict
        for url_dict in context.get_docs_sites_urls(site_name="team_site")
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # exercise clean_site
    site_builder.clean_site()
    obs = [
        url_dict
        for url_dict in context.get_docs_sites_urls()
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # restore site
    context = site_builder_data_context_v013_with_html_store_titanic_random
    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    res = site_builder.build()