def to_json_dict(self) -> dict:
    """Serialize this object to a JSON-compatible dictionary.

    The nested "batch_request" entry is scrubbed of empty/None properties
    (in place) before the dictionary is returned.
    """
    serialized: dict = self.to_dict()
    deep_filter_properties_iterable(
        properties=serialized["batch_request"],
        inplace=True,
    )
    return serialized
def __init__(
    self,
    context: DataContext,
    expectation_suite_name: str,
    profiler_name: str,
    batch_request: Union[str, Dict[str, Union[str, int, Dict[str, Any]]]],
) -> None:
    """Initialize with a data context, suite/profiler names, and a batch_request.

    The batch_request is normalized (None -> {}, empty properties removed,
    display ordering standardized) before a Validator is obtained from the context.
    """
    super().__init__(context=context)

    # Normalize the supplied batch_request before use.
    effective_batch_request = {} if batch_request is None else batch_request
    deep_filter_properties_iterable(
        properties=effective_batch_request,
        inplace=True,
    )
    effective_batch_request = standardize_batch_request_display_ordering(
        batch_request=effective_batch_request
    )
    self._batch_request = effective_batch_request

    self._validator = context.get_validator(
        batch_request=BatchRequest(**effective_batch_request),
        expectation_suite_name=expectation_suite_name,
    )

    self._profiler_name = profiler_name
    # The validator is the source of truth for the effective suite name.
    self._expectation_suite_name = self._validator.expectation_suite_name
def __eq__(self, other):
    """Compare against a serializable peer, a plain dict, or a string rendering.

    Equality holds when any of the following matches: both JSON dicts are equal,
    the falsy-cleaned JSON dict equals the falsy-cleaned other dict, or the
    string forms are equal.
    """
    if other is None:
        return False

    if hasattr(other, "to_json_dict") and self.to_json_dict() == other.to_json_dict():
        return True

    if isinstance(other, dict):
        cleaned_self = deep_filter_properties_iterable(
            properties=self.to_json_dict(), clean_falsy=True
        )
        cleaned_other = deep_filter_properties_iterable(
            properties=other, clean_falsy=True
        )
        if cleaned_self == cleaned_other:
            return True

    return self.__str__() == str(other)
def __repr__(self) -> str:
    """
    # TODO: <Alex>2/4/2022</Alex>
    This implementation of a custom "__repr__()" occurs frequently and should ideally
    serve as the reference implementation in the "SerializableDictDot" class. However,
    the circular import dependencies, due to the location of the
    "great_expectations/types/__init__.py" and "great_expectations/core/util.py"
    modules make this refactoring infeasible at the present time.
    """
    serializable: dict = self.to_json_dict()
    deep_filter_properties_iterable(
        properties=serializable,
        inplace=True,
    )
    return json.dumps(serializable, indent=2)
def to_json_dict(self) -> dict:
    """Serialize this Domain to a JSON-compatible dict with falsy entries removed."""
    details: dict = {}

    key: str
    value: Any
    for key, value in self["details"].items():
        if not value:
            continue

        if key == INFERRED_SEMANTIC_TYPE_KEY:
            # Normalize semantic types per column: strings are coerced through
            # the SemanticDomainTypes enum; enum members yield their value.
            column_name: str
            semantic_type: Union[str, SemanticDomainTypes]
            value = {
                column_name: (
                    SemanticDomainTypes(semantic_type.lower()).value
                    if isinstance(semantic_type, str)
                    else semantic_type.value
                )
                for column_name, semantic_type in value.items()
            }

        details[key] = convert_to_json_serializable(data=value)

    serialized: dict = convert_to_json_serializable(
        data={
            "domain_type": self["domain_type"].value,
            "domain_kwargs": self["domain_kwargs"].to_json_dict(),
            "details": details,
            "rule_name": self["rule_name"],
        }
    )
    return deep_filter_properties_iterable(properties=serialized, clean_falsy=True)
def get_column_pair_expectations(self) -> List[ExpectationConfiguration]:
    """Return a list of column_pair map expectations."""
    column_pair_configurations: List[ExpectationConfiguration] = [
        configuration
        for configuration in self.expectations
        if configuration.get_domain_type() == MetricDomainTypes.COLUMN_PAIR
    ]

    configuration: ExpectationConfiguration
    for configuration in column_pair_configurations:
        # Clean falsy kwargs, then re-key so "column_A"/"column_B" lead.
        cleaned_kwargs: dict = deep_filter_properties_iterable(
            properties=configuration.kwargs, clean_falsy=True
        )
        configuration.kwargs = {
            "column_A": cleaned_kwargs.pop("column_A"),
            "column_B": cleaned_kwargs.pop("column_B"),
            **cleaned_kwargs,
        }

    return column_pair_configurations
def _convert_dictionaries_to_domain_kwargs(
    self, source: Optional[Any] = None
) -> Optional[Union[Any, "Domain"]]:
    """Recursively wrap plain dictionaries in DomainKwargs.

    Non-dict values pass through unchanged; dicts that are not already Domain
    instances are cleaned of empty properties and wrapped, then recursed into.
    """
    if source is None:
        return None

    if not isinstance(source, dict):
        return source

    if not isinstance(source, Domain):
        deep_filter_properties_iterable(properties=source, inplace=True)
        source = DomainKwargs(source)

    key: str
    value: Any
    for key, value in source.items():
        source[key] = self._convert_dictionaries_to_domain_kwargs(source=value)

    return source
def test_deep_filter_properties_iterable_on_batch_request_dict():
    """Empty/None entries must be stripped from a batch_request dict in place."""
    batch_request: dict = {
        "datasource_name": "df78ebde1957385a02d8736cd2c9a6d9",
        "data_connector_name": "123a3221fc4b65014d061cce4a71782e",
        "data_asset_name": "eac128c5824b698c22b441ada61022d4",
        "batch_spec_passthrough": {},
        "data_connector_query": {"batch_filter_parameters": {}},
        "limit": None,
    }

    deep_filter_properties_iterable(
        properties=batch_request,
        clean_nulls=True,
        clean_falsy=True,
        inplace=True,
    )

    expected: dict = {
        "datasource_name": "df78ebde1957385a02d8736cd2c9a6d9",
        "data_connector_name": "123a3221fc4b65014d061cce4a71782e",
        "data_asset_name": "eac128c5824b698c22b441ada61022d4",
    }
    assert batch_request == expected
def test_batch_request_deepcopy():
    """A deepcopy of a RuntimeBatchRequest must serialize identically to the original."""
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
        **{
            "datasource_name": "my_datasource",
            "data_connector_name": "my_runtime_data_connector",
            "data_asset_name": "default_data_asset_name",
            "batch_identifiers": {
                "pipeline_stage_name": "core_processing",
                "airflow_run_id": 1234567890,
            },
            "runtime_parameters": {"batch_data": test_df},
        }
    )

    batch_request_copy: RuntimeBatchRequest = copy.deepcopy(batch_request)

    cleaned_copy = deep_filter_properties_iterable(
        properties=batch_request_copy.to_dict(),
        clean_falsy=True,
    )
    cleaned_original = deep_filter_properties_iterable(
        properties=batch_request.to_dict(),
        clean_falsy=True,
    )
    assert cleaned_copy == cleaned_original
def run(
    self,
    variables: Optional[Dict[str, Any]] = None,
    rules: Optional[Dict[str, Dict[str, Any]]] = None,
    batch_request: Optional[Union[BatchRequestBase, dict]] = None,
    **kwargs: dict,
) -> DataAssistantResult:
    """
    variables: attribute name/value pairs, commonly-used in Builder objects, to modify using "runtime_environment"
    rules: name/(configuration-dictionary) to modify using "runtime_environment"
    kwargs: additional/override directives supplied at runtime

    "kwargs" directives structure:
    {
        "include_column_names": ["column_a", "column_b", "column_c", ...],
        "exclude_column_names": ["column_d", "column_e", "column_f", "column_g", ...],
        ...
    }

    Implementation makes best effort at assigning directives to appropriate "MetricDomainTypes" member.

    Returns:
        DataAssistantResult: The result object for the DataAssistant
    """
    data_assistant_name: str = self._data_assistant_cls.data_assistant_type

    # Obtain a Validator for this run without persisting any expectation suite.
    validator: Validator = get_validator_with_expectation_suite(
        batch_request=batch_request,
        data_context=self._data_context,
        expectation_suite=None,
        expectation_suite_name=None,
        component_name=data_assistant_name,
        persist=False,
    )
    data_assistant: DataAssistant = self._data_assistant_cls(
        name=data_assistant_name,
        validator=validator,
    )

    # Strip empty/None runtime directives, then split into variables-directives
    # and domain-type-directives lists.
    runtime_directives: dict = deep_filter_properties_iterable(properties=kwargs)
    variables_directives_list: List[
        RuntimeEnvironmentVariablesDirectives
    ] = build_variables_directives(**runtime_directives)
    domain_type_directives_list: List[
        RuntimeEnvironmentDomainTypeDirectives
    ] = build_domain_type_directives(**runtime_directives)

    return data_assistant.run(
        variables=variables,
        rules=rules,
        variables_directives_list=variables_directives_list,
        domain_type_directives_list=domain_type_directives_list,
    )
def render_to_disk(self, notebook_file_path: str, **kwargs: dict) -> None:
    """
    Render a notebook to disk from an expectation suite.

    If batch_request dictionary is passed, its properties will override any found in suite citations.
    """
    # noinspection PyTypeChecker
    target_suite: ExpectationSuite = kwargs.get("suite")
    target_batch_request: Optional[Union[str, Dict[str, Any]]] = kwargs.get(
        "batch_request"
    )
    # Drop empty/None properties from the batch_request before rendering.
    deep_filter_properties_iterable(
        properties=target_batch_request,
        inplace=True,
    )
    # noinspection PyTypeChecker
    self.render(
        suite=target_suite,
        batch_request=target_batch_request,
    )
    self.write_notebook_to_disk(
        notebook=self._notebook,
        notebook_file_path=notebook_file_path,
    )
def _anonymize_profiler_run(self, obj: object, **kwargs) -> dict:
    """
    Traverse the entire RuleBasedProfiler configuration structure (as per its formal,
    validated Marshmallow schema) and anonymize every field that can be customized by
    a user (public fields are recorded as their original names).
    """
    assert isinstance(
        obj, RuleBasedProfilerConfig
    ), "ProfilerAnonymizer can only handle objects of type RuleBasedProfilerConfig"
    profiler_config: RuleBasedProfilerConfig = obj

    rules: Dict[str, dict] = profiler_config.rules
    variables: dict = profiler_config.variables or {}

    anonymized_payload: dict = {
        "anonymized_name": self._anonymize_string(profiler_config.name),
        "config_version": profiler_config.config_version,
        "anonymized_rules": self._anonymize_rules(rules=rules),
        "rule_count": len(rules),
        "variable_count": len(variables),
    }
    # Remove falsy entries so the telemetry payload carries only populated fields.
    deep_filter_properties_iterable(
        properties=anonymized_payload,
        clean_falsy=True,
        inplace=True,
    )
    return anonymized_payload
def get_table_expectations(self) -> List[ExpectationConfiguration]:
    """Return a list of table expectations."""
    table_configurations: List[ExpectationConfiguration] = [
        configuration
        for configuration in self.expectations
        if configuration.get_domain_type() == MetricDomainTypes.TABLE
    ]

    configuration: ExpectationConfiguration
    for configuration in table_configurations:
        # Scrub falsy kwargs on each matched configuration.
        configuration.kwargs = deep_filter_properties_iterable(
            properties=configuration.kwargs, clean_falsy=True
        )

    return table_configurations
def render_to_disk(
    self,
    suite: ExpectationSuite,
    notebook_file_path: str,
    batch_request: Optional[
        Union[str, Dict[str, Union[str, int, Dict[str, Any]]]]
    ] = None,
) -> None:
    """
    Render a notebook to disk from an expectation suite.

    If batch_request dictionary is passed, its properties will override any found in suite citations.
    """
    # Scrub empty/None properties from the batch_request before rendering.
    deep_filter_properties_iterable(
        properties=batch_request,
        inplace=True,
    )

    self.render(
        suite=suite,
        batch_request=batch_request,
    )

    self.write_notebook_to_disk(
        notebook=self._notebook,
        notebook_file_path=notebook_file_path,
    )
def get_multicolumn_expectations(self) -> List[ExpectationConfiguration]:
    """Return a list of multicolumn map expectations."""
    multicolumn_configurations: List[ExpectationConfiguration] = [
        configuration
        for configuration in self.expectations
        if configuration.get_domain_type() == MetricDomainTypes.MULTICOLUMN
    ]

    configuration: ExpectationConfiguration
    for configuration in multicolumn_configurations:
        # Clean falsy kwargs, then re-key so "column_list" leads.
        cleaned_kwargs: dict = deep_filter_properties_iterable(
            properties=configuration.kwargs, clean_falsy=True
        )
        configuration.kwargs = {
            "column_list": cleaned_kwargs.pop("column_list"),
            **cleaned_kwargs,
        }

    return multicolumn_configurations
def add_checkpoint(
    data_context: "DataContext",  # noqa: F821
    checkpoint_store: CheckpointStore,
    checkpoint_store_name: str,
    ge_cloud_mode: bool,
    name: str,
    config_version: Optional[Union[int, float]] = None,
    template_name: Optional[str] = None,
    module_name: Optional[str] = None,
    class_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[dict] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    # Next two fields are for LegacyCheckpoint configuration
    validation_operator_name: Optional[str] = None,
    batches: Optional[List[dict]] = None,
    # the following four arguments are used by SimpleCheckpoint
    site_names: Optional[Union[str, List[str]]] = None,
    slack_webhook: Optional[str] = None,
    notify_on: Optional[str] = None,
    notify_with: Optional[Union[str, List[str]]] = None,
    ge_cloud_id: Optional[str] = None,
    expectation_suite_ge_cloud_id: Optional[str] = None,
) -> Union[Checkpoint, LegacyCheckpoint]:
    """Build a Checkpoint from the given configuration arguments and persist it.

    The assembled config is cleaned of falsy entries, instantiated via
    ``instantiate_class_from_config``, and saved to ``checkpoint_store`` under
    either a GeCloudIdentifier (cloud mode) or a ConfigurationIdentifier key.

    Raises:
        ge_exceptions.InvalidConfigError: if batch_data (e.g. a DataFrame) is
            present in ``batch_request`` or in any validation's batch_request,
            since such objects cannot be serialized into the CheckpointStore.

    Returns:
        The newly instantiated checkpoint object.
    """
    checkpoint_config: Union[CheckpointConfig, dict]

    # These checks protect against typed objects (BatchRequest and/or RuntimeBatchRequest) encountered in arguments.
    batch_request = get_batch_request_as_dict(batch_request=batch_request)
    validations = get_validations_with_batch_request_as_dict(validations=validations)

    # DataFrames shouldn't be saved to CheckpointStore
    if batch_request_contains_batch_data(batch_request=batch_request):
        raise ge_exceptions.InvalidConfigError(
            f'batch_data found in batch_request cannot be saved to CheckpointStore "{checkpoint_store_name}"'
        )

    if batch_request_in_validations_contains_batch_data(validations=validations):
        raise ge_exceptions.InvalidConfigError(
            f'batch_data found in validations cannot be saved to CheckpointStore "{checkpoint_store_name}"'
        )

    checkpoint_config = {
        "name": name,
        "config_version": config_version,
        "template_name": template_name,
        "module_name": module_name,
        "class_name": class_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request,
        "action_list": action_list,
        "evaluation_parameters": evaluation_parameters,
        "runtime_configuration": runtime_configuration,
        "validations": validations,
        "profilers": profilers,
        # Next two fields are for LegacyCheckpoint configuration
        "validation_operator_name": validation_operator_name,
        "batches": batches,
        # the following four keys are used by SimpleCheckpoint
        "site_names": site_names,
        "slack_webhook": slack_webhook,
        "notify_on": notify_on,
        "notify_with": notify_with,
        "ge_cloud_id": ge_cloud_id,
        "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
    }

    # Drop unset/falsy keys so only explicitly provided configuration survives.
    checkpoint_config = deep_filter_properties_iterable(
        properties=checkpoint_config,
        clean_falsy=True,
    )
    new_checkpoint: Union[
        Checkpoint, SimpleCheckpoint, LegacyCheckpoint
    ] = instantiate_class_from_config(
        config=checkpoint_config,
        runtime_environment={
            "data_context": data_context,
        },
        config_defaults={
            "module_name": "great_expectations.checkpoint",
        },
    )

    # Choose the store key by deployment mode (cloud resource id vs. config name).
    if ge_cloud_mode:
        key: GeCloudIdentifier = GeCloudIdentifier(
            resource_type="contract", ge_cloud_id=ge_cloud_id
        )
    else:
        key: ConfigurationIdentifier = ConfigurationIdentifier(
            configuration_key=name,
        )

    checkpoint_config = new_checkpoint.get_config()

    checkpoint_ref = checkpoint_store.set(key=key, value=checkpoint_config)
    # In cloud mode the store returns a ref carrying the server-assigned id;
    # propagate it back onto the checkpoint object.
    if isinstance(checkpoint_ref, GeCloudIdAwareRef):
        ge_cloud_id = checkpoint_ref.ge_cloud_id
        new_checkpoint.ge_cloud_id = uuid.UUID(ge_cloud_id)

    return new_checkpoint
def test_checkpoint_config_repr_after_substitution(checkpoint):
    """The resolved checkpoint config must serialize to the expected JSON repr.

    Resolves runtime kwargs against the ``checkpoint`` fixture, cleans and sorts
    the resulting dict, and compares ``json.dumps(..., indent=2)`` output against
    a golden string.
    """
    df: pd.DataFrame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    batch_request_param: dict = {
        "runtime_parameters": {"batch_data": df},
        "batch_identifiers": {"default_identifier_name": "my_simple_df"},
    }

    result_format_param: dict = {"result_format": "SUMMARY"}

    kwargs: dict = {
        "batch_request": batch_request_param,
        "result_format": result_format_param,
    }

    # Matching how this is called in usage_statistics.py (parameter style)
    resolved_runtime_kwargs: dict = (
        CheckpointConfig.resolve_config_using_acceptable_arguments(
            *(checkpoint,), **kwargs
        )
    )

    json_dict: dict = convert_to_json_serializable(data=resolved_runtime_kwargs)
    deep_filter_properties_iterable(
        properties=json_dict,
        inplace=True,
    )

    # Sort top-level keys so the repr is deterministic.
    keys: List[str] = sorted(list(json_dict.keys()))

    key: str
    sorted_json_dict: dict = {key: json_dict[key] for key in keys}

    checkpoint_config_repr: str = json.dumps(sorted_json_dict, indent=2)

    assert (
        checkpoint_config_repr
        == """{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {
    "runtime_parameters": {
      "batch_data": [
        {
          "a": 1,
          "b": 3
        },
        {
          "a": 2,
          "b": 4
        }
      ]
    },
    "batch_identifiers": {
      "default_identifier_name": "my_simple_df"
    }
  },
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "my_checkpoint",
  "profilers": [],
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "example_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
          "batch_data": "<class \'pandas.core.frame.DataFrame\'>"
        },
        "batch_identifiers": {
          "default_identifier_name": "my_simple_df"
        }
      },
      "expectation_suite_name": "test_suite",
      "action_list": [
        {
          "name": "store_validation_result",
          "action": {
            "class_name": "StoreValidationResultAction"
          }
        },
        {
          "name": "store_evaluation_params",
          "action": {
            "class_name": "StoreEvaluationParametersAction"
          }
        },
        {
          "name": "update_data_docs",
          "action": {
            "class_name": "UpdateDataDocsAction",
            "site_names": []
          }
        }
      ]
    }
  ]
}"""
    )
def test_checkpoint_config_deepcopy(
    titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates,
    monkeypatch,
):
    """A deepcopy of a fully-substituted checkpoint config must compare equal to the original after falsy cleanup."""
    # Environment variables referenced by "$VAR"-style substitutions in the templates.
    monkeypatch.setenv("GE_ENVIRONMENT", "my_ge_environment")
    monkeypatch.setenv("VAR", "test")
    monkeypatch.setenv("MY_PARAM", "1")
    monkeypatch.setenv("OLD_PARAM", "2")

    context: DataContext = titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Runtime batch request carrying an in-memory DataFrame as batch data.
    runtime_batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
        **{
            "datasource_name": "my_datasource",
            "data_connector_name": "my_runtime_data_connector",
            "data_asset_name": "default_data_asset_name",
            "batch_identifiers": {
                "pipeline_stage_name": "core_processing",
                "airflow_run_id": 1234567890,
            },
            "runtime_parameters": {"batch_data": test_df},
        }
    )

    # Base checkpoint config referencing a template plus two validations.
    nested_checkpoint_config: CheckpointConfig = CheckpointConfig(
        name="my_nested_checkpoint",
        config_version=1,
        template_name="my_nested_checkpoint_template_2",
        expectation_suite_name="users.delivery",
        validations=[
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_special_data_connector",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -1},
                }
            },
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_other_data_connector",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -2},
                }
            },
        ],
    )
    nested_checkpoint: Checkpoint = Checkpoint(
        data_context=context,
        **filter_properties_dict(
            properties=nested_checkpoint_config.to_json_dict(),
            delete_fields={"class_name", "module_name"},
            clean_falsy=True,
        ),
    )
    # Substitute template values and runtime kwargs into the effective config.
    substituted_config_template_and_runtime_kwargs: dict = nested_checkpoint.get_substituted_config(
        runtime_kwargs={
            "batch_request": runtime_batch_request,
            "expectation_suite_name": "runtime_suite_name",
            "template_name": "my_nested_checkpoint_template_3",
            "validations": [
                {
                    "batch_request": {
                        "datasource_name": "my_datasource",
                        "data_connector_name": "my_other_data_connector_2_runtime",
                        "data_asset_name": "users",
                        "data_connector_query": {"partition_index": -3},
                    }
                },
                {
                    "batch_request": {
                        "datasource_name": "my_datasource",
                        "data_connector_name": "my_other_data_connector_3_runtime",
                        "data_asset_name": "users",
                        "data_connector_query": {"partition_index": -4},
                    }
                },
            ],
            "run_name_template": "runtime_run_template",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "MyCustomRuntimeStoreEvaluationParametersAction",
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": None,
                },
                {
                    "name": "update_data_docs_deluxe_runtime",
                    "action": {
                        "class_name": "UpdateDataDocsAction",
                    },
                },
            ],
            "evaluation_parameters": {
                "environment": "runtime-$GE_ENVIRONMENT",
                "tolerance": 1.0e-2,
                "aux_param_0": "runtime-$MY_PARAM",
                "aux_param_1": "1 + $MY_PARAM",
                "new_runtime_eval_param": "bloopy!",
            },
            "runtime_configuration": {
                "result_format": "BASIC",
                "partial_unexpected_count": 999,
                "new_runtime_config_key": "bleepy!",
            },
        }
    )

    checkpoint_config_copy: dict = copy.deepcopy(
        substituted_config_template_and_runtime_kwargs
    )
    # Cleaned copy and cleaned original must be identical.
    assert deep_filter_properties_iterable(
        properties=checkpoint_config_copy,
        clean_falsy=True,
    ) == deep_filter_properties_iterable(
        properties=substituted_config_template_and_runtime_kwargs,
        clean_falsy=True,
    )
def _anonymize_checkpoint_run(self, obj: object, **kwargs) -> dict:
    """
    Traverse the entire Checkpoint configuration structure (as per its formal,
    validated Marshmallow schema) and anonymize every field that can be customized
    by a user (public fields are recorded as their original names).

    Returns:
        A dictionary of anonymized checkpoint-run properties with falsy entries
        removed.
    """
    attribute_name: str
    attribute_value: Optional[Union[str, dict]]
    validation_obj: dict

    checkpoint_optional_top_level_keys: List[str] = []

    name: Optional[str] = kwargs.get("name")
    anonymized_name: Optional[str] = self._anonymize_string(name)

    config_version: Optional[Union[Number, str]] = kwargs.get("config_version")
    if config_version is None:
        config_version = 1.0

    template_name: Optional[str] = kwargs.get("template_name")
    anonymized_template_name: Optional[str] = self._anonymize_string(template_name)

    run_name_template: Optional[str] = kwargs.get("run_name_template")
    anonymized_run_name_template: Optional[str] = self._anonymize_string(
        run_name_template
    )

    expectation_suite_name: Optional[str] = kwargs.get("expectation_suite_name")
    anonymized_expectation_suite_name: Optional[str] = self._anonymize_string(
        expectation_suite_name
    )

    batch_request: Optional[
        Union[BatchRequest, RuntimeBatchRequest, dict]
    ] = kwargs.get("batch_request")
    if batch_request is None:
        batch_request = {}

    anonymized_batch_request: Optional[
        Dict[str, List[str]]
    ] = self._aggregate_anonymizer.anonymize(*(), **batch_request)

    action_list: Optional[List[dict]] = kwargs.get("action_list")
    anonymized_action_list: Optional[List[dict]] = None
    if action_list:
        # Best-effort: any failure while anonymizing actions is logged, not raised.
        # noinspection PyBroadException
        try:
            anonymized_action_list = [
                self._aggregate_anonymizer.anonymize(
                    action_name=action_config_dict["name"],
                    action_config=action_config_dict["action"],
                )
                for action_config_dict in action_list
            ]
        except Exception:
            logger.debug(
                "anonymize_checkpoint_run: Unable to create anonymized_action_list payload field"
            )

    validations: Optional[List[dict]] = kwargs.get("validations")
    anonymized_validations: Optional[List[dict]] = []
    if validations:
        for validation_obj in validations:
            validation_batch_request: Optional[
                Union[BatchRequest, RuntimeBatchRequest, dict]
            ] = validation_obj.get("batch_request")
            if validation_batch_request is None:
                validation_batch_request = {}

            # Normalize typed batch_request objects into plain dictionaries.
            validation_batch_request = get_batch_request_as_dict(
                batch_request=validation_batch_request
            )

            anonymized_validation_batch_request: Optional[
                Dict[str, List[str]]
            ] = self._aggregate_anonymizer.anonymize(*(), **validation_batch_request)

            validation_expectation_suite_name: Optional[str] = validation_obj.get(
                "expectation_suite_name"
            )
            anonymized_validation_expectation_suite_name: Optional[
                str
            ] = self._anonymize_string(validation_expectation_suite_name)

            validation_action_list: Optional[List[dict]] = validation_obj.get(
                "action_list"
            )
            anonymized_validation_action_list: Optional[List[dict]] = None
            if validation_action_list:
                # noinspection PyBroadException
                try:
                    anonymized_validation_action_list = [
                        self._aggregate_anonymizer.anonymize(
                            action_name=action_config_dict["name"],
                            action_config=action_config_dict["action"],
                        )
                        for action_config_dict in validation_action_list
                    ]
                except Exception:
                    logger.debug(
                        "anonymize_checkpoint_run: Unable to create anonymized_validation_action_list payload field"
                    )

            # BUG FIX (dead code removal): the previous implementation first built
            # a filtered dictionary containing only truthy fields and then
            # unconditionally overwrote it with this full dictionary, making the
            # filtered construction unreachable. Only the unconditional assignment
            # was observable, so it alone is kept; falsy entries are removed later
            # by deep_filter_properties_iterable on the whole payload.
            anonymized_validation: Dict[str, Dict[str, Any]] = {
                "anonymized_batch_request": anonymized_validation_batch_request,
                "anonymized_expectation_suite_name": anonymized_validation_expectation_suite_name,
                "anonymized_action_list": anonymized_validation_action_list,
            }

            anonymized_validations.append(anonymized_validation)

    run_id: Optional[Union[str, RunIdentifier]] = kwargs.get("run_id")
    anonymized_run_id: Optional[Union[str, RunIdentifier]]
    if run_id is None:
        anonymized_run_id = None
    else:
        anonymized_run_id = self._anonymize_string(str(run_id))

    run_name: Optional[str] = kwargs.get("run_name")
    anonymized_run_name: Optional[str]
    if run_name is None:
        anonymized_run_name = None
    else:
        anonymized_run_name = self._anonymize_string(run_name)

    run_time: Optional[Union[str, datetime.datetime]] = kwargs.get("run_time")
    anonymized_run_time: Optional[str]
    if run_time is None:
        anonymized_run_time = None
    else:
        anonymized_run_time = self._anonymize_string(str(run_time))

    expectation_suite_ge_cloud_id: Optional[str] = kwargs.get(
        "expectation_suite_ge_cloud_id"
    )
    anonymized_expectation_suite_ge_cloud_id: Optional[str]
    if expectation_suite_ge_cloud_id is None:
        anonymized_expectation_suite_ge_cloud_id = None
    else:
        anonymized_expectation_suite_ge_cloud_id = self._anonymize_string(
            str(expectation_suite_ge_cloud_id)
        )

    # Record which optional top-level keys were actually supplied (names only).
    for attribute_name in sorted(CHECKPOINT_OPTIONAL_TOP_LEVEL_KEYS):
        attribute_value = kwargs.get(attribute_name)
        if attribute_value:
            checkpoint_optional_top_level_keys.append(attribute_name)

    anonymized_checkpoint_run_properties_dict: Dict[str, List[str]] = {
        "anonymized_name": anonymized_name,
        "config_version": config_version,
        "anonymized_template_name": anonymized_template_name,
        "anonymized_run_name_template": anonymized_run_name_template,
        "anonymized_expectation_suite_name": anonymized_expectation_suite_name,
        "anonymized_batch_request": anonymized_batch_request,
        "anonymized_action_list": anonymized_action_list,
        "anonymized_validations": anonymized_validations,
        "anonymized_run_id": anonymized_run_id,
        "anonymized_run_name": anonymized_run_name,
        "anonymized_run_time": anonymized_run_time,
        "anonymized_expectation_suite_ge_cloud_id": anonymized_expectation_suite_ge_cloud_id,
        "checkpoint_optional_top_level_keys": checkpoint_optional_top_level_keys,
    }

    deep_filter_properties_iterable(
        properties=anonymized_checkpoint_run_properties_dict,
        clean_falsy=True,
        inplace=True,
    )

    return anonymized_checkpoint_run_properties_dict
def test_deep_filter_properties_iterable():
    """deep_filter_properties_iterable drops nulls/empty containers at all depths;
    keep_falsy_numerics controls whether numeric zero survives the cleanup."""
    source_dict: dict = {
        "integer_zero": 0,
        "null": None,
        "string": "xyz_0",
        "integer_one": 1,
        "scientific_notation_floating_point_number": 9.8e1,
        "empty_top_level_dictionary": {},
        "empty_top_level_list": [],
        "empty_top_level_set": set(),
        "non_empty_top_level_set": {
            0,
            1,
            2,
            "a",
            "b",
            "c",
        },
        "non_empty_top_level_dictionary": {
            "empty_1st_level_list": [],
            "empty_1st_level_set": set(),
            # NOTE(review): despite "set" in its key name, this literal has
            # key/value pairs and is therefore a dict — presumably intentional;
            # confirm against the utility's docs/usage.
            "non_empty_1st_level_set": {
                "empty_2nd_level_list": [],
                "non_empty_2nd_level_list": [
                    0,
                    1,
                    2,
                    "a",
                    "b",
                    "c",
                ],
                "non_empty_2nd_level_dictionary": {
                    "integer_zero": 0,
                    "null": None,
                    "string": "xyz_0",
                    "integer_one": 1,
                    "scientific_notation_floating_point_number": 9.8e1,
                },
                "empty_2nd_level_dictionary": {},
            },
        },
    }

    # Case 0: in-place cleanup with default keep_falsy_numerics (zeros retained).
    d0_begin: dict = copy.deepcopy(source_dict)
    deep_filter_properties_iterable(
        properties=d0_begin,
        clean_falsy=True,
        inplace=True,
    )
    d0_end: dict = d0_begin

    d0_end_expected: dict = {
        "integer_zero": 0,
        "string": "xyz_0",
        "integer_one": 1,
        "scientific_notation_floating_point_number": 98.0,
        "non_empty_top_level_set": {
            0,
            1,
            2,
            "a",
            "b",
            "c",
        },
        "non_empty_top_level_dictionary": {
            "non_empty_1st_level_set": {
                "non_empty_2nd_level_list": [0, 1, 2, "a", "b", "c"],
                "non_empty_2nd_level_dictionary": {
                    "integer_zero": 0,
                    "string": "xyz_0",
                    "integer_one": 1,
                    "scientific_notation_floating_point_number": 98.0,
                },
            }
        },
    }
    assert d0_end == d0_end_expected

    # Case 1: returned (non-inplace) cleanup with keep_falsy_numerics=False
    # (zeros dropped along with other falsy values).
    d1_begin: dict = copy.deepcopy(source_dict)
    d1_end: dict = deep_filter_properties_iterable(
        properties=d1_begin,
        clean_falsy=True,
        keep_falsy_numerics=False,
    )

    d1_end_expected: dict = {
        "string": "xyz_0",
        "integer_one": 1,
        "scientific_notation_floating_point_number": 98.0,
        "non_empty_top_level_set": {
            0,
            1,
            2,
            "a",
            "b",
            "c",
        },
        "non_empty_top_level_dictionary": {
            "non_empty_1st_level_set": {
                "non_empty_2nd_level_list": [0, 1, 2, "a", "b", "c"],
                "non_empty_2nd_level_dictionary": {
                    "string": "xyz_0",
                    "integer_one": 1,
                    "scientific_notation_floating_point_number": 98.0,
                },
            }
        },
    }
    assert d1_end == d1_end_expected
def anonymize(self, obj: Optional[object] = None, **kwargs) -> Any:
    """Anonymize a batch_request assembled from **kwargs.

    Best-effort: returns a dictionary of anonymized batch_request properties, or
    None if construction/anonymization failed (all errors are swallowed and
    logged at debug level).
    """
    anonymized_batch_request_properties_dict: Optional[Dict[str, List[str]]] = None
    # noinspection PyBroadException
    try:
        # Local import — presumably to avoid a circular import with
        # great_expectations.core.batch; confirm before relocating to module level.
        from great_expectations.core.batch import (
            BatchRequest,
            get_batch_request_from_acceptable_arguments,
            standardize_batch_request_display_ordering,
        )

        batch_request: BatchRequest = get_batch_request_from_acceptable_arguments(
            **kwargs
        )
        batch_request_dict: dict = batch_request.to_json_dict()

        # Anonymize the raw properties, then normalize key ordering and drop
        # falsy entries.
        anonymized_batch_request_dict: Optional[
            Union[str, dict]
        ] = self._anonymize_batch_request_properties(source=batch_request_dict)
        anonymized_batch_request_dict = standardize_batch_request_display_ordering(
            batch_request=anonymized_batch_request_dict
        )
        deep_filter_properties_iterable(
            properties=anonymized_batch_request_dict,
            clean_falsy=True,
            inplace=True,
        )

        # Containers populated (by reference) by _build_anonymized_batch_request.
        anonymized_batch_request_required_top_level_properties: dict = {}
        batch_request_optional_top_level_keys: List[str] = []
        batch_spec_passthrough_keys: List[str] = []
        data_connector_query_keys: List[str] = []
        runtime_parameters_keys: List[str] = []

        anonymized_batch_request_properties_dict = {
            "anonymized_batch_request_required_top_level_properties": (
                anonymized_batch_request_required_top_level_properties
            ),
            "batch_request_optional_top_level_keys": batch_request_optional_top_level_keys,
            "batch_spec_passthrough_keys": batch_spec_passthrough_keys,
            "runtime_parameters_keys": runtime_parameters_keys,
            "data_connector_query_keys": data_connector_query_keys,
        }
        self._build_anonymized_batch_request(
            destination=anonymized_batch_request_properties_dict,
            source=anonymized_batch_request_dict,
        )
        deep_filter_properties_iterable(
            properties=anonymized_batch_request_properties_dict,
            clean_falsy=True,
            inplace=True,
        )
        # Sort in place: these lists are shared by reference with the payload
        # dict (when they survived the falsy cleanup above).
        batch_request_optional_top_level_keys.sort()
        batch_spec_passthrough_keys.sort()
        data_connector_query_keys.sort()
        runtime_parameters_keys.sort()
    except Exception:
        logger.debug(
            "anonymize_batch_request: Unable to create anonymized_batch_request payload field"
        )

    return anonymized_batch_request_properties_dict
def run(
    self,
    batch_request: Optional[Union[BatchRequestBase, dict]] = None,
    **kwargs: dict,
) -> DataAssistantResult:
    """Execute the DataAssistant against the supplied batch_request.

    Args:
        batch_request: Explicit batch_request used to supply data at runtime
            (required; an exception is raised when it is missing).
        kwargs: additional/override directives supplied at runtime (via
            "runtime_environment"), e.g. "DomainBuilder" parameters such as
            include_column_names/exclude_column_names, and per-Rule "variables"
            settings (numeric_columns_rule, datetime_columns_rule,
            categorical_columns_rule, ...). Implementation makes best effort at
            assigning directives to the appropriate "MetricDomainTypes" member.

    Raises:
        ge_exceptions.DataAssistantExecutionError: when "batch_request" is None.

    Returns:
        DataAssistantResult: The result object for the DataAssistant
    """
    if batch_request is None:
        data_assistant_name: str = self._data_assistant_cls.data_assistant_type
        raise ge_exceptions.DataAssistantExecutionError(
            message=f"""Utilizing "{data_assistant_name}.run()" requires valid "batch_request" to be \
specified (empty or missing "batch_request" detected)."""
        )

    data_assistant: DataAssistant = self._build_data_assistant(
        batch_request=batch_request
    )

    # Strip empty/None runtime directives, then split them into the two
    # directive lists the assistant consumes.
    runtime_directives: dict = deep_filter_properties_iterable(properties=kwargs)
    variables_directives_list: List[
        RuntimeEnvironmentVariablesDirectives
    ] = build_variables_directives(**runtime_directives)
    domain_type_directives_list: List[
        RuntimeEnvironmentDomainTypeDirectives
    ] = build_domain_type_directives(**runtime_directives)

    return data_assistant.run(
        variables_directives_list=variables_directives_list,
        domain_type_directives_list=domain_type_directives_list,
    )
def test_reconcile_profiler_rules_new_rule_override(
    profiler_with_placeholder_args,
):
    """Reconciling with an override that introduces a brand-new rule ("rule_0").

    Expectation: the reconciled result contains both the new "rule_0" (with builder
    defaults filled in by reconciliation) and the fixture profiler's pre-existing
    "rule_1", unchanged.
    """
    # Runtime override: a rule name ("rule_0") not present in the fixture profiler's
    # configuration, with fully-specified builder class/module paths.
    rules: Dict[str, Dict[str, Any]] = {
        "rule_0": {
            "domain_builder": {
                "class_name": "ColumnDomainBuilder",
                "module_name": "great_expectations.rule_based_profiler.domain_builder",
            },
            "parameter_builders": [
                {
                    "class_name": "MetricMultiBatchParameterBuilder",
                    "module_name": "great_expectations.rule_based_profiler.parameter_builder",
                    "name": "my_parameter",
                    "metric_name": "my_metric",
                },
                {
                    "class_name": "NumericMetricRangeMultiBatchParameterBuilder",
                    "module_name": "great_expectations.rule_based_profiler.parameter_builder",
                    "name": "my_other_parameter",
                    "metric_name": "my_other_metric",
                },
            ],
            "expectation_configuration_builders": [
                {
                    "class_name": "DefaultExpectationConfigurationBuilder",
                    "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                    "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    "column_A": "$domain.domain_kwargs.column_A",
                    "column_B": "$domain.domain_kwargs.column_B",
                    "my_one_arg": "$parameter.my_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_parameter_estimator": "$parameter.my_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
                {
                    "class_name": "DefaultExpectationConfigurationBuilder",
                    "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                    "expectation_type": "expect_column_min_to_be_between",
                    "column": "$domain.domain_kwargs.column",
                    "my_another_arg": "$parameter.my_other_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_other_parameter_estimator": "$parameter.my_other_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
            ],
        },
    }

    # Expected post-reconciliation configs: "rule_0" carries builder defaults
    # (e.g. "enforce_numeric_metric", "sampling_method", "false_positive_rate")
    # injected by reconciliation; "rule_1" is the fixture's original rule.
    expected_rules: List[dict] = [
        {
            "name": "rule_0",
            "domain_builder": {},
            "parameter_builders": [
                {
                    "name": "my_parameter",
                    "metric_name": "my_metric",
                    "enforce_numeric_metric": False,
                    "replace_nan_with_zero": False,
                    "reduce_scalar_metric": True,
                },
                {
                    "name": "my_other_parameter",
                    "metric_name": "my_other_metric",
                    "sampling_method": "bootstrap",
                    "enforce_numeric_metric": True,
                    "replace_nan_with_zero": True,
                    "reduce_scalar_metric": True,
                    "false_positive_rate": 0.05,
                    "truncate_values": {},
                },
            ],
            "expectation_configuration_builders": [
                {
                    "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    "column_A": "$domain.domain_kwargs.column_A",
                    "column_B": "$domain.domain_kwargs.column_B",
                    "my_one_arg": "$parameter.my_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_parameter_estimator": "$parameter.my_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
                {
                    "expectation_type": "expect_column_min_to_be_between",
                    "column": "$domain.domain_kwargs.column",
                    "my_another_arg": "$parameter.my_other_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_other_parameter_estimator": "$parameter.my_other_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
            ],
        },
        {
            "name": "rule_1",
            "domain_builder": {},
            "parameter_builders": [
                {
                    "name": "my_parameter",
                    "metric_name": "my_metric",
                    "enforce_numeric_metric": False,
                    "replace_nan_with_zero": False,
                    "reduce_scalar_metric": True,
                },
            ],
            "expectation_configuration_builders": [
                {
                    "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    "column_A": "$domain.domain_kwargs.column_A",
                    "column_B": "$domain.domain_kwargs.column_B",
                    "my_arg": "$parameter.my_parameter.value[0]",
                    "my_other_arg": "$parameter.my_parameter.value[1]",
                    "meta": {
                        "details": {
                            "my_parameter_estimator": "$parameter.my_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
            ],
        },
    ]

    effective_rules: List[
        Rule
    ] = profiler_with_placeholder_args.reconcile_profiler_rules(rules=rules)

    # Compare as {rule_name: json_config} maps; deep-filter drops empty/None entries
    # so the actual configs line up with the hand-written expected literals.
    rule: Rule
    effective_rule_configs_actual: dict = {
        rule.name: rule.to_json_dict() for rule in effective_rules
    }
    deep_filter_properties_iterable(effective_rule_configs_actual, inplace=True)

    rule_config: dict
    effective_rule_configs_expected: dict = {
        rule_config["name"]: rule_config for rule_config in expected_rules
    }

    assert effective_rule_configs_actual == effective_rule_configs_expected
def get_batch_request_from_acceptable_arguments(
    datasource_name: Optional[str] = None,
    data_connector_name: Optional[str] = None,
    data_asset_name: Optional[str] = None,
    *,
    batch_request: Optional[BatchRequestBase] = None,
    batch_data: Optional[Any] = None,
    data_connector_query: Optional[dict] = None,
    batch_identifiers: Optional[dict] = None,
    limit: Optional[int] = None,
    index: Optional[Union[int, list, tuple, slice, str]] = None,
    custom_filter_function: Optional[Callable] = None,
    batch_spec_passthrough: Optional[dict] = None,
    sampling_method: Optional[str] = None,
    sampling_kwargs: Optional[dict] = None,
    splitter_method: Optional[str] = None,
    splitter_kwargs: Optional[dict] = None,
    runtime_parameters: Optional[dict] = None,
    query: Optional[str] = None,
    path: Optional[str] = None,
    batch_filter_parameters: Optional[dict] = None,
    **kwargs,
) -> Union[BatchRequest, RuntimeBatchRequest]:
    """Obtain formal BatchRequest typed object from allowed attributes (supplied as arguments).

    This method applies only to the new (V3) Datasource schema.

    If any of "batch_data", "query", "path", or "runtime_parameters" is supplied, a
    RuntimeBatchRequest is built; otherwise a standard BatchRequest is built from the
    datasource/data-connector/data-asset triple plus a data_connector_query. At most one
    of "batch_data", "query", and "path" may be given, and none may also appear as a key
    inside "runtime_parameters".

    Args:
        datasource_name: Name of the datasource; must be a str (validated below).
        data_connector_name: Name of the data connector on the datasource.
        data_asset_name: Name of the data asset.
        batch_request: Pre-built BatchRequest/RuntimeBatchRequest; if given, it is
            returned as-is after type validation and every other argument is ignored.
        batch_data: In-memory data placed into runtime_parameters["batch_data"].
        query: Query string placed into runtime_parameters["query"].
        path: Path placed into runtime_parameters["path"].
        runtime_parameters: Explicit runtime parameters; must not duplicate the
            batch_data/query/path keys supplied as direct arguments.
        data_connector_query: Pre-assembled query dict; when given, the
            batch_filter_parameters/limit/index/custom_filter_function arguments are
            not used to build one.
        batch_identifiers: Runtime batch identifiers (runtime case), or a deprecated
            alias for "batch_filter_parameters" (non-runtime case; logs a warning).
        batch_filter_parameters: Filter parameters placed into data_connector_query.
        limit: Placed into data_connector_query (presumably caps returned batches --
            semantics live in the data connector; verify there).
        index: Placed into data_connector_query.
        custom_filter_function: Placed into data_connector_query.
        sampling_method: Merged into batch_spec_passthrough as "sampling_method".
        sampling_kwargs: Merged into batch_spec_passthrough alongside sampling_method.
        splitter_method: Merged into batch_spec_passthrough as "splitter_method".
        splitter_kwargs: Merged into batch_spec_passthrough alongside splitter_method.
        batch_spec_passthrough: Dict passed through on the resulting batch request.
        **kwargs: Fallback source for batch_identifiers (runtime case) or
            batch_filter_parameters (non-runtime case) when neither is given explicitly.

    Raises:
        TypeError: If "batch_request" is not a (Runtime)BatchRequest instance.
        GreatExpectationsTypeError: If "datasource_name" is not a str.
        ValueError: If more than one of batch_data/query/path is given, or one of them
            also appears in runtime_parameters, or both batch_filter_parameters and
            batch_identifiers are given in the non-runtime case.

    Returns:
        (BatchRequest or RuntimeBatchRequest) The formal BatchRequest or RuntimeBatchRequest object
    """
    if batch_request:
        if not isinstance(batch_request, (BatchRequest, RuntimeBatchRequest)):
            raise TypeError(
                f"""batch_request must be an instance of BatchRequest or RuntimeBatchRequest object, not \
{type(batch_request)}"""
            )
        # Adopt the datasource_name from the formal request so the str check below passes.
        datasource_name = batch_request.datasource_name

    # ensure that the first parameter is datasource_name, which should be a str. This check prevents users
    # from passing in batch_request as an unnamed parameter.
    if not isinstance(datasource_name, str):
        raise ge_exceptions.GreatExpectationsTypeError(
            f"the first parameter, datasource_name, must be a str, not {type(datasource_name)}"
        )

    # batch_data/query/path are mutually exclusive ways of supplying runtime data.
    if len([arg for arg in [batch_data, query, path] if arg is not None]) > 1:
        raise ValueError("Must provide only one of batch_data, query, or path.")

    # Reject the same runtime key being supplied both directly and via runtime_parameters.
    if any(
        [
            batch_data is not None
            and runtime_parameters
            and "batch_data" in runtime_parameters,
            query and runtime_parameters and "query" in runtime_parameters,
            path and runtime_parameters and "path" in runtime_parameters,
        ]
    ):
        raise ValueError(
            "If batch_data, query, or path arguments are provided, the same keys cannot appear in the "
            "runtime_parameters argument."
        )

    if batch_request:
        # TODO: Raise a warning if any parameters besides batch_requests are specified
        return batch_request

    batch_request_class: type
    batch_request_as_dict: dict

    # Any runtime-data signal selects the RuntimeBatchRequest path.
    if any([batch_data is not None, query, path, runtime_parameters]):
        batch_request_class = RuntimeBatchRequest

        runtime_parameters = runtime_parameters or {}
        if batch_data is not None:
            runtime_parameters["batch_data"] = batch_data
        elif query is not None:
            runtime_parameters["query"] = query
        elif path is not None:
            runtime_parameters["path"] = path

        # Leftover **kwargs double as batch_identifiers when none were given explicitly.
        if batch_identifiers is None:
            batch_identifiers = kwargs
        else:
            # Raise a warning if kwargs exist
            pass

        batch_request_as_dict = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": data_asset_name,
            "runtime_parameters": runtime_parameters,
            "batch_identifiers": batch_identifiers,
            "batch_spec_passthrough": batch_spec_passthrough,
        }
    else:
        batch_request_class = BatchRequest

        if data_connector_query is None:
            if batch_filter_parameters is not None and batch_identifiers is not None:
                raise ValueError(
                    'Must provide either "batch_filter_parameters" or "batch_identifiers", not both.'
                )

            # Back-compat: "batch_identifiers" is the old name for "batch_filter_parameters".
            if batch_filter_parameters is None and batch_identifiers is not None:
                logger.warning(
                    'Attempting to build data_connector_query but "batch_identifiers" was provided '
                    'instead of "batch_filter_parameters". The "batch_identifiers" key on '
                    'data_connector_query has been renamed to "batch_filter_parameters". Please update '
                    'your code. Falling back on provided "batch_identifiers".'
                )
                batch_filter_parameters = batch_identifiers
            elif batch_filter_parameters is None and batch_identifiers is None:
                # Leftover **kwargs double as batch_filter_parameters.
                batch_filter_parameters = kwargs
            else:
                # Raise a warning if kwargs exist
                pass

            data_connector_query_params: dict = {
                "batch_filter_parameters": batch_filter_parameters,
                "limit": limit,
                "index": index,
                "custom_filter_function": custom_filter_function,
            }
            data_connector_query = IDDict(data_connector_query_params)
        else:
            # Raise a warning if batch_filter_parameters or kwargs exist
            data_connector_query = IDDict(data_connector_query)

        if batch_spec_passthrough is None:
            batch_spec_passthrough = {}

        # Fold sampling/splitter directives into batch_spec_passthrough.
        if sampling_method is not None:
            sampling_params: dict = {
                "sampling_method": sampling_method,
            }
            if sampling_kwargs is not None:
                sampling_params["sampling_kwargs"] = sampling_kwargs
            batch_spec_passthrough.update(sampling_params)

        if splitter_method is not None:
            splitter_params: dict = {
                "splitter_method": splitter_method,
            }
            if splitter_kwargs is not None:
                splitter_params["splitter_kwargs"] = splitter_kwargs
            batch_spec_passthrough.update(splitter_params)

        batch_request_as_dict: dict = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": data_asset_name,
            "data_connector_query": data_connector_query,
            "batch_spec_passthrough": batch_spec_passthrough,
        }

    # Drop empty/None entries before constructing the formal request object.
    deep_filter_properties_iterable(
        properties=batch_request_as_dict,
        inplace=True,
    )

    batch_request = batch_request_class(**batch_request_as_dict)

    return batch_request
def test_reconcile_profiler_rules_existing_rule_domain_builder_override(
    profiler_with_placeholder_args,
):
    """Overriding only the "domain_builder" of the existing "rule_1".

    Expectation: reconciliation swaps in the new domain builder settings while the
    rule's parameter builders and expectation configuration builders stay intact.
    """
    rule_overrides: Dict[str, Dict[str, Any]] = {
        "rule_1": {
            "domain_builder": {
                "class_name": "SimpleColumnSuffixDomainBuilder",
                "module_name": "great_expectations.rule_based_profiler.domain_builder",
                "column_name_suffixes": [
                    "_ts",
                ],
            },
        },
    }

    expected_rules: List[dict] = [
        {
            "name": "rule_1",
            "domain_builder": {
                "column_name_suffixes": [
                    "_ts",
                ],
            },
            "parameter_builders": [
                {
                    "name": "my_parameter",
                    "metric_name": "my_metric",
                    "enforce_numeric_metric": False,
                    "replace_nan_with_zero": False,
                    "reduce_scalar_metric": True,
                },
            ],
            "expectation_configuration_builders": [
                {
                    "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    "column_A": "$domain.domain_kwargs.column_A",
                    "column_B": "$domain.domain_kwargs.column_B",
                    "my_arg": "$parameter.my_parameter.value[0]",
                    "my_other_arg": "$parameter.my_parameter.value[1]",
                    "meta": {
                        "details": {
                            "my_parameter_estimator": "$parameter.my_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
            ],
        },
    ]

    reconciled_rules: List[
        Rule
    ] = profiler_with_placeholder_args.reconcile_profiler_rules(rules=rule_overrides)

    # Serialize each reconciled rule and strip empty/None entries so the result is
    # directly comparable with the hand-written expected literals above.
    actual_rule_configs: dict = {}
    reconciled_rule: Rule
    for reconciled_rule in reconciled_rules:
        actual_rule_configs[reconciled_rule.name] = reconciled_rule.to_json_dict()
    deep_filter_properties_iterable(actual_rule_configs, inplace=True)

    expected_rule_configs: dict = {
        expected_config["name"]: expected_config for expected_config in expected_rules
    }

    assert actual_rule_configs == expected_rule_configs