def wrapper(metric_fn: Callable):
    """Wrap ``metric_fn`` and attach metric metadata to the wrapper.

    The returned callable forwards every call unchanged to ``metric_fn``.
    It additionally carries four attributes: ``metric_engine``,
    ``metric_fn_type``, ``domain_type`` and ``metric_definition_kwargs``.

    NOTE(review): ``engine``, ``partial_fn_type``, ``domain_type`` and
    ``kwargs`` are free variables here -- presumably supplied by an
    enclosing decorator factory that is not visible in this block.

    Args:
        metric_fn: The metric implementation being decorated.

    Returns:
        The wrapping function, tagged with the attributes listed above.
    """

    @wraps(metric_fn)
    def inner_func(*call_args, **call_kwargs):
        # Pure pass-through; the wrapper exists only to carry the metadata.
        return metric_fn(*call_args, **call_kwargs)

    metadata = {
        "metric_engine": engine,
        # raises ValueError if unknown type
        "metric_fn_type": MetricPartialFunctionTypes(partial_fn_type),
        "domain_type": MetricDomainTypes(domain_type),
        # ``kwargs`` is the enclosing scope's dictionary, not the call-time
        # keyword arguments received by ``inner_func``.
        "metric_definition_kwargs": kwargs,
    }
    for attribute_name, attribute_value in metadata.items():
        setattr(inner_func, attribute_name, attribute_value)

    return inner_func
def __init__(
    self,
    domain_type: Union[str, MetricDomainTypes],
    domain_kwargs: Optional[Union[Dict[str, Any], DomainKwargs]] = None,
    details: Optional[Dict[str, Any]] = None,
    rule_name: Optional[str] = None,
) -> None:
    """Validate the arguments and initialize the Domain.

    Args:
        domain_type: A ``MetricDomainTypes`` member, or a string that is
            converted with ``MetricDomainTypes(domain_type)``.
        domain_kwargs: Domain kwargs as a plain dictionary or a
            ``DomainKwargs`` instance; ``None`` is treated as empty.
        details: Optional extra information; ``None`` is treated as an
            empty dictionary. When it holds a non-empty value under
            ``INFERRED_SEMANTIC_TYPE_KEY``, every key of that value must
            occur among the values of ``domain_kwargs`` (either equal to a
            value, or contained in a list/set/tuple value).
        rule_name: Optional rule name, forwarded to the parent initializer.

    Raises:
        ValueError: If ``domain_type`` is an unsupported string or is
            neither a string nor a ``MetricDomainTypes`` member, or if the
            inferred semantic types are inconsistent with ``domain_kwargs``.
    """
    if isinstance(domain_type, str):
        try:
            domain_type = MetricDomainTypes(domain_type)
        # Looking up an Enum member by an unknown value raises ValueError,
        # so it must be caught here for the descriptive message to be used.
        except (TypeError, KeyError, ValueError) as e:
            raise ValueError(
                f""" {e}: Cannot instantiate Domain (domain_type "{str(domain_type)}" of type \
"{str(type(domain_type))}" is not supported).
"""
            ) from e
    elif not isinstance(domain_type, MetricDomainTypes):
        raise ValueError(
            f"""Cannot instantiate Domain (domain_type "{str(domain_type)}" of type "{str(type(domain_type))}" is \
not supported).
"""
        )

    if domain_kwargs is None:
        domain_kwargs = DomainKwargs({})
    elif isinstance(domain_kwargs, dict):
        domain_kwargs = DomainKwargs(domain_kwargs)

    domain_kwargs_dot_dict: DomainKwargs = (
        self._convert_dictionaries_to_domain_kwargs(source=domain_kwargs)
    )

    if details is None:
        details = {}

    # ``dict.get`` yields None when the key is absent, hence Optional.
    inferred_semantic_domain_type: Optional[
        Dict[str, Union[str, SemanticDomainTypes]]
    ] = details.get(INFERRED_SEMANTIC_TYPE_KEY)
    if inferred_semantic_domain_type:
        semantic_domain_key: str
        is_consistent: bool
        for semantic_domain_key in inferred_semantic_domain_type:
            # A key is consistent when some domain kwarg value either
            # equals it or is a collection that contains it.
            is_consistent = any(
                (
                    isinstance(metric_domain_value, (list, set, tuple))
                    and semantic_domain_key in metric_domain_value
                )
                or semantic_domain_key == metric_domain_value
                for _, metric_domain_value in domain_kwargs_dot_dict.items()
            )
            if not is_consistent:
                raise ValueError(
                    f"""Cannot instantiate Domain (domain_type "{str(domain_type)}" of type \
"{str(type(domain_type))}" -- key "{semantic_domain_key}", detected in "{INFERRED_SEMANTIC_TYPE_KEY}" dictionary, does \
not exist as value of appropriate key in "domain_kwargs" dictionary.
"""
                )

    super().__init__(
        domain_type=domain_type,
        domain_kwargs=domain_kwargs_dot_dict,
        details=details,
        rule_name=rule_name,
    )
def get_compute_domain(
    self,
    domain_kwargs: Dict,
    domain_type: Union[str, "MetricDomainTypes"],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple["sa.sql.Selectable", dict, dict]:
    """Resolve domain kwargs into a SqlAlchemy selectable and split kwargs.

    Args:
        domain_kwargs: Dictionary of domain kwargs specifying which data to
            obtain. Keys read here include "batch_id", "table", "query",
            "row_condition" and "condition_parser", plus the column keys
            that belong to the requested domain type.
        domain_type: A ``MetricDomainTypes`` member, or a string that is
            converted with ``MetricDomainTypes(domain_type)``.
        accessor_keys: Keys that are part of the compute domain but should
            simply be transferred, with their values, into the accessor
            domain kwargs. Only honoured for the table domain type; a
            warning is logged otherwise. Any iterable is accepted.

    Returns:
        A tuple ``(selectable, compute_domain_kwargs, accessor_domain_kwargs)``:
        the selectable to compute on, a deep copy of ``domain_kwargs`` minus
        the keys moved out of it, and the dictionary of keys moved out.

    Raises:
        GreatExpectationsError: If no batch can be identified, a column key
            required by the domain type is missing, or a row condition is
            given with an unsupported (or missing) condition parser.
        ValueError: If "table" does not match the batch's record set name,
            or "query" is supplied.
    """

    def _as_identifier(name):
        """Quote ``name`` when the active batch asks for quoted names."""
        if self.active_batch_data.use_quoted_name:
            # ``quote=True`` matches the newer implementation of this method;
            # SQLAlchemy's quoted_name requires the ``quote`` argument.
            return quoted_name(name, quote=True)
        return name

    # Normalize a string into the corresponding enum member.
    domain_type = MetricDomainTypes(domain_type)

    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data:
            data_object = self.active_batch_data
        else:
            raise GreatExpectationsError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data_object = self.loaded_batch_data_dict[batch_id]
        else:
            raise GreatExpectationsError(
                f"Unable to find batch with batch_id {batch_id}"
            )

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = dict()

    if domain_kwargs.get("table") is not None:
        if domain_kwargs["table"] != data_object.record_set_name:
            raise ValueError("Unrecognized table name.")
    elif "query" in domain_kwargs:
        raise ValueError(
            "query is not currently supported by SqlAlchemyExecutionEngine"
        )
    selectable = data_object.selectable

    if domain_kwargs.get("row_condition") is not None:
        # ``.get`` so that a missing parser yields the engine error below
        # instead of a bare KeyError.
        condition_parser = domain_kwargs.get("condition_parser")
        if condition_parser != "great_expectations__experimental__":
            raise GreatExpectationsError(
                "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser."
            )
        parsed_condition = parse_condition_to_sqlalchemy(
            domain_kwargs["row_condition"]
        )
        selectable = sa.select(
            "*", from_obj=selectable, whereclause=parsed_condition
        )

    # Materialize once: ``accessor_keys`` may be any iterable, including a
    # one-shot iterator that supports neither ``len()`` nor re-iteration.
    accessor_keys = list(accessor_keys) if accessor_keys is not None else []

    # Accessor keys are only meaningful for the table domain type.
    if domain_type != MetricDomainTypes.TABLE and accessor_keys:
        logger.warning(
            "Accessor keys ignored since Metric Domain Type is not 'table'"
        )

    if domain_type == MetricDomainTypes.TABLE:
        for key in accessor_keys:
            accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        # Warn about any remaining kwarg that is not a standard table key.
        for key in compute_domain_kwargs:
            if key not in [
                "batch_id",
                "table",
                "row_condition",
                "condition_parser",
            ]:
                logger.warning(
                    f"Unexpected key {key} found in domain_kwargs for domain type {domain_type.value}"
                )

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        accessor_domain_kwargs["column"] = _as_identifier(
            compute_domain_kwargs.pop("column")
        )

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        accessor_domain_kwargs["column_A"] = _as_identifier(
            compute_domain_kwargs.pop("column_A")
        )
        accessor_domain_kwargs["column_B"] = _as_identifier(
            compute_domain_kwargs.pop("column_B")
        )

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        # "columns" is optional here and is transferred without quoting.
        if "columns" in compute_domain_kwargs:
            accessor_domain_kwargs["columns"] = compute_domain_kwargs.pop("columns")

    elif domain_type == MetricDomainTypes.IDENTITY:
        # Narrow the selectable to the requested column(s); the kwargs stay
        # in ``compute_domain_kwargs`` because they describe the selectable.
        if "column" in compute_domain_kwargs:
            selected_columns = [compute_domain_kwargs["column"]]
        elif ("column_A" in compute_domain_kwargs) and (
            "column_B" in compute_domain_kwargs
        ):
            selected_columns = [
                compute_domain_kwargs["column_A"],
                compute_domain_kwargs["column_B"],
            ]
        elif "columns" in compute_domain_kwargs:
            selected_columns = compute_domain_kwargs["columns"]
        else:
            selected_columns = None

        if selected_columns is not None:
            to_select = [
                sa.column(_as_identifier(column_name))
                for column_name in selected_columns
            ]
            selectable = sa.select(to_select).select_from(selectable)

    # Any other domain type lets the selectable fall through unchanged.
    return selectable, compute_domain_kwargs, accessor_domain_kwargs
def get_compute_domain(
    self,
    domain_kwargs: dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple["pyspark.sql.DataFrame", dict, dict]:
    """Resolve domain kwargs into a Spark DataFrame and split kwargs.

    The batch is looked up (by "batch_id", or the active batch when none is
    given), optionally filtered by a row condition, and, for the identity
    domain type, narrowed to the requested columns.

    Args:
        domain_kwargs: Dictionary of domain kwargs specifying which data to
            obtain. Keys read here include "batch_id", "table",
            "row_condition" and "condition_parser", plus the column keys
            that belong to the requested domain type.
        domain_type: A ``MetricDomainTypes`` member, or a string that is
            converted with ``MetricDomainTypes(domain_type)``.
        accessor_keys: Keys that are part of the compute domain but should
            simply be transferred, with their values, into the accessor
            domain kwargs. Only honoured for the table domain type; a
            warning is logged otherwise. Any iterable is accepted.

    Returns:
        A tuple including:
        - a DataFrame (the data on which to compute)
        - a dictionary of compute_domain_kwargs, describing the DataFrame
        - a dictionary of accessor_domain_kwargs, describing any accessors
          needed to identify the domain within the compute domain

    Raises:
        ValidationError: If no batch can be identified.
        ValueError: If a truthy "table" is supplied.
        GreatExpectationsError: If the condition parser is not recognized,
            or a column key required by the domain type is missing.
    """
    # Normalize a string into the corresponding enum member.
    domain_type = MetricDomainTypes(domain_type)

    batch_id = domain_kwargs.get("batch_id")
    if batch_id is None:
        # We allow no batch id specified if there is only one batch
        if self.active_batch_data:
            data = self.active_batch_data.dataframe
        else:
            raise ValidationError(
                "No batch is specified, but could not identify a loaded batch."
            )
    else:
        if batch_id in self.loaded_batch_data_dict:
            data = self.loaded_batch_data_dict[batch_id].dataframe
        else:
            raise ValidationError(
                f"Unable to find batch with batch_id {batch_id}"
            )

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = dict()

    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "SparkDFExecutionEngine does not currently support multiple named tables."
        )

    row_condition = domain_kwargs.get("row_condition", None)
    if row_condition:
        condition_parser = domain_kwargs.get("condition_parser", None)
        if condition_parser == "spark":
            data = data.filter(row_condition)
        elif condition_parser == "great_expectations__experimental__":
            parsed_condition = parse_condition_to_spark(row_condition)
            data = data.filter(parsed_condition)
        else:
            # The original message ran the parser name into the word "for".
            raise GreatExpectationsError(
                f"unrecognized condition_parser {str(condition_parser)} for Spark execution engine"
            )

    # Materialize once: ``len(list(it))`` on a one-shot iterator would
    # exhaust it and leave nothing for the transfer loop below.
    accessor_keys = list(accessor_keys) if accessor_keys is not None else []

    # Accessor keys are only meaningful for the table domain type.
    if domain_type != MetricDomainTypes.TABLE and accessor_keys:
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    if domain_type == MetricDomainTypes.TABLE:
        for key in accessor_keys:
            accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        # Warn user if kwarg not "normal".
        unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
            {
                "batch_id",
                "table",
                "row_condition",
                "condition_parser",
            }
        )
        if unexpected_keys:
            unexpected_keys_str: str = ", ".join(
                f'"{element}"' for element in unexpected_keys
            )
            logger.warning(
                f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
            )

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
        accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        # "column_list" is optional here; it is transferred when present.
        if "column_list" in compute_domain_kwargs:
            accessor_domain_kwargs["column_list"] = compute_domain_kwargs.pop(
                "column_list"
            )

    elif domain_type == MetricDomainTypes.IDENTITY:
        # Narrow the DataFrame to the requested column(s); the kwargs stay
        # in ``compute_domain_kwargs`` because they describe the DataFrame.
        if "column" in compute_domain_kwargs:
            data = data.select(compute_domain_kwargs["column"])
        elif ("column_A" in compute_domain_kwargs) and (
            "column_B" in compute_domain_kwargs
        ):
            data = data.select(
                compute_domain_kwargs["column_A"],
                compute_domain_kwargs["column_B"],
            )
        elif "column_list" in compute_domain_kwargs:
            data = data.select(compute_domain_kwargs["column_list"])

    return data, compute_domain_kwargs, accessor_domain_kwargs
def get_compute_domain(
    self,
    domain_kwargs: Dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple[Selectable, dict, dict]:
    """Resolve domain kwargs into a SqlAlchemy selectable and split kwargs.

    The selectable comes from ``self.get_domain_records``; this method then
    moves the keys that identify the domain within it (column names, or
    ``accessor_keys`` for the table domain type) out of the compute kwargs.

    Args:
        domain_kwargs: Dictionary of domain kwargs specifying which data to
            obtain.
        domain_type: A ``MetricDomainTypes`` member, or a string that is
            converted with ``MetricDomainTypes(domain_type)``.
        accessor_keys: Keys that are part of the compute domain but should
            simply be transferred, with their values, into the accessor
            domain kwargs. Only honoured for the table domain type; a
            warning is logged otherwise. Any iterable is accepted.

    Returns:
        A tuple ``(selectable, compute_domain_kwargs, accessor_domain_kwargs)``:
        the selectable to compute on, a deep copy of ``domain_kwargs`` minus
        the keys moved out of it, and the dictionary of keys moved out.

    Raises:
        GreatExpectationsError: If a column key required by the domain type
            is missing, or "column_list" holds fewer than 2 columns.
    """

    def _as_identifier(name):
        """Quote ``name`` when the active batch asks for quoted names."""
        if self.active_batch_data.use_quoted_name:
            return quoted_name(name, quote=True)
        return name

    selectable = self.get_domain_records(
        domain_kwargs=domain_kwargs,
    )

    # Normalize a string into the corresponding enum member.
    domain_type = MetricDomainTypes(domain_type)

    # Materialize once: ``len(list(it))`` on a one-shot iterator would
    # exhaust it and leave nothing for the transfer loop below.
    accessor_keys = list(accessor_keys) if accessor_keys is not None else []

    # Accessor keys are only meaningful for the table domain type.
    if domain_type != MetricDomainTypes.TABLE and accessor_keys:
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = {}

    if domain_type == MetricDomainTypes.TABLE:
        for key in accessor_keys:
            accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        # Warn user if kwarg not "normal".
        unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
            {
                "batch_id",
                "table",
                "row_condition",
                "condition_parser",
            }
        )
        if unexpected_keys:
            unexpected_keys_str: str = ", ".join(
                f'"{element}"' for element in unexpected_keys
            )
            logger.warning(
                f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
            )

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        accessor_domain_kwargs["column"] = _as_identifier(
            compute_domain_kwargs.pop("column")
        )

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        accessor_domain_kwargs["column_A"] = _as_identifier(
            compute_domain_kwargs.pop("column_A")
        )
        accessor_domain_kwargs["column_B"] = _as_identifier(
            compute_domain_kwargs.pop("column_B")
        )

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "column_list" not in domain_kwargs:
            raise GreatExpectationsError(
                "column_list not found within domain_kwargs"
            )
        column_list = compute_domain_kwargs.pop("column_list")
        if len(column_list) < 2:
            raise GreatExpectationsError(
                "column_list must contain at least 2 columns"
            )
        if self.active_batch_data.use_quoted_name:
            accessor_domain_kwargs["column_list"] = [
                quoted_name(column_name, quote=True)
                for column_name in column_list
            ]
        else:
            # Keep the popped object itself rather than a copy.
            accessor_domain_kwargs["column_list"] = column_list

    # Any other domain type lets the selectable fall through unchanged.
    return selectable, compute_domain_kwargs, accessor_domain_kwargs
def get_compute_domain(
    self,
    domain_kwargs: dict,
    domain_type: Union[str, MetricDomainTypes],
    accessor_keys: Optional[Iterable[str]] = None,
) -> Tuple[DataFrame, dict, dict]:
    """Resolve domain kwargs into a Spark DataFrame and split kwargs.

    The DataFrame comes from ``self.get_domain_records``; this method then
    moves the keys that identify the domain within it (column names, or
    ``accessor_keys`` for the table domain type) out of the compute kwargs.

    Args:
        domain_kwargs: Dictionary of domain kwargs specifying which data to
            obtain.
        domain_type: A ``MetricDomainTypes`` member, or a string that is
            converted with ``MetricDomainTypes(domain_type)``.
        accessor_keys: Keys that are part of the compute domain but should
            simply be transferred, with their values, into the accessor
            domain kwargs. Only honoured for the table domain type; a
            warning is logged otherwise. Any iterable is accepted.

    Returns:
        A tuple including:
        - a DataFrame (the data on which to compute)
        - a dictionary of compute_domain_kwargs, describing the DataFrame
        - a dictionary of accessor_domain_kwargs, describing any accessors
          needed to identify the domain within the compute domain

    Raises:
        ValueError: If a truthy "table" is supplied.
        GreatExpectationsError: If a column key required by the domain type
            is missing, or "column_list" holds fewer than 2 columns.
    """
    data = self.get_domain_records(
        domain_kwargs=domain_kwargs,
    )

    # Normalize a string into the corresponding enum member.
    domain_type = MetricDomainTypes(domain_type)

    compute_domain_kwargs = copy.deepcopy(domain_kwargs)
    accessor_domain_kwargs = {}

    table = domain_kwargs.get("table", None)
    if table:
        raise ValueError(
            "SparkDFExecutionEngine does not currently support multiple named tables."
        )

    # Materialize once: ``len(list(it))`` on a one-shot iterator would
    # exhaust it and leave nothing for the transfer loop below.
    accessor_keys = list(accessor_keys) if accessor_keys is not None else []

    # Accessor keys are only meaningful for the table domain type.
    if domain_type != MetricDomainTypes.TABLE and accessor_keys:
        logger.warning(
            'Accessor keys ignored since Metric Domain Type is not "table"'
        )

    if domain_type == MetricDomainTypes.TABLE:
        for key in accessor_keys:
            accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
        # Warn user if kwarg not "normal".
        unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
            {
                "batch_id",
                "table",
                "row_condition",
                "condition_parser",
            }
        )
        if unexpected_keys:
            unexpected_keys_str: str = ", ".join(
                f'"{element}"' for element in unexpected_keys
            )
            logger.warning(
                f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
            )

    elif domain_type == MetricDomainTypes.COLUMN:
        if "column" not in compute_domain_kwargs:
            raise GreatExpectationsError(
                "Column not provided in compute_domain_kwargs"
            )
        accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")

    elif domain_type == MetricDomainTypes.COLUMN_PAIR:
        if not (
            "column_A" in compute_domain_kwargs
            and "column_B" in compute_domain_kwargs
        ):
            raise GreatExpectationsError(
                "column_A or column_B not found within compute_domain_kwargs"
            )
        accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
        accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

    elif domain_type == MetricDomainTypes.MULTICOLUMN:
        if "column_list" not in domain_kwargs:
            raise ge_exceptions.GreatExpectationsError(
                "column_list not found within domain_kwargs"
            )
        column_list = compute_domain_kwargs.pop("column_list")
        if len(column_list) < 2:
            raise ge_exceptions.GreatExpectationsError(
                "column_list must contain at least 2 columns"
            )
        accessor_domain_kwargs["column_list"] = column_list

    # Any other domain type returns the DataFrame and kwargs unchanged.
    return data, compute_domain_kwargs, accessor_domain_kwargs