def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Obtain domains for all columns of a table (or for configured columns, if they exist in the table).

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        List of domains built from the column names returned by "get_effective_column_names()".
    """
    resolved_batch_ids: List[str] = self.get_batch_ids(variables=variables)
    data_validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

    # Column selection is delegated entirely to "get_effective_column_names()".
    selected_column_names: List[str] = self.get_effective_column_names(
        batch_ids=resolved_batch_ids,
        validator=data_validator,
        variables=variables,
    )

    return build_domains_from_column_names(
        rule_name=rule_name,
        column_names=selected_column_names,
        domain_type=self.domain_type,
        table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
    )
def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Return domains for the table columns whose names end with one of the configured suffixes.

    The table column names are read through the "table.columns" metric, evaluated for the last batch id
    returned by "get_batch_ids()"; a column is kept when its name ends with any entry of
    "self.column_name_suffixes".

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        List of domains built from the matching column names (no semantic type map is supplied).
    """
    batch_ids: List[str] = self.get_batch_ids(variables=variables)
    validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821
    table_column_names: List[str] = validator.get_metric(
        metric=MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_ids[-1],  # active_batch_id
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
    )

    # "str.endswith()" accepts a tuple of alternatives; build it once instead of once per column.
    column_name_suffixes: tuple = tuple(self.column_name_suffixes)

    # Keep only the columns whose names end with one of the configured suffixes (table column order is preserved).
    candidate_column_names: List[str] = [
        column_name
        for column_name in table_column_names
        if column_name.endswith(column_name_suffixes)
    ]

    domains: List[Domain] = build_domains_from_column_names(
        rule_name=rule_name,
        column_names=candidate_column_names,
        domain_type=self.domain_type,
        table_column_name_to_inferred_semantic_domain_type_map=None,
    )

    return domains
def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Return domains matching the selected cardinality_limit_mode.

    Columns whose inferred semantic type is listed in "allowed_semantic_types_passthrough" bypass the
    cardinality check and are appended after the columns that satisfy the cardinality limit.

    Side effect: stores the "CardinalityChecker" built from the resolved parameters in
    "self._cardinality_checker".

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        List of domains that match the desired cardinality.
    """
    batch_ids: List[str] = self.get_batch_ids(variables=variables)

    validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

    effective_column_names: List[str] = self.get_effective_column_names(
        batch_ids=batch_ids,
        validator=validator,
        variables=variables,
    )

    # Obtain cardinality_limit_mode from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    cardinality_limit_mode: Optional[
        Union[str, CardinalityLimitMode, dict]
    ] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.cardinality_limit_mode,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )

    # Obtain max_unique_values from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    max_unique_values: Optional[int] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.max_unique_values,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )

    # Obtain max_proportion_unique from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    max_proportion_unique: Optional[
        float
    ] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.max_proportion_unique,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )

    # Validate the resolved combination before it is handed to the checker.
    validate_input_parameters(
        cardinality_limit_mode=cardinality_limit_mode,
        max_unique_values=max_unique_values,
        max_proportion_unique=max_proportion_unique,
    )

    self._cardinality_checker = CardinalityChecker(
        cardinality_limit_mode=cardinality_limit_mode,
        max_unique_values=max_unique_values,
        max_proportion_unique=max_proportion_unique,
    )

    # Obtain allowed_semantic_types_passthrough from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    allowed_semantic_types_passthrough: Union[
        str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]
    ] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.allowed_semantic_types_passthrough,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )
    allowed_semantic_types_passthrough = (
        self.semantic_type_filter.parse_semantic_domain_type_argument(
            semantic_types=allowed_semantic_types_passthrough
        )
    )

    # Split the columns in a single pass: passthrough columns skip the cardinality check, all others are checked.
    # (This avoids testing every column name for membership in a list of passthrough names.)
    inferred_semantic_type_by_column_name = (
        self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map
    )
    allowed_column_names_passthrough: List[str] = []
    column_names_for_cardinality_check: List[str] = []
    column_name: str
    for column_name in effective_column_names:
        if (
            inferred_semantic_type_by_column_name[column_name]
            in allowed_semantic_types_passthrough
        ):
            allowed_column_names_passthrough.append(column_name)
        else:
            column_names_for_cardinality_check.append(column_name)

    metrics_for_cardinality_check: Dict[
        str, List[MetricConfiguration]
    ] = self._generate_metric_configurations_to_check_cardinality(
        batch_ids=batch_ids, column_names=column_names_for_cardinality_check
    )

    candidate_column_names: List[
        str
    ] = self._column_names_meeting_cardinality_limit(
        validator=validator,
        metrics_for_cardinality_check=metrics_for_cardinality_check,
    )
    # Passthrough columns are appended after the columns that met the cardinality limit.
    candidate_column_names.extend(allowed_column_names_passthrough)

    domains: List[Domain] = build_domains_from_column_names(
        rule_name=rule_name,
        column_names=candidate_column_names,
        domain_type=self.domain_type,
        table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
    )

    return domains
def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Return domains matching the specified tolerance limits.

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        List of domains that match the desired tolerance limits.
    """

    def resolve_from_rule_state(parameter_reference, expected_return_type=None):
        # Obtain value from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        return get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=parameter_reference,
            expected_return_type=expected_return_type,
            variables=variables,
            parameters=None,
        )

    map_metric_name: str = resolve_from_rule_state(self.map_metric_name, str)
    max_unexpected_values: int = resolve_from_rule_state(
        self.max_unexpected_values, int
    )
    # No return type is enforced here, because "max_unexpected_ratio" is allowed to be None.
    max_unexpected_ratio: Optional[float] = resolve_from_rule_state(
        self.max_unexpected_ratio
    )
    min_max_unexpected_values_proportion: float = resolve_from_rule_state(
        self.min_max_unexpected_values_proportion, float
    )

    batch_ids: List[str] = self.get_batch_ids(variables=variables)
    batch_count: int = len(batch_ids)

    validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

    column_names: List[str] = self.get_effective_column_names(
        batch_ids=batch_ids,
        validator=validator,
        variables=variables,
    )

    row_counts: Dict[str, int] = self.get_table_row_counts(
        validator=validator,
        batch_ids=batch_ids,
        variables=variables,
    )
    # NOTE(review): raises ZeroDivisionError when "batch_ids" is empty -- confirm callers guarantee at least one batch.
    mean_row_count: float = 1.0 * sum(row_counts.values()) / batch_count

    # If no "max_unexpected_ratio" is given, compute it based on average number of records across all Batch objects.
    # NOTE(review): raises ZeroDivisionError when the mean row count is 0 -- confirm empty tables cannot reach here.
    if max_unexpected_ratio is None:
        max_unexpected_ratio = max_unexpected_values / mean_row_count

    metric_configurations: Dict[
        str, List[MetricConfiguration]
    ] = self._generate_metric_configurations(
        map_metric_name=map_metric_name,
        batch_ids=batch_ids,
        column_names=column_names,
    )

    accepted_column_names: List[str] = self._get_column_names_satisfying_tolerance_limits(
        validator=validator,
        num_batch_ids=batch_count,
        metric_configurations_by_column_name=metric_configurations,
        mean_table_row_count_as_float=mean_row_count,
        max_unexpected_ratio=max_unexpected_ratio,
        min_max_unexpected_values_proportion=min_max_unexpected_values_proportion,
    )

    return build_domains_from_column_names(
        rule_name=rule_name,
        column_names=accepted_column_names,
        domain_type=self.domain_type,
        table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
    )