Ejemplo n.º 1
0
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return one set of domains covering every effective column name.

        The effective column names are whatever "get_effective_column_names()" reports for the batch ids and
        validator resolved from "variables".

        Args:
            rule_name: forwarded unchanged to "build_domains_from_column_names()".
            variables: optional container forwarded to the batch id, validator, and column name lookups.

        Returns:
            The domains produced by "build_domains_from_column_names()" for the effective column names.
        """
        ids_of_batches: List[str] = self.get_batch_ids(variables=variables)
        validator_for_batches: "Validator" = self.get_validator(variables=variables)  # noqa: F821

        column_names_in_scope: List[str] = self.get_effective_column_names(
            batch_ids=ids_of_batches,
            validator=validator_for_batches,
            variables=variables,
        )

        # The inferred semantic type of each column is handed to the helper together with the column names.
        return build_domains_from_column_names(
            rule_name=rule_name,
            column_names=column_names_in_scope,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
        )
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return domains for the table columns whose names end with a configured suffix.

        Column names are read from the "table.columns" metric of the last batch id returned by "get_batch_ids()".
        A column is kept when its name ends with any entry of "self.column_name_suffixes".

        Args:
            rule_name: forwarded unchanged to "build_domains_from_column_names()".
            variables: optional container forwarded to the batch id and validator lookups.

        Returns:
            The domains produced by "build_domains_from_column_names()" for the matching column names; no inferred
            semantic type map is supplied to the helper.
        """
        batch_ids: List[str] = self.get_batch_ids(variables=variables)
        table_column_names: List[str] = self.get_validator(
            variables=variables
        ).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_ids[-1],  # active_batch_id
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        # "str.endswith()" accepts a tuple of alternatives; build that tuple once instead of once per column.
        accepted_suffixes = tuple(self.column_name_suffixes)

        # Keep only the columns whose name ends with one of the configured suffixes.
        candidate_column_names: List[str] = [
            table_column_name
            for table_column_name in table_column_names
            if table_column_name.endswith(accepted_suffixes)
        ]

        return build_domains_from_column_names(
            rule_name=rule_name,
            column_names=candidate_column_names,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=None,
        )
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return domains matching the selected cardinality_limit_mode.

        Columns whose inferred semantic type is among "allowed_semantic_types_passthrough" bypass the cardinality
        check; they are appended after the columns that passed the check.

        Side effect: a new "CardinalityChecker" built from the resolved settings is stored on
        "self._cardinality_checker".

        Args:
            rule_name: name of Rule object, for which "Domain" objects are obtained.
            variables: Optional variables to substitute when evaluating.

        Returns:
            List of domains that match the desired cardinality.
        """
        batch_ids: List[str] = self.get_batch_ids(variables=variables)

        validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

        effective_column_names: List[str] = self.get_effective_column_names(
            batch_ids=batch_ids,
            validator=validator,
            variables=variables,
        )

        # Obtain cardinality_limit_mode from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        cardinality_limit_mode: Optional[
            Union[str, CardinalityLimitMode, dict]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.cardinality_limit_mode,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        # Obtain max_unique_values from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        max_unique_values: Optional[int] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.max_unique_values,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        # Obtain max_proportion_unique from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        max_proportion_unique: Optional[
            float
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.max_proportion_unique,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        validate_input_parameters(
            cardinality_limit_mode=cardinality_limit_mode,
            max_unique_values=max_unique_values,
            max_proportion_unique=max_proportion_unique,
        )

        # Stored on the instance rather than kept local (presumably consulted by the cardinality helpers below).
        self._cardinality_checker = CardinalityChecker(
            cardinality_limit_mode=cardinality_limit_mode,
            max_unique_values=max_unique_values,
            max_proportion_unique=max_proportion_unique,
        )

        # Obtain allowed_semantic_types_passthrough from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        allowed_semantic_types_passthrough: Union[
            str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.allowed_semantic_types_passthrough,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )
        allowed_semantic_types_passthrough = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=allowed_semantic_types_passthrough
            )
        )

        # Partition the columns in one pass: a column either bypasses the cardinality check or is subject to it.
        # This replaces a second comprehension that tested membership in the passthrough list for every column.
        inferred_semantic_type_by_column_name = (
            self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map
        )
        allowed_column_names_passthrough: List[str] = []
        column_names_for_cardinality_check: List[str] = []
        for column_name in effective_column_names:
            if (
                inferred_semantic_type_by_column_name[column_name]
                in allowed_semantic_types_passthrough
            ):
                allowed_column_names_passthrough.append(column_name)
            else:
                column_names_for_cardinality_check.append(column_name)

        metrics_for_cardinality_check: Dict[
            str, List[MetricConfiguration]
        ] = self._generate_metric_configurations_to_check_cardinality(
            batch_ids=batch_ids, column_names=column_names_for_cardinality_check
        )

        candidate_column_names: List[
            str
        ] = self._column_names_meeting_cardinality_limit(
            validator=validator,
            metrics_for_cardinality_check=metrics_for_cardinality_check,
        )
        # Passthrough columns come after the columns that satisfied the cardinality limit.
        candidate_column_names.extend(allowed_column_names_passthrough)

        domains: List[Domain] = build_domains_from_column_names(
            rule_name=rule_name,
            column_names=candidate_column_names,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
        )

        return domains
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Build domains for the columns that satisfy the configured tolerance limits.

        Args:
            rule_name: name of the Rule object for which "Domain" objects are being built.
            variables: optional variables used when resolving the parameter references below.

        Returns:
            Domains for the column names reported by "_get_column_names_satisfying_tolerance_limits()".
        """

        def resolve_from_rule_state(parameter_reference, expected_return_type):
            # Every setting is obtained from "rule state" (i.e., variables and parameters); from the instance
            # variable otherwise.
            return get_parameter_value_and_validate_return_type(
                domain=None,
                parameter_reference=parameter_reference,
                expected_return_type=expected_return_type,
                variables=variables,
                parameters=None,
            )

        metric_name: str = resolve_from_rule_state(self.map_metric_name, str)
        unexpected_values_limit: int = resolve_from_rule_state(
            self.max_unexpected_values, int
        )
        # No type is enforced here because the ratio is allowed to be absent (checked for None below).
        unexpected_ratio_limit: Optional[float] = resolve_from_rule_state(
            self.max_unexpected_ratio, None
        )
        proportion_threshold: float = resolve_from_rule_state(
            self.min_max_unexpected_values_proportion, float
        )

        ids_of_batches: List[str] = self.get_batch_ids(variables=variables)
        batch_count: int = len(ids_of_batches)

        validator_for_batches: "Validator" = self.get_validator(variables=variables)  # noqa: F821

        column_names_in_scope: List[str] = self.get_effective_column_names(
            batch_ids=ids_of_batches,
            validator=validator_for_batches,
            variables=variables,
        )

        row_counts_by_batch: Dict[str, int] = self.get_table_row_counts(
            validator=validator_for_batches,
            batch_ids=ids_of_batches,
            variables=variables,
        )
        total_row_count = sum(row_counts_by_batch.values())
        # NOTE(review): divides by the number of batch ids; an empty batch list raises ZeroDivisionError here.
        average_row_count: float = 1.0 * total_row_count / batch_count

        # Without an explicit ratio, derive one from the allowed count and the average number of records per Batch.
        if unexpected_ratio_limit is None:
            unexpected_ratio_limit = unexpected_values_limit / average_row_count

        metric_configurations: Dict[
            str, List[MetricConfiguration]
        ] = self._generate_metric_configurations(
            map_metric_name=metric_name,
            batch_ids=ids_of_batches,
            column_names=column_names_in_scope,
        )

        qualifying_column_names: List[str] = self._get_column_names_satisfying_tolerance_limits(
            validator=validator_for_batches,
            num_batch_ids=batch_count,
            metric_configurations_by_column_name=metric_configurations,
            mean_table_row_count_as_float=average_row_count,
            max_unexpected_ratio=unexpected_ratio_limit,
            min_max_unexpected_values_proportion=proportion_threshold,
        )

        return build_domains_from_column_names(
            rule_name=rule_name,
            column_names=qualifying_column_names,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
        )