Esempio n. 1
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Computes the union of column names over all Batch objects and a measure of how consistent the batches are.

        The per-Batch values are first computed by the parent class (which stores them as part of "rule state"); they
        are then read back under the raw fully-qualified parameter name and combined into a single set.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Forwarded unchanged to the parent "build_parameters()" call.

        Returns:
            Attributes object, whose value is the set union of column names over all batches, and whose metadata
            contains "success_ratio" -- the fraction of batches whose set of column names equals that union.
        """
        parent = super()

        # Let the parent class compute the "table.columns" metric value for every Batch object.
        parent.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=parent._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Read the per-Batch "table.columns" metric values back from the "rule state".
        raw_parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.raw_fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        per_batch_column_names: MetricValues = raw_parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY
        ]

        # One set of column names per Batch; order of batches is preserved.
        per_batch_column_sets: List[Set[str]] = [
            set(column_names) for column_names in per_batch_column_names
        ]

        # Every column name that occurs in at least one Batch.
        all_column_names: Set[str] = set().union(*per_batch_column_sets)

        # Flag (1/0) per Batch: does this Batch expose exactly the full set of column names?
        match_flags: List[int] = [
            int(column_set == all_column_names) for column_set in per_batch_column_sets
        ]
        match_ratio: np.float64 = np.mean(np.asarray(match_flags))

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: all_column_names,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: {
                    "success_ratio": match_ratio,
                },
            }
        )
Esempio n. 2
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """
        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        num_batch_ids: int = len(batch_ids)
        if num_batch_ids != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Utilizing a {self.__class__.__name__} requires exactly one Batch of data to be available
({num_batch_ids} Batch identifiers found).
"""
            )

        # Compute metric value for one Batch object (expressed as list of Batch objects).
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            json_serialize=False,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve metric values for one Batch object (expressed as list of Batch objects).
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: None
                if parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] is None
                else parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY][0],
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: parameter_node[
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                ],
            }
        )
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Collapses the per-Batch collections of unique values into one collection of unique values for the domain.

        The parent class computes the per-Batch values and stores them in "rule state"; this method reads the
        attributed values back, extracts the plain metric values, and reduces them to the unique values.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Forwarded unchanged to the parent "build_parameters()" call.

        Returns:
            Attributes object, containing the unique values over all batches and the metadata of the stored parameter.
        """
        parent = super()

        # Step 1: have the parent class build the list of unique values for each Batch object.
        parent.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=parent._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Step 2: fetch what the parent stored under this ParameterBuilder's fully-qualified parameter name.
        stored_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # Step 3: strip the attribution from the stored values, leaving only the metric values themselves.
        attributed_values = stored_node[
            FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY
        ]
        per_batch_values: MetricValues = (
            AttributedResolvedMetrics.get_metric_values_from_attributed_metric_values(
                attributed_metric_values=attributed_values
            )
        )

        # Step 4: replace per-Batch values with the unique values for all batches in the domain.
        unique_values = _get_unique_values_from_nested_collection_of_sets(
            collection=per_batch_values
        )
        metadata = stored_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY]

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: unique_values,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: metadata,
            }
        )
Esempio n. 4
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds the parameter value for a metric computed on one Batch, unwrapping it from its list representation.

        The parent class computes the metric (expressed as a list of per-Batch values) and stores it in "rule state";
        this method reads it back and returns its first element.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Forwarded unchanged to the parent "build_parameters()" call.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.  The
            value is None if the stored value is None; otherwise, it is the first element of the stored value.
        """
        parent = super()

        # Delegate the actual metric computation (without JSON serialization) to the parent class.
        parent.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=parent._build_parameters,
            json_serialize=False,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Read back what the parent stored under this ParameterBuilder's fully-qualified parameter name.
        stored_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # The stored value is a list with one entry per Batch; keep only the first one (None stays None).
        single_batch_value = stored_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
        if single_batch_value is not None:
            single_batch_value = single_batch_value[0]

        metadata = stored_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY]

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: single_batch_value,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: metadata,
            }
        )
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Computes the configured metric and packages the result (values, attributed values, details) as Attributes.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Not used by this implementation.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata:
            - if exactly one resolved metric is returned, its conditioned values (reduced to the first column when
              "reduce_scalar_metric" is set and the values are a numpy array of more than one dimension, having
              exactly one column) and its conditioned attributed values;
            - otherwise, the list of resolved metrics itself (under both the value key and the attributed value key).
        """
        metric_computation_result: MetricComputationResult = self.get_metrics(
            metric_name=self.metric_name,
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            enforce_numeric_metric=self.enforce_numeric_metric,
            replace_nan_with_zero=self.replace_nan_with_zero,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        details: MetricComputationDetails = metric_computation_result.details

        # Obtain reduce_scalar_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        reduce_scalar_metric: bool = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.reduce_scalar_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        resolved_metrics = metric_computation_result.attributed_resolved_metrics

        # Anything other than exactly one resolved metric is handed back as is.
        if len(resolved_metrics) != 1:
            return Attributes(
                {
                    FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: resolved_metrics,
                    FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY: resolved_metrics,
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
                }
            )

        sole_metric = resolved_metrics[0]
        values = sole_metric.conditioned_metric_values

        # As a convenience, a single-column array is flattened to its only column when the reduction is requested.
        is_single_column_array: bool = (
            isinstance(values, np.ndarray) and values.ndim > 1 and values.shape[1] == 1
        )
        if reduce_scalar_metric and is_single_column_array:
            values = values[:, 0]

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: values,
                FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY: sole_metric.conditioned_attributed_metric_values,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
            }
        )
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Computes the mean (over batches) of the ratio of "unexpected_count" to the count of non-null values.

        Total counts (and, optionally, null counts) are read from parameters previously stored in "rule state" by other
        ParameterBuilder objects, whose names are configured on this instance.  If no null count ParameterBuilder name
        is configured, null counts are taken to be zero for every Batch.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Forwarded unchanged to the parent "build_parameters()" call.

        Returns:
            Attributes object, containing the mean unexpected count ratio and the metadata of the stored parameter.
        """
        # Obtain total_count_parameter_builder_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        total_count_parameter_builder_name: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.total_count_parameter_builder_name,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )

        # Look up the total counts that the named ParameterBuilder has stored in the "rule state".
        total_count_parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=f"{PARAMETER_KEY}{total_count_parameter_builder_name}",
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        total_count_values: MetricValues = total_count_parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY
        ]

        # Obtain null_count_parameter_builder_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        null_count_parameter_builder_name: Optional[str] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.null_count_parameter_builder_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        # NOTE(review): "batch_ids" is annotated as Optional, yet it is passed to "len()" unguarded -- confirm that
        # "get_batch_ids()" cannot return None on this code path.
        num_batch_ids: int = len(batch_ids)

        null_count_values: MetricValues
        if null_count_parameter_builder_name is not None:
            # Look up the null counts that the named ParameterBuilder has stored in the "rule state".
            null_count_parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=f"{PARAMETER_KEY}{null_count_parameter_builder_name}",
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )
            null_count_values = null_count_parameter_node[
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY
            ]
        else:
            # No null count source configured: use one zero per Batch.
            null_count_values = np.zeros(shape=(num_batch_ids,))

        nonnull_count_values: np.ndarray = total_count_values - null_count_values

        # Compute "unexpected_count" corresponding to "map_metric_name" (given as argument to this "ParameterBuilder").
        parent = super()
        parent.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=parent._build_parameters,
            json_serialize=None,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve "unexpected_count" corresponding to "map_metric_name" (given as argument to this "ParameterBuilder").
        unexpected_count_parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        unexpected_count_values: MetricValues = unexpected_count_parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY
        ]

        # Per-Batch ratio first, then its mean over all batches.
        per_batch_ratios: np.ndarray = unexpected_count_values / nonnull_count_values
        mean_ratio: np.float64 = np.mean(per_batch_ratios)

        metadata = unexpected_count_parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
        ]

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: mean_ratio,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: metadata,
            }
        )
Esempio n. 7
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Selects, among the candidate "strftime_format" strings, the one that fits the column values best.

        For every candidate, the fraction of non-null values (summed over all batches) that match the format is
        computed; the best candidate above the configured threshold is selected by
        "ParameterBuilder._get_best_candidate_above_threshold()".

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Not used by this implementation.

        Returns:
            Attributes object, containing the best format string as the value, and metadata with "success_ratio" of
            that format string as well as "candidate_strings" (sorted ratios for all evaluated candidates).

        Raises:
            ge_exceptions.ProfilerExecutionError: if the non-null count computation does not yield exactly one
            resolved metric, or if that metric has no values.
        """
        # Step 1: total number of non-null values, needed as the denominator of every success ratio.
        metric_computation_result: MetricComputationResult = self.get_metrics(
            metric_name="column_values.nonnull.count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        # This should never happen.
        if len(metric_computation_result.attributed_resolved_metrics) != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Result of metric computations for {self.__class__.__name__} must be a list with exactly 1 element of type "AttributedResolvedMetrics" ({metric_computation_result.attributed_resolved_metrics} found).'
            )

        nonnull_resolved_metrics: AttributedResolvedMetrics = (
            metric_computation_result.attributed_resolved_metrics[0]
        )
        nonnull_metric_values: MetricValues = nonnull_resolved_metrics.metric_values

        if nonnull_metric_values is None:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"Result of metric computations for {self.__class__.__name__} is empty."
            )

        # First column holds the 1-dimensional vector of metric values (each element corresponds to a Batch ID).
        nonnull_count: int = sum(nonnull_metric_values[:, 0])

        # Step 2: candidate format strings come from "rule state" (i.e., variables and parameters); from instance
        # variable otherwise.
        candidate_strings: Union[
            List[str],
            Set[str],
        ] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.candidate_strings,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # Step 3: one "metric_value_kwargs" dictionary per candidate "strftime_format" string (configured kwargs, if
        # any, are kept; "strftime_format" takes precedence over a configured key of the same name).
        kwargs_per_candidate: List[dict] = []
        for candidate in candidate_strings:
            candidate_kwargs: dict = {"strftime_format": candidate}
            if self.metric_value_kwargs:
                candidate_kwargs = {**self.metric_value_kwargs, **candidate_kwargs}

            kwargs_per_candidate.append(candidate_kwargs)

        # Step 4: resolve metrics for all candidate configurations and available Batch objects simultaneously.
        metric_computation_result = self.get_metrics(
            metric_name="column_values.match_strftime_format.unexpected_count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=kwargs_per_candidate,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        # Step 5: success ratio per candidate = share of non-null values that were not counted as unexpected.
        # NOTE(review): "nonnull_count" is used as denominator without a zero check -- confirm intended behavior for
        # columns consisting entirely of null values.
        ratio_by_format_string: dict = {}
        for candidate_metrics in metric_computation_result.attributed_resolved_metrics:
            unexpected_count: int = sum(candidate_metrics.metric_values[:, 0])
            candidate_ratio: float = (nonnull_count - unexpected_count) / nonnull_count
            ratio_by_format_string[
                candidate_metrics.metric_attributes["strftime_format"]
            ] = candidate_ratio

        # Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        threshold: float = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.threshold,
            expected_return_type=float,
            variables=variables,
            parameters=parameters,
        )

        # Step 6: pick the winner and prepare the sorted overview of all evaluated candidates.
        best_candidate = ParameterBuilder._get_best_candidate_above_threshold(
            ratio_by_format_string, threshold
        )
        best_format_string, best_ratio = best_candidate
        sorted_candidates: dict = ParameterBuilder._get_sorted_candidates_and_ratios(
            ratio_by_format_string
        )

        metadata: dict = {
            "success_ratio": best_ratio,
            "candidate_strings": sorted_candidates,
        }
        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: best_format_string,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: metadata,
            }
        )
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds a partition object (categorical "values"/"weights" or bucketized "bins"/"weights"/"tail_weights").

        The data is treated as categorical if bucketizing was not requested, if no bins are available, or if the bins
        are not strictly increasing.  Otherwise, the "column.histogram" metric is computed using the bins.  In both
        cases, weights are the counts divided by the stored non-null count.

        Side effect: on the bucketized path, "self.metric_name" and "self.metric_value_kwargs" are overwritten.

        Args:
            domain: Domain object for which the parameter is being built.
            variables: Optional ParameterContainer with variables of the "rule state".
            parameters: Optional dictionary of ParameterContainer objects with parameters of the "rule state".
            recompute_existing_parameter_values: Forwarded unchanged to the parent "build_parameters()" call.

        Returns:
            Attributes object, containing the partition object and the metadata of the parameter it was derived from.
        """
        # Obtain bucketize_data directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        bucketize_data = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.bucketize_data,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain "column.partition" from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        partition_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=f"{RAW_PARAMETER_KEY}{self._column_partition_metric_single_batch_parameter_builder_config.name}",
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        bins: MetricValue = partition_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Categorical unless bucketizing was requested AND usable (strictly increasing) bins are available.
        is_categorical: bool = (
            bins is None or not bucketize_data or not np.all(np.diff(bins) > 0.0)
        )

        # Obtain "column_values.nonnull.count" from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        nonnull_count_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=f"{RAW_PARAMETER_KEY}{self._column_values_nonnull_count_metric_single_batch_parameter_builder_config.name}",
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        weights: list

        if is_categorical:
            # Obtain "column.value_counts" from "rule state" (i.e., variables and parameters); from instance variable otherwise.
            value_counts_node: ParameterNode = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=f"{RAW_PARAMETER_KEY}{self._column_value_counts_metric_single_batch_parameter_builder_config.name}",
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )
            value_counts = value_counts_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

            # Distinct values are taken from the index of the value counts; weights are the normalized counts.
            values: list = list(value_counts.index)
            weights = list(
                np.asarray(value_counts)
                / nonnull_count_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
            )

            return Attributes(
                {
                    FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: {
                        "values": values,
                        "weights": weights,
                    },
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: value_counts_node[
                        FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                    ],
                }
            )

        # Bucketized path: reconfigure this ParameterBuilder, so that the parent class computes the histogram.
        self.metric_name = "column.histogram"
        self.metric_value_kwargs = {
            "bins": tuple(bins),
        }

        # Compute metric value for one Batch object.
        parent = super()
        parent.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=parent._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve metric values for one Batch object.
        histogram_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.raw_fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # in this case, we have requested a partition, histogram using said partition, and nonnull count
        bins = list(bins)
        weights = list(
            np.asarray(histogram_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY])
            / nonnull_count_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
        )
        # Weight not covered by the bins is split evenly between the lower and the upper tail.
        tail_weight: float = (1.0 - sum(weights)) / 2.0

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: {
                    "bins": bins,
                    "weights": weights,
                    "tail_weights": [tail_weight, tail_weight],
                },
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: histogram_node[
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                ],
            }
        )
Esempio n. 9
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.

         The algorithm operates according to the following steps:
         1. Obtain batch IDs of interest using BaseDataContext and BatchRequest (unless passed explicitly as argument).
         2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
         3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
            order to have access to all Batch objects, on each of which the specified metric_name will be computed.
         4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
         5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
         6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
         7. Convert the multi-dimensional metric computation results to a numpy array (for further computations).
         8. Compute [low, high] for the desired metric using the chosen estimator method.
         9. Return [low, high] for the desired metric as estimated by the specified sampling method.
        10. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".
        """
        # Obtain false_positive_rate from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.false_positive_rate,
            expected_return_type=(float, np.float64),
            variables=variables,
            parameters=parameters,
        )

        if not (0.0 <= false_positive_rate <= 1.0):
            raise ge_exceptions.ProfilerExecutionError(
                f"""false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1],
but {false_positive_rate} was provided.""")
        elif false_positive_rate <= NP_EPSILON:
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead.""")
            false_positive_rate = NP_EPSILON
        elif false_positive_rate >= (1.0 - NP_EPSILON):
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 1.
A false_positive_rate of {1.0-NP_EPSILON} has been selected instead.""")
            false_positive_rate = np.float64(1.0 - NP_EPSILON)

        parameter_reference: str
        if self.metric_multi_batch_parameter_builder_name:
            # Obtain metric_multi_batch_parameter_builder_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
            metric_multi_batch_parameter_builder_name: str = (
                get_parameter_value_and_validate_return_type(
                    domain=domain,
                    parameter_reference=self.
                    metric_multi_batch_parameter_builder_name,
                    expected_return_type=str,
                    variables=variables,
                    parameters=parameters,
                ))
            parameter_reference = (
                f"{RAW_PARAMETER_KEY}{metric_multi_batch_parameter_builder_name}"
            )
        else:
            # Compute metric value for each Batch object.
            super().build_parameters(
                domain=domain,
                variables=variables,
                parameters=parameters,
                parameter_computation_impl=super()._build_parameters,
                recompute_existing_parameter_values=
                recompute_existing_parameter_values,
            )
            parameter_reference = self.raw_fully_qualified_parameter_name

        # Retrieve metric values for all Batch objects.
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=parameter_reference,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        metric_values: MetricValues = parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Obtain estimator directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        estimator: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.estimator,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        if (estimator not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_SAMPLING_METHOD_NAMES):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "estimator" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{estimator}" was detected).
""")

        round_decimals: int

        # Obtain quantile_statistic_interpolation_method directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        quantile_statistic_interpolation_method: str = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                quantile_statistic_interpolation_method,
                expected_return_type=str,
                variables=variables,
                parameters=parameters,
            ))
        if (quantile_statistic_interpolation_method
                not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "quantile_statistic_interpolation_method" for {self.__class__.__name__} can \
be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS} \
("{quantile_statistic_interpolation_method}" was detected).
""")

        if integer_semantic_domain_type(domain=domain):
            round_decimals = 0
        else:
            round_decimals = self._get_round_decimals_using_heuristics(
                metric_values=metric_values,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )

        if quantile_statistic_interpolation_method == "auto":
            if round_decimals == 0:
                quantile_statistic_interpolation_method = "nearest"
            else:
                quantile_statistic_interpolation_method = "linear"

        estimator_func: Callable
        estimator_kwargs: dict
        if estimator == "bootstrap":
            estimator_func = self._get_bootstrap_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "random_seed": self.random_seed,
            }
        elif estimator == "kde":
            estimator_func = self._get_kde_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "bw_method": self.bw_method,
                "random_seed": self.random_seed,
            }
        else:
            estimator_func = self._get_deterministic_estimate
            estimator_kwargs = {
                "false_positive_rate":
                false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
            }

        numeric_range_estimation_result: NumericRangeEstimationResult = (
            self._estimate_metric_value_range(
                metric_values=metric_values,
                estimator_func=estimator_func,
                round_decimals=round_decimals,
                domain=domain,
                variables=variables,
                parameters=parameters,
                **estimator_kwargs,
            ))

        value_range: np.ndarray = numeric_range_estimation_result.value_range
        details: Dict[str, Any] = copy.deepcopy(
            parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY])

        # Obtain include_estimator_samples_histogram_in_details from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_estimator_samples_histogram_in_details: bool = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                include_estimator_samples_histogram_in_details,
                expected_return_type=bool,
                variables=variables,
                parameters=parameters,
            ))

        if include_estimator_samples_histogram_in_details:
            details[
                "estimation_histogram"] = numeric_range_estimation_result.estimation_histogram

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: value_range,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        })