Ejemplo n.º 1
0
    def __init__(
        self,
        expectation_type: str,
        meta: Optional[Dict[str, Any]] = None,
        condition: Optional[str] = None,
        validation_parameter_builder_configs: Optional[
            List[ParameterBuilderConfig]] = None,
        data_context: Optional["BaseDataContext"] = None,  # noqa: F821
        **kwargs,
    ) -> None:
        """
        Args:
            expectation_type: the "expectation_type" argument of "ExpectationConfiguration" object to be emitted.
            meta: the "meta" argument of "ExpectationConfiguration" object to be emitted
            condition: Boolean statement (expressed as string and following specified grammar), which controls whether
            or not underlying logic should be executed and thus resulting "ExpectationConfiguration" emitted
            validation_parameter_builder_configs: ParameterBuilder configurations, having whose outputs available (as
            fully-qualified parameter names) is pre-requisite for present ExpectationConfigurationBuilder instance
            These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder"
            data_context: BaseDataContext associated with this ExpectationConfigurationBuilder
            kwargs: additional arguments
        """

        super().__init__(
            expectation_type=expectation_type,
            validation_parameter_builder_configs=
            validation_parameter_builder_configs,
            data_context=data_context,
            **kwargs,
        )

        if meta is None:
            meta = {}

        self._meta = meta

        if not isinstance(meta, dict):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""Argument "{meta}" in "{self.__class__.__name__}" must be of type "dictionary" \
(value of type "{str(type(meta))}" was encountered).
""")

        if condition and (not isinstance(condition, str)):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""Argument "{condition}" in "{self.__class__.__name__}" must be of type "string" \
(value of type "{str(type(condition))}" was encountered).
""")

        self._condition = condition

        self._validation_parameter_builder_configs = (
            validation_parameter_builder_configs)

        self._kwargs = kwargs
Ejemplo n.º 2
0
def validate_fully_qualified_parameter_name(fully_qualified_parameter_name: str):
    """Checks that the given parameter name begins with the "$" character.

    Args:
        fully_qualified_parameter_name: parameter name to be checked.

    Raises:
        ge_exceptions.ProfilerExecutionError: if the name does not start with "$".
    """
    if fully_qualified_parameter_name.startswith("$"):
        return

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Unable to get value for parameter name "{fully_qualified_parameter_name}" -- parameter \
names must start with $ (e.g., "${fully_qualified_parameter_name}").
"""
    )
    def _get_deterministic_estimate(
        self,
        metric_values: np.ndarray,
        domain: Domain,
        *,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        **kwargs,
    ) -> Tuple[Number, Number]:
        """
        Estimates the metric range by delegating to "compute_quantiles()" on the supplied metric values.

        Args:
            metric_values: metric values passed through to "compute_quantiles()".
            domain: Domain used when resolving the false_positive_rate reference.
            variables: optional variables of the "rule state".
            parameters: optional parameters of the "rule state".
            kwargs: may contain "false_positive_rate" (a literal or a parameter reference); 5.0e-2 is used if absent.

        Returns:
            The value returned by "compute_quantiles()".

        Raises:
            ge_exceptions.ProfilerExecutionError: if the resolved false_positive_rate is not within [0.0, 1.0].
        """
        # The false_positive_rate reference is taken from the keyword arguments (with a default) and then resolved
        # against the "rule state" (i.e., variables and parameters).
        false_positive_rate_reference = kwargs.get("false_positive_rate", 5.0e-2)
        false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=false_positive_rate_reference,
            expected_return_type=(float, np.float64),
            variables=variables,
            parameters=parameters,
        )

        # Expressed as a negated chained comparison so that NaN is rejected as well.
        rate_is_in_unit_interval = 0.0 <= false_positive_rate <= 1.0
        if not rate_is_in_unit_interval:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"The confidence level for {self.__class__.__name__} is outside of [0.0, 1.0] closed interval."
            )

        return compute_quantiles(
            metric_values=metric_values,
            false_positive_rate=false_positive_rate,
        )
Ejemplo n.º 4
0
def get_batch_ids(
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    batch_list: Optional[List[Batch]] = None,
    batch_request: Optional[Union[str, BatchRequestBase, dict]] = None,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[List[str]]:
    """Collects the "id" of every Batch, obtaining the Batch objects from "data_context" if none were supplied.

    Args:
        data_context: used to fetch the Batch list when "batch_list" holds no Batch objects.
        batch_list: explicitly supplied Batch objects (takes precedence over "batch_request").
        batch_request: request handed to "build_batch_request()" when "batch_list" is None or contains only None.
        domain: forwarded to "build_batch_request()".
        variables: forwarded to "build_batch_request()".
        parameters: forwarded to "build_batch_request()".

    Returns:
        List of Batch identifiers, or None if neither usable "batch_list" nor "batch_request" was provided.

    Raises:
        ge_exceptions.ProfilerExecutionError: if no Batch identifiers were obtained.
    """
    # Note: an empty "batch_list" also counts as "nothing supplied", because all() of an empty iterable is True.
    nothing_supplied: bool = batch_list is None or all(
        candidate is None for candidate in batch_list
    )
    if nothing_supplied:
        if batch_request is None:
            return None

        batch_request = build_batch_request(
            domain=domain,
            batch_request=batch_request,
            variables=variables,
            parameters=parameters,
        )
        batch_list = data_context.get_batch_list(batch_request=batch_request)

    batch_ids: List[str] = [entry.id for entry in batch_list]

    num_batch_ids: int = len(batch_ids)
    if not num_batch_ids:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""{__name__}.get_batch_ids() must return at least one batch_id ({num_batch_ids} were retrieved).
"""
        )

    return batch_ids
Ejemplo n.º 5
0
    def _get_round_decimals_using_heuristics(
        self,
        metric_values: np.ndarray,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> int:
        # Obtain round_decimals directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        round_decimals: Optional[
            int] = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.round_decimals,
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )
        if round_decimals is None:
            round_decimals = MAX_DECIMALS
        else:
            if not isinstance(round_decimals, int) or (round_decimals < 0):
                raise ge_exceptions.ProfilerExecutionError(
                    message=
                    f"""The directive "round_decimals" for {self.__class__.__name__} can be 0 or a
positive integer, or must be omitted (or set to None).
""")

        if np.issubdtype(metric_values.dtype, np.integer):
            round_decimals = 0

        return round_decimals
Ejemplo n.º 6
0
def get_parameter_value_and_validate_return_type(
    domain: Optional[Domain] = None,
    parameter_reference: Optional[Union[Any, str]] = None,
    expected_return_type: Optional[Union[type, tuple]] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[Any]:
    """
    Resolves "parameter_reference" through "get_parameter_value()" and optionally type-checks the outcome.

    The reference may be given as an object (literal, dict, any typed object, etc.) or as a fully-qualified parameter
    name.  A dictionary reference is first passed through "safe_deep_copy()" (presumably so that resolution cannot
    alter the caller's object -- confirm against "get_parameter_value()").

    Args:
        domain: forwarded to "get_parameter_value()".
        parameter_reference: object or fully-qualified parameter name to be resolved.
        expected_return_type: type (or tuple of types) the resolved value must be an instance of; None skips the check.
        variables: forwarded to "get_parameter_value()".
        parameters: forwarded to "get_parameter_value()".

    Returns:
        The resolved value.

    Raises:
        ge_exceptions.ProfilerExecutionError: if the resolved value is not an instance of "expected_return_type".
    """
    reference_to_resolve = (
        safe_deep_copy(data=parameter_reference)
        if isinstance(parameter_reference, dict)
        else parameter_reference
    )

    resolved_value = get_parameter_value(
        domain=domain,
        parameter_reference=reference_to_resolve,
        variables=variables,
        parameters=parameters,
    )

    type_check_passes: bool = expected_return_type is None or isinstance(
        resolved_value, expected_return_type
    )
    if type_check_passes:
        return resolved_value

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Argument "{resolved_value}" must be of type "{str(expected_return_type)}" \
(value of type "{str(type(resolved_value))}" was encountered).
"""
    )
def validate_fully_qualified_parameter_name(
    fully_qualified_parameter_name: str,
) -> None:
    """Checks the parameter name with "is_fully_qualified_parameter_name_literal_string_format()".

    Args:
        fully_qualified_parameter_name: parameter name to be checked.

    Raises:
        ge_exceptions.ProfilerExecutionError: if the format check rejects the name.
    """
    name_is_well_formed = is_fully_qualified_parameter_name_literal_string_format(
        fully_qualified_parameter_name=fully_qualified_parameter_name
    )
    if name_is_well_formed:
        return

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Unable to get value for parameter name "{fully_qualified_parameter_name}" -- parameter \
names must start with {FULLY_QUALIFIED_PARAMETER_NAME_DELIMITER_CHARACTER} (e.g., "{FULLY_QUALIFIED_PARAMETER_NAME_DELIMITER_CHARACTER}{fully_qualified_parameter_name}").
"""
    )
Ejemplo n.º 8
0
def get_validator(
    purpose: str,
    *,
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    batch_list: Optional[List[Batch]] = None,
    batch_request: Optional[Union[str, BatchRequestBase, dict]] = None,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional["Validator"]:  # noqa: F821
    """Obtains a Validator (bound to a temporary, uniquely named ExpectationSuite) with progress bars disabled.

    Args:
        purpose: label embedded in the temporary ExpectationSuite name ("tmp.<purpose>...").
        data_context: used to create the ExpectationSuite and the Validator.
        batch_list: explicitly supplied Batch objects (takes precedence over "batch_request").
        batch_request: request handed to "build_batch_request()" when "batch_list" is None or contains only None.
        domain: if given, its "id" becomes part of the ExpectationSuite name; also forwarded to
        "build_batch_request()".
        variables: forwarded to "build_batch_request()".
        parameters: forwarded to "build_batch_request()".

    Returns:
        Validator object, or None if neither usable "batch_list" nor "batch_request" was provided.

    Raises:
        ge_exceptions.ProfilerExecutionError: if the supplied "batch_list" has no elements.
    """
    # Name layout: tmp.<purpose>[_<domain.id>]_suite_<first 8 characters of a random UUID>
    name_parts: List[str] = [f"tmp.{purpose}"]
    if domain is not None:
        name_parts.append(f"{domain.id}")
    name_parts.append(f"suite_{str(uuid.uuid4())[:8]}")
    expectation_suite_name: str = "_".join(name_parts)

    validator: Optional["Validator"]  # noqa: F821

    nothing_supplied: bool = batch_list is None or all(
        candidate is None for candidate in batch_list
    )
    if nothing_supplied:
        if batch_request is None:
            return None

        batch_request = build_batch_request(
            domain=domain,
            batch_request=batch_request,
            variables=variables,
            parameters=parameters,
        )
        validator = data_context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=expectation_suite_name,
        )
    else:
        # An empty list is already routed to the branch above (all() of an empty iterable is True); the check below
        # is retained as a safeguard.
        num_batches: int = len(batch_list)
        if not num_batches:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""{__name__}.get_validator() must utilize at least one Batch ({num_batches} are available).
"""
            )

        expectation_suite: ExpectationSuite = data_context.create_expectation_suite(
            expectation_suite_name=expectation_suite_name
        )
        validator = data_context.get_validator_using_batch_list(
            expectation_suite=expectation_suite,
            batch_list=batch_list,
        )

    # Always disabled for RBP and DataAssistants due to volume of metric calculations
    validator.show_progress_bars = False
    return validator
Ejemplo n.º 9
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """
        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        num_batch_ids: int = len(batch_ids)
        if num_batch_ids != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Utilizing a {self.__class__.__name__} requires exactly one Batch of data to be available
({num_batch_ids} Batch identifiers found).
"""
            )

        # Compute metric value for one Batch object (expressed as list of Batch objects).
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            json_serialize=False,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve metric values for one Batch object (expressed as list of Batch objects).
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: None
                if parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] is None
                else parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY][0],
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: parameter_node[
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                ],
            }
        )
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Builds one Domain whose "column_list" holds the effective column names.

        Args:
            rule_name: name of Rule object, for which "Domain" objects are obtained.
            variables: Optional variables to substitute when evaluating.

        Returns:
            Single-element list with a Domain covering all effective columns; its details record the inferred
            semantic type of each column.

        Raises:
            ge_exceptions.ProfilerExecutionError: if "include_column_names" or the effective column names are empty.
        """
        effective_column_names: List[str] = self.get_effective_column_names(
            batch_ids=self.get_batch_ids(variables=variables),
            validator=self.get_validator(variables=variables),
            variables=variables,
        )

        if not self.include_column_names or not effective_column_names:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Error: "column_list" in {self.__class__.__name__} must not be empty.'
            )

        inferred_types: Dict[str, SemanticDomainTypes] = {
            name: self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
                name
            ]
            for name in effective_column_names
        }

        return [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={"column_list": effective_column_names},
                details={INFERRED_SEMANTIC_TYPE_KEY: inferred_types},
                rule_name=rule_name,
            )
        ]
Ejemplo n.º 11
0
    def get_batch_id(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> Optional[str]:
        batch_ids: Optional[List[str]] = self._get_batch_ids(
            variables=variables, )
        num_batch_ids: int = len(batch_ids)
        if num_batch_ids != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""{self.__class__.__name__}.get_batch_id() expected to return exactly one batch_id \
({num_batch_ids} were retrieved).
""")

        return batch_ids[0]
    def _get_truncate_values_using_heuristics(
        self,
        metric_values: np.ndarray,
        domain: Domain,
        *,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> Dict[str, Union[Optional[int], Optional[float]]]:
        # Obtain truncate_values directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        truncate_values: Dict[
            str, Optional[Number]
        ] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.truncate_values,
            expected_return_type=dict,
            variables=variables,
            parameters=parameters,
        )

        distribution_boundary: Optional[Union[int, float]]
        if not all(
            [
                (
                    distribution_boundary is None
                    or is_numeric(value=distribution_boundary)
                )
                for distribution_boundary in truncate_values.values()
            ]
        ):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""The directive "truncate_values" for {self.__class__.__name__} must specify the
[lower_bound, upper_bound] closed interval, where either boundary is a numeric value (or None).
"""
            )

        lower_bound: Optional[Number] = truncate_values.get("lower_bound")
        upper_bound: Optional[Number] = truncate_values.get("upper_bound")

        if lower_bound is None and np.all(np.greater(metric_values, NP_EPSILON)):
            lower_bound = 0.0

        if upper_bound is None and np.all(np.less(metric_values, (-NP_EPSILON))):
            upper_bound = 0.0

        return {
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
        }
Ejemplo n.º 13
0
    def __init__(
        self,
        expectation_type: str,
        meta: Optional[Dict[str, Any]] = None,
        success_on_last_run: Optional[bool] = None,
        **kwargs,
    ):
        self._expectation_type = expectation_type
        self._expectation_kwargs = kwargs
        if meta is None:
            meta = {}
        if not isinstance(meta, dict):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Argument "{meta}" in "{self.__class__.__name__}" must be of type "dictionary" \
(value of type "{str(type())}" was encountered).
"""
            )
        self._meta = meta
        self._success_on_last_run = success_on_last_run
Ejemplo n.º 14
0
    def __init__(
        self,
        data_context: DataContext,
        batch_request: Optional[Union[dict, str]] = None,
    ):
        """
        Stores the DataContext and the optional batch request after checking that a DataContext was supplied.

        Args:
            data_context: DataContext (required; must not be None).
            batch_request: specified in DomainBuilder configuration to get Batch objects for domain computation.

        Raises:
            ge_exceptions.ProfilerExecutionError: if "data_context" is None.
        """
        context_is_missing: bool = data_context is None
        if context_is_missing:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"{self.__class__.__name__} requires a data_context, but none was provided."
            )

        self._data_context, self._batch_request = data_context, batch_request
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Obtains and returns domains for all columns of a table (or for configured columns, if they exist in the table).

        If no column names are configured, the table columns are stored on this instance as the column names.

        Args:
            variables: optional variables, forwarded when obtaining the Batch identifier and the Validator.

        Returns:
            One Domain per column name, each with the "column" domain kwarg set.

        Raises:
            ge_exceptions.ProfilerExecutionError: if a configured column name is not among the table columns.
        """
        batch_id: str = self.get_batch_id(variables=variables)
        validator = self.get_validator(variables=variables)
        table_columns_metric = MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={"batch_id": batch_id},
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
        table_columns: List[str] = validator.get_metric(metric=table_columns_metric)

        if self.column_names is None:
            self.column_names = table_columns
        else:
            # Every configured column must be present in the table.
            for configured_name in self.column_names:
                if configured_name not in table_columns:
                    raise ge_exceptions.ProfilerExecutionError(
                        message=f'Error: The column "{configured_name}" in BatchData does not exist.'
                    )

        return [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={"column": name},
            )
            for name in self.column_names
        ]
Ejemplo n.º 16
0
    def __init__(
            self,
            name: str,
            metric_name: Optional[str] = None,
            metric_multi_batch_parameter_builder_name: Optional[str] = None,
            metric_domain_kwargs: Optional[Union[str, dict]] = None,
            metric_value_kwargs: Optional[Union[str, dict]] = None,
            enforce_numeric_metric: Union[str, bool] = True,
            replace_nan_with_zero: Union[str, bool] = True,
            reduce_scalar_metric: Union[str, bool] = True,
            false_positive_rate: Union[str, float] = 5.0e-2,
            quantile_statistic_interpolation_method: str = "auto",
            estimator: str = "bootstrap",
            n_resamples: Optional[Union[str, int]] = None,
            bw_method: Optional[Union[str, float, Callable]] = None,
            random_seed: Optional[Union[str, int]] = None,
            include_estimator_samples_histogram_in_details: Union[
                str, bool] = False,
            truncate_values: Optional[Union[str, Dict[str, Union[
                Optional[int], Optional[float]]]]] = None,
            round_decimals: Optional[Union[str, int]] = None,
            evaluation_parameter_builder_configs: Optional[
                List[ParameterBuilderConfig]] = None,
            data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    ) -> None:
        """
        Args:
            name: the name of this parameter -- this is user-specified parameter name (from configuration); it is not
                the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter."
                and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>").
            metric_name: the name of a metric used in MetricConfiguration (must be a supported and registered metric)
            metric_multi_batch_parameter_builder_name: name of parameter that computes "metric_name" (for every Batch).
            metric_domain_kwargs: used in MetricConfiguration
            metric_value_kwargs: used in MetricConfiguration
            enforce_numeric_metric: used in MetricConfiguration to insure that metric computations return numeric values
            replace_nan_with_zero: if False, then if the computed metric gives NaN, then exception is raised; otherwise,
                if True (default), then if the computed metric gives NaN, then it is converted to the 0.0 (float) value.
            reduce_scalar_metric: if True (default), then reduces computation of 1-dimensional metric to scalar value.
            false_positive_rate: user-configured fraction between 0 and 1 expressing desired false positive rate for
                identifying unexpected values as judged by the upper- and lower- quantiles of the observed metric data.
            quantile_statistic_interpolation_method: Applicable only for the "bootstrap" sampling method --
                supplies value of (interpolation) "method" to "np.quantile()" statistic, used for confidence intervals.
            estimator: choice of the estimation algorithm: "oneshot" (one observation), "bootstrap" (default),
                or "kde" (kernel density estimation).
            n_resamples: Applicable only for the "bootstrap" and "kde" sampling methods -- if omitted (default), then
                9999 is used (default in
                "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html").
            bw_method: Applicable only for the "kde" sampling method -- if omitted (default), then "scott" is used.
                Possible values for the estimator bandwidth method are described at:
                https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html
            random_seed: Applicable only for the "bootstrap" and "kde" sampling methods -- if omitted (default), then
                uses "np.random.choice"; otherwise, utilizes "np.random.Generator(np.random.PCG64(random_seed))".
            include_estimator_samples_histogram_in_details: Applicable only for the "bootstrap" sampling method -- if
                True, then add 10-bin histogram of bootstraps to "details"; otherwise, omit this information (default).
            truncate_values: user-configured directive for whether or not to allow the computed parameter values
                (i.e., lower_bound, upper_bound) to take on values outside the specified bounds when packaged on output.
            round_decimals: user-configured non-negative integer indicating the number of decimals of the
                rounding precision of the computed parameter values (i.e., min_value, max_value) prior to packaging them
                on output.  If omitted, then no rounding is performed, unless the computed value is already an integer.
            evaluation_parameter_builder_configs: ParameterBuilder configurations, executing and making whose respective
                ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite.
                These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder".
            data_context: BaseDataContext associated with this ParameterBuilder
        """
        super().__init__(
            name=name,
            metric_name=metric_name,
            metric_domain_kwargs=metric_domain_kwargs,
            metric_value_kwargs=metric_value_kwargs,
            enforce_numeric_metric=enforce_numeric_metric,
            replace_nan_with_zero=replace_nan_with_zero,
            reduce_scalar_metric=reduce_scalar_metric,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        )

        self._metric_multi_batch_parameter_builder_name = (
            metric_multi_batch_parameter_builder_name)

        self._false_positive_rate = false_positive_rate

        self._quantile_statistic_interpolation_method = (
            quantile_statistic_interpolation_method)

        self._estimator = estimator

        self._n_resamples = n_resamples

        self._bw_method = bw_method

        self._random_seed = random_seed

        self._include_estimator_samples_histogram_in_details = (
            include_estimator_samples_histogram_in_details)

        self._round_decimals = round_decimals

        if not truncate_values:
            truncate_values = {
                "lower_bound": None,
                "upper_bound": None,
            }
        else:
            if not isinstance(truncate_values, str):
                truncate_values_keys: set = set(truncate_values.keys())
                if (not truncate_values_keys <=
                        NumericMetricRangeMultiBatchParameterBuilder.
                        RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=
                        f"""Unrecognized truncate_values key(s) in {self.__class__.__name__}:
"{str(truncate_values_keys - NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS)}" \
detected.
""")

        self._truncate_values = truncate_values
Ejemplo n.º 17
0
    def _build_parameters(
        self,
        parameter_container: ParameterContainer,
        domain: Domain,
        *,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ):
        """
         Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional details.

         :return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional details

         The algorithm operates according to the following steps:
         1. Obtain batch IDs of interest using DataContext and BatchRequest (unless passed explicitly as argument). Note
         that this specific BatchRequest was specified as part of configuration for the present ParameterBuilder class.
         2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
         3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
            order to have access to all Batch objects, on each of which the specified metric_name will be computed.
         4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
         5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
         6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
         7. Convert the list of floating point metric computation results to a numpy array (for further computations).
         Steps 8 -- 10 are for the "oneshot" sampling method only (the "bootstrap" method achieves same automatically):
         8. Compute the mean and the standard deviation of the metric (aggregated over all the gathered Batch objects).
         9. Compute number of standard deviations (as floating point) needed (around the mean) to achieve the specified
            false_positive_rate (note that false_positive_rate of 0.0 would result in infinite number of standard deviations,
            hence it is "nudged" by small quantity "epsilon" above 0.0 if false_positive_rate of 0.0 appears as argument).
            (Please refer to "https://en.wikipedia.org/wiki/Normal_distribution" and references therein for background.)
        10. Compute the "band" around the mean as the min_value and max_value (to be used in ExpectationConfiguration).
        11. Return [low, high] for the desired metric as estimated by the specified sampling method.
        12. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".
        """
        validator: Validator = self.get_validator(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        if not batch_ids:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"Utilizing a {self.__class__.__name__} requires a non-empty list of batch identifiers."
            )

        metric_computation_result: Dict[
            str, Union[Union[np.ndarray, List[Union[Any, Number]]],
                       Dict[str, Any]]] = self.get_metrics(
                           batch_ids=batch_ids,
                           validator=validator,
                           metric_name=self._metric_name,
                           metric_domain_kwargs=self._metric_domain_kwargs,
                           metric_value_kwargs=self._metric_value_kwargs,
                           enforce_numeric_metric=self._enforce_numeric_metric,
                           replace_nan_with_zero=self._replace_nan_with_zero,
                           domain=domain,
                           variables=variables,
                           parameters=parameters,
                       )
        metric_values: Union[np.ndarray, List[Union[
            Any, Number]]] = metric_computation_result["metric_values"]
        details: Dict[str, Any] = metric_computation_result["details"]

        # Obtain sampling_method directive from rule state (i.e., variables and parameters); from instance variable otherwise.
        sampling_method: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self._sampling_method,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        if not (sampling_method in NumericMetricRangeMultiBatchParameterBuilder
                .RECOGNIZED_SAMPLING_METHOD_NAMES):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "sampling_method" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{sampling_method}" was detected).
""")

        # Obtain false_positive_rate from rule state (i.e., variables and parameters); from instance variable otherwise.
        false_positive_rate: Union[
            Any, str] = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self._false_positive_rate,
                expected_return_type=float,
                variables=variables,
                parameters=parameters,
            )
        if not (0.0 <= false_positive_rate <= 1.0):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"The confidence level for {self.__class__.__name__} is outside of [0.0, 1.0] closed interval."
            )

        truncate_values: Dict[
            str, Number] = self._get_truncate_values_using_heuristics(
                metric_values=metric_values,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )
        lower_bound: Optional[float] = truncate_values.get("lower_bound")
        upper_bound: Optional[float] = truncate_values.get("upper_bound")

        round_decimals: int = self._get_round_decimals_using_heuristics(
            metric_values=metric_values,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        metric_values = np.array(metric_values, dtype=np.float64)

        lower_quantile: Union[Number, float]
        upper_quantile: Union[Number, float]

        if np.all(np.isclose(metric_values, metric_values[0])):
            # Computation is unnecessary if distribution is degenerate.
            lower_quantile = upper_quantile = metric_values[0]
        elif sampling_method == "bootstrap":
            lower_quantile, upper_quantile = self._get_bootstrap_estimate(
                metric_values=metric_values,
                false_positive_rate=false_positive_rate,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )
        else:
            lower_quantile, upper_quantile = compute_quantiles(
                metric_values=metric_values,
                false_positive_rate=false_positive_rate,
            )

        min_value: Union[Number, float]
        max_value: Union[Number, float]

        if round_decimals == 0:
            min_value = round(float(lower_quantile))
            max_value = round(float(upper_quantile))
        else:
            min_value = round(float(lower_quantile), round_decimals)
            max_value = round(float(upper_quantile), round_decimals)

        if lower_bound is not None:
            min_value = max(min_value, lower_bound)
        if upper_bound is not None:
            max_value = min(max_value, upper_bound)

        parameter_values: Dict[str, Any] = {
            f"$parameter.{self.parameter_name}": {
                "value": {
                    "min_value": min_value,
                    "max_value": max_value,
                },
                "details": details,
            },
        }

        build_parameter_container(parameter_container=parameter_container,
                                  parameter_values=parameter_values)
    def infer_semantic_domain_type_from_table_column_type(
        self,
        column_types_dict_list: List[Dict[str, Any]],
        column_name: str,
    ) -> InferredSemanticDomainType:
        # Note: As of Python 3.8, specifying argument type in Lambda functions is not supported by Lambda syntax.
        column_types_dict_list = list(
            filter(
                lambda column_type_dict: column_name == column_type_dict["name"
                                                                         ],
                column_types_dict_list,
            ))
        if len(column_types_dict_list) != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""Error: {len(column_types_dict_list)} columns were found while obtaining semantic type \
information.  Please ensure that the specified column name refers to exactly one column.
""")

        column_type: str = str(column_types_dict_list[0]["type"]).upper()

        semantic_column_type: SemanticDomainTypes
        if column_type in (
            {
                type_name.upper()
                for type_name in ProfilerTypeMapping.INT_TYPE_NAMES
            }
                | {
                    type_name.upper()
                    for type_name in ProfilerTypeMapping.FLOAT_TYPE_NAMES
                }):
            semantic_column_type = SemanticDomainTypes.NUMERIC
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.STRING_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.TEXT
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.BOOLEAN_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.LOGIC
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.DATETIME_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.DATETIME
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.BINARY_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.BINARY
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.CURRENCY_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.CURRENCY
        elif column_type in {
                type_name.upper()
                for type_name in ProfilerTypeMapping.IDENTIFIER_TYPE_NAMES
        }:
            semantic_column_type = SemanticDomainTypes.IDENTIFIER
        elif column_type in (
            {
                type_name.upper()
                for type_name in ProfilerTypeMapping.MISCELLANEOUS_TYPE_NAMES
            }
                | {
                    type_name.upper()
                    for type_name in ProfilerTypeMapping.RECORD_TYPE_NAMES
                }):
            semantic_column_type = SemanticDomainTypes.MISCELLANEOUS
        else:
            semantic_column_type = SemanticDomainTypes.UNKNOWN

        inferred_semantic_column_type: InferredSemanticDomainType = (
            InferredSemanticDomainType(
                semantic_domain_type=semantic_column_type,
                details={
                    "algorithm_type":
                    "deterministic",
                    "mechanism":
                    "lookup_table",
                    "source":
                    "great_expectations.profile.base.ProfilerTypeMapping",
                },
            ))

        return inferred_semantic_column_type
Ejemplo n.º 19
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Scores every candidate "strftime" format string against the column and reports the best-fitting one.

        Check the percentage of values matching each string, and return the best fit, or None if no string exceeds the
        configured threshold.

        The success ratio of a candidate is "(nonnull_count - unexpected_count) / nonnull_count", with both counts
        summed over all Batches.  When there are no non-null values at all, every candidate receives the ratio 0.0
        (instead of a division by zero taking place).

        Args:
            domain: Domain object passed through to metric computation and to "rule state" lookups.
            variables: Part of the "rule state" available for "$variable"-style references.
            parameters: Part of the "rule state" available for "$parameter"-style references.
            recompute_existing_parameter_values: Not used inside this method body.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.

        Raises:
            ProfilerExecutionError: if the non-null count computation does not produce exactly one
            "AttributedResolvedMetrics" element, or if that element carries no metric values.
        """
        metric_computation_result: MetricComputationResult

        metric_computation_result = self.get_metrics(
            metric_name="column_values.nonnull.count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        # This should never happen.
        if len(metric_computation_result.attributed_resolved_metrics) != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Result of metric computations for {self.__class__.__name__} must be a list with exactly 1 element of type "AttributedResolvedMetrics" ({metric_computation_result.attributed_resolved_metrics} found).'
            )

        attributed_resolved_metrics: AttributedResolvedMetrics

        attributed_resolved_metrics = (
            metric_computation_result.attributed_resolved_metrics[0]
        )

        metric_values: MetricValues

        metric_values = attributed_resolved_metrics.metric_values

        if metric_values is None:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"Result of metric computations for {self.__class__.__name__} is empty."
            )

        # Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).
        metric_values = metric_values[:, 0]

        nonnull_count: int = sum(metric_values)

        # Obtain candidate_strings from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        candidate_strings: Union[
            List[str],
            Set[str],
        ] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.candidate_strings,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # Gather "metric_value_kwargs" for all candidate "strftime_format" strings: each entry is the configured
        # "metric_value_kwargs" (if any) extended with one candidate format.
        format_string: str
        match_strftime_metric_value_kwargs_list: List[dict] = [
            {
                **(self.metric_value_kwargs or {}),
                "strftime_format": format_string,
            }
            for format_string in candidate_strings
        ]

        # Obtain resolved metrics and metadata for all metric configurations and available Batch objects simultaneously.
        metric_computation_result = self.get_metrics(
            metric_name="column_values.match_strftime_format.unexpected_count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=match_strftime_metric_value_kwargs_list,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        format_string_success_ratios: dict = {}

        for (
            attributed_resolved_metrics
        ) in metric_computation_result.attributed_resolved_metrics:
            # Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).
            metric_values = attributed_resolved_metrics.metric_values[:, 0]

            match_strftime_unexpected_count: int = sum(metric_values)

            # With zero non-null values no format can be said to match; avoid dividing by zero.
            success_ratio: float = (
                (nonnull_count - match_strftime_unexpected_count) / nonnull_count
                if nonnull_count
                else 0.0
            )
            format_string_success_ratios[
                attributed_resolved_metrics.metric_attributes["strftime_format"]
            ] = success_ratio

        # Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        threshold: float = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.threshold,
            expected_return_type=float,
            variables=variables,
            parameters=parameters,
        )

        # get best-matching datetime string that matches greater than threshold
        best_format_string: str
        best_ratio: float
        (
            best_format_string,
            best_ratio,
        ) = ParameterBuilder._get_best_candidate_above_threshold(
            format_string_success_ratios, threshold
        )
        # dict of sorted datetime and ratios for all evaluated candidates
        sorted_format_strings_and_ratios: dict = (
            ParameterBuilder._get_sorted_candidates_and_ratios(
                format_string_success_ratios
            )
        )

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: best_format_string,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: {
                    "success_ratio": best_ratio,
                    "candidate_strings": sorted_format_strings_and_ratios,
                },
            }
        )
Ejemplo n.º 20
0
    def _sanitize_metric_computation(
        self,
        metric_name: str,
        attributed_resolved_metrics: AttributedResolvedMetrics,
        enforce_numeric_metric: Union[str, bool] = False,
        replace_nan_with_zero: Union[str, bool] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> AttributedResolvedMetrics:
        """
        This method conditions (or "sanitizes") data samples in the format "N x R^m", where "N" (most significant
        dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the multi-dimensional
        metric, whose values are being estimated.  The "conditioning" operations are:
        1. If "enforce_numeric_metric" flag is set, raise an error if a non-numeric value is found in sample vectors.
        2. Further, if a NaN is encountered in a sample vectors and "replace_nan_with_zero" is True, then replace those
        NaN values with the 0.0 floating point number; if "replace_nan_with_zero" is False, then raise an error.
        """
        # Obtain enforce_numeric_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        enforce_numeric_metric = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=enforce_numeric_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain replace_nan_with_zero from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        replace_nan_with_zero = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=replace_nan_with_zero,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        if not (enforce_numeric_metric or replace_nan_with_zero):
            return attributed_resolved_metrics

        metric_values_by_batch_id: Dict[str, MetricValue] = {}

        batch_id: str
        metric_values: MetricValues
        for (
            batch_id,
            metric_values,
        ) in attributed_resolved_metrics.conditioned_attributed_metric_values.items():
            batch_metric_values: MetricValues = []

            metric_value_shape: tuple = metric_values.shape

            # Generate all permutations of indexes for accessing every element of the multi-dimensional metric.
            metric_value_shape_idx: int
            axes: List[np.ndarray] = [
                np.indices(dimensions=(metric_value_shape_idx,))[0]
                for metric_value_shape_idx in metric_value_shape
            ]
            metric_value_indices: List[tuple] = list(itertools.product(*tuple(axes)))

            metric_value_idx: tuple
            for metric_value_idx in metric_value_indices:
                metric_value: MetricValue = metric_values[metric_value_idx]
                if enforce_numeric_metric:
                    if not np.issubdtype(metric_value.dtype, np.number):
                        raise ge_exceptions.ProfilerExecutionError(
                            message=f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(metric_value.dtype)}" was computed).
"""
                        )

                    if np.isnan(metric_value):
                        if not replace_nan_with_zero:
                            raise ValueError(
                                f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
"""
                            )

                        batch_metric_values.append(0.0)
                    else:
                        batch_metric_values.append(metric_value)

            metric_values_by_batch_id[batch_id] = batch_metric_values

        attributed_resolved_metrics.metric_values_by_batch_id = (
            metric_values_by_batch_id
        )

        return attributed_resolved_metrics
Ejemplo n.º 21
0
    def get_metrics(
        self,
        metric_name: str,
        metric_domain_kwargs: Optional[Union[Union[str, dict],
                                             List[Union[str, dict]]]] = None,
        metric_value_kwargs: Optional[Union[Union[str, dict],
                                            List[Union[str, dict]]]] = None,
        enforce_numeric_metric: Union[str, bool] = False,
        replace_nan_with_zero: Union[str, bool] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> MetricComputationResult:
        """
        General multi-batch metric computation facility.

        Computes specified metric (can be multi-dimensional, numeric, non-numeric, or mixed) and conditions (or
        "sanitizes") result according to two criteria: enforcing metric output to be numeric and handling NaN values.
        :param metric_name: Name of metric of interest, being computed.
        :param metric_domain_kwargs: Metric Domain Kwargs is an essential parameter of the MetricConfiguration object.
        :param metric_value_kwargs: Metric Value Kwargs is an essential parameter of the MetricConfiguration object.
        :param enforce_numeric_metric: Flag controlling whether or not metric output must be numerically-valued.
        :param replace_nan_with_zero: Directive controlling how NaN metric values, if encountered, should be handled.
        :param domain: Domain object scoping "$variable"/"$parameter"-style references in configuration and runtime.
        :param variables: Part of the "rule state" available for "$variable"-style references.
        :param parameters: Part of the "rule state" available for "$parameter"-style references.
        :return: MetricComputationResult object, containing both: data samples in the format "N x R^m", where "N" (most
        significant dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the
        multi-dimensional metric, whose values are being estimated, and details (to be used for metadata purposes).
        """
        if not metric_name:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""Utilizing "{self.__class__.__name__}.get_metrics()" requires valid "metric_name" to be \
specified (empty "metric_name" value detected).""")

        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        if not batch_ids:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"Utilizing a {self.__class__.__name__} requires a non-empty list of Batch identifiers."
            )
        """
        Compute metrics, corresponding to multiple "MetricConfiguration" directives, together, rather than individually.

        As a strategy, since "metric_domain_kwargs" changes depending on "batch_id", "metric_value_kwargs" serves as
        identifying entity (through "AttributedResolvedMetrics") for accessing resolved metrics (computation results).

        All "MetricConfiguration" directives are generated by combining each metric_value_kwargs" with
        "metric_domain_kwargs" for all "batch_ids" (where every "metric_domain_kwargs" represents separate "batch_id").
        Then, all "MetricConfiguration" objects, collected into list as container, are resolved simultaneously.
        """

        # Step-1: Gather "metric_domain_kwargs" (corresponding to "batch_ids").

        domain_kwargs: dict = build_metric_domain_kwargs(
            batch_id=None,
            metric_domain_kwargs=metric_domain_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        batch_id: str

        metric_domain_kwargs = [
            copy.deepcopy(
                build_metric_domain_kwargs(
                    batch_id=batch_id,
                    metric_domain_kwargs=copy.deepcopy(domain_kwargs),
                    domain=domain,
                    variables=variables,
                    parameters=parameters,
                )) for batch_id in batch_ids
        ]

        # Step-2: Gather "metric_value_kwargs" (caller may require same metric computed for multiple arguments).

        if not isinstance(metric_value_kwargs, list):
            metric_value_kwargs = [metric_value_kwargs]

        value_kwargs_cursor: dict
        metric_value_kwargs = [
            # Obtain value kwargs from "rule state" (i.e., variables and parameters); from instance variable otherwise.
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=value_kwargs_cursor,
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            ) for value_kwargs_cursor in metric_value_kwargs
        ]

        # Step-3: Generate "MetricConfiguration" directives for all "metric_domain_kwargs"/"metric_value_kwargs" pairs.

        domain_kwargs_cursor: dict
        kwargs_combinations: List[List[dict]] = [
            [domain_kwargs_cursor, value_kwargs_cursor]
            for value_kwargs_cursor in metric_value_kwargs
            for domain_kwargs_cursor in metric_domain_kwargs
        ]

        metrics_to_resolve: List[MetricConfiguration]

        kwargs_pair_cursor: List[dict, dict]
        metrics_to_resolve = [
            MetricConfiguration(
                metric_name=metric_name,
                metric_domain_kwargs=kwargs_pair_cursor[0],
                metric_value_kwargs=kwargs_pair_cursor[1],
                metric_dependencies=None,
            ) for kwargs_pair_cursor in kwargs_combinations
        ]

        # Step-4: Sort "MetricConfiguration" directives by "metric_value_kwargs_id" and "batch_id" (in that order).
        # This precise sort order enables pairing every metric value with its respective "batch_id" (e.g., for display).

        metrics_to_resolve = sorted(
            metrics_to_resolve,
            key=lambda metric_configuration_element: (
                metric_configuration_element.metric_value_kwargs_id,
                metric_configuration_element.metric_domain_kwargs["batch_id"],
            ),
        )

        # Step-5: Resolve all metrics in one operation simultaneously.

        # The Validator object used for metric calculation purposes.
        validator: "Validator" = self.get_validator(  # noqa: F821
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        resolved_metrics: Dict[Tuple[str, str, str],
                               Any] = validator.compute_metrics(
                                   metric_configurations=metrics_to_resolve)

        # Step-6: Sort resolved metrics according to same sort order as was applied to "MetricConfiguration" directives.

        resolved_metrics_sorted: Dict[Tuple[str, str, str], Any] = {}

        metric_configuration: MetricConfiguration

        resolved_metric_value: Any

        for metric_configuration in metrics_to_resolve:
            if metric_configuration.id not in resolved_metrics:
                logger.warning(
                    f"{metric_configuration.id[0]} was not found in the resolved Metrics for ParameterBuilder."
                )
                continue

            resolved_metrics_sorted[
                metric_configuration.id] = resolved_metrics[
                    metric_configuration.id]

        # Step-7: Map resolved metrics to their attributes for identification and recovery by receiver.

        attributed_resolved_metrics_map: Dict[str,
                                              AttributedResolvedMetrics] = {}

        attributed_resolved_metrics: AttributedResolvedMetrics

        for metric_configuration in metrics_to_resolve:
            attributed_resolved_metrics = attributed_resolved_metrics_map.get(
                metric_configuration.metric_value_kwargs_id)
            if attributed_resolved_metrics is None:
                attributed_resolved_metrics = AttributedResolvedMetrics(
                    metric_attributes=metric_configuration.metric_value_kwargs,
                    metric_values_by_batch_id=None,
                )
                attributed_resolved_metrics_map[
                    metric_configuration.
                    metric_value_kwargs_id] = attributed_resolved_metrics

            if metric_configuration.id in resolved_metrics_sorted:
                resolved_metric_value = resolved_metrics_sorted[
                    metric_configuration.id]
                attributed_resolved_metrics.add_resolved_metric(
                    batch_id=metric_configuration.
                    metric_domain_kwargs["batch_id"],
                    value=resolved_metric_value,
                )
            else:
                continue

        # Step-8: Convert scalar metric values to vectors to enable uniformity of processing in subsequent operations.

        metric_attributes_id: str
        for (
                metric_attributes_id,
                attributed_resolved_metrics,
        ) in attributed_resolved_metrics_map.items():
            if (isinstance(attributed_resolved_metrics.metric_values,
                           np.ndarray)
                    and attributed_resolved_metrics.metric_values.ndim == 1):
                attributed_resolved_metrics.metric_values_by_batch_id = {
                    batch_id: [resolved_metric_value]
                    for batch_id, resolved_metric_value in
                    attributed_resolved_metrics.attributed_metric_values.items(
                    )
                }
                attributed_resolved_metrics_map[
                    metric_attributes_id] = attributed_resolved_metrics

        # Step-9: Apply numeric/hygiene flags (e.g., "enforce_numeric_metric", "replace_nan_with_zero") to results.

        for (
                metric_attributes_id,
                attributed_resolved_metrics,
        ) in attributed_resolved_metrics_map.items():
            self._sanitize_metric_computation(
                metric_name=metric_name,
                attributed_resolved_metrics=attributed_resolved_metrics,
                enforce_numeric_metric=enforce_numeric_metric,
                replace_nan_with_zero=replace_nan_with_zero,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )

        # Step-10: Build and return result to receiver (apply simplifications to cases of single "metric_value_kwargs").

        return MetricComputationResult(
            attributed_resolved_metrics=list(
                attributed_resolved_metrics_map.values()),
            details={
                "metric_configuration": {
                    "metric_name":
                    metric_name,
                    "domain_kwargs":
                    domain_kwargs,
                    "metric_value_kwargs":
                    metric_value_kwargs[0]
                    if len(metric_value_kwargs) == 1 else metric_value_kwargs,
                    "metric_dependencies":
                    None,
                },
                "num_batches": len(batch_ids),
            },
        )
    def _build_parameters(
        self,
        parameter_container: ParameterContainer,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ):
        """
         Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional
         details.

         :return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and
         ptional details

         The algorithm operates according to the following steps:
         1. Obtain batch IDs of interest using DataContext and BatchRequest (unless passed explicitly as argument). Note
         that this specific BatchRequest was specified as part of configuration for the present ParameterBuilder class.
         2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
         3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
            order to have access to all Batch objects, on each of which the specified metric_name will be computed.
         4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
         5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
         6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
         7. Convert the multi-dimensional metric computation results to a numpy array (for further computations).
         Steps 8 -- 10 are for the "oneshot" sampling method only (the "bootstrap" method achieves same automatically):
         8. Compute the mean and the standard deviation of the metric (aggregated over all the gathered Batch objects).
         9. Compute number of standard deviations (as floating point) needed (around the mean) to achieve the specified
            false_positive_rate (note that false_positive_rate of 0.0 would result in infinite number of standard deviations,
            hence it is "nudged" by small quantity "epsilon" above 0.0 if false_positive_rate of 0.0 appears as argument).
            (Please refer to "https://en.wikipedia.org/wiki/Normal_distribution" and references therein for background.)
        10. Compute the "band" around the mean as the min_value and max_value (to be used in ExpectationConfiguration).
        11. Return [low, high] for the desired metric as estimated by the specified sampling method.
        12. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".
        """
        metric_computation_result: MetricComputationResult = self.get_metrics(
            metric_name=self.metric_name,
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            enforce_numeric_metric=self.enforce_numeric_metric,
            replace_nan_with_zero=self.replace_nan_with_zero,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        metric_values: np.ndarray = metric_computation_result.metric_values
        details: MetricComputationDetails = metric_computation_result.details

        # Obtain sampling_method directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        sampling_method: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.sampling_method,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        if (
            sampling_method
            not in NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES
        ):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""The directive "sampling_method" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{sampling_method}" was detected).
"""
            )

        estimator: Callable
        etimator_kwargs: dict
        if sampling_method == "bootstrap":
            estimator = self._get_bootstrap_estimate
            estimator_kwargs = {
                "false_positive_rate": self.false_positive_rate,
                "num_bootstrap_samples": self.num_bootstrap_samples,
            }
        else:
            estimator = self._get_deterministic_estimate
            estimator_kwargs = {
                "false_positive_rate": self.false_positive_rate,
            }

        metric_value_range: np.ndarray = self._estimate_metric_value_range(
            metric_values=metric_values,
            estimator=estimator,
            domain=domain,
            variables=variables,
            parameters=parameters,
            **estimator_kwargs,
        )

        parameter_values: Dict[str, Any] = {
            f"$parameter.{self.name}": {
                "value": {
                    "value_range": metric_value_range,
                },
                "details": details,
            },
        }

        build_parameter_container(
            parameter_container=parameter_container, parameter_values=parameter_values
        )
    def __init__(
        self,
        name: str,
        metric_name: str,
        metric_domain_kwargs: Optional[Union[str, dict]] = None,
        metric_value_kwargs: Optional[Union[str, dict]] = None,
        sampling_method: str = "bootstrap",
        enforce_numeric_metric: Union[str, bool] = True,
        replace_nan_with_zero: Union[str, bool] = True,
        reduce_scalar_metric: Union[str, bool] = True,
        false_positive_rate: Union[str, float] = 5.0e-2,
        num_bootstrap_samples: Optional[Union[str, int]] = None,
        round_decimals: Optional[Union[str, int]] = None,
        truncate_values: Optional[
            Union[str, Dict[str, Union[Optional[int], Optional[float]]]]
        ] = None,
        data_context: Optional["DataContext"] = None,  # noqa: F821
        batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None,
    ):
        """
        Args:
            name: the name of this parameter -- this is user-specified parameter name (from configuration);
            it is not the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter."
            and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>").
            metric_name: the name of a metric used in MetricConfiguration (must be a supported and registered metric)
            metric_domain_kwargs: used in MetricConfiguration
            metric_value_kwargs: used in MetricConfiguration
            sampling_method: choice of the sampling algorithm: "oneshot" (one observation) or "bootstrap" (default)
            enforce_numeric_metric: used in MetricConfiguration to insure that metric computations return numeric values
            replace_nan_with_zero: if False, then if the computed metric gives NaN, then exception is raised; otherwise,
            if True (default), then if the computed metric gives NaN, then it is converted to the 0.0 (float) value.
            reduce_scalar_metric: if True (default), then reduces computation of 1-dimensional metric to scalar value.
            false_positive_rate: user-configured fraction between 0 and 1 expressing desired false positive rate for
            identifying unexpected values as judged by the upper- and lower- quantiles of the observed metric data.
            num_bootstrap_samples: Applicable only for the "bootstrap" sampling method -- if omitted (default), then
            9999 is used (default in "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html").
            round_decimals: user-configured non-negative integer indicating the number of decimals of the
            rounding precision of the computed parameter values (i.e., min_value, max_value) prior to packaging them on
            output.  If omitted, then no rounding is performed, unless the computed value is already an integer.
            truncate_values: user-configured directive for whether or not to allow the computed parameter values
            (i.e., lower_bound, upper_bound) to take on values outside the specified bounds when packaged on output.
            data_context: DataContext
            batch_request: specified in ParameterBuilder configuration to get Batch objects for parameter computation.
        """
        super().__init__(
            name=name,
            data_context=data_context,
            batch_request=batch_request,
        )

        self._metric_name = metric_name
        self._metric_domain_kwargs = metric_domain_kwargs
        self._metric_value_kwargs = metric_value_kwargs

        self._sampling_method = sampling_method

        self._enforce_numeric_metric = enforce_numeric_metric
        self._replace_nan_with_zero = replace_nan_with_zero

        self._reduce_scalar_metric = reduce_scalar_metric

        self._false_positive_rate = false_positive_rate

        self._num_bootstrap_samples = num_bootstrap_samples

        self._round_decimals = round_decimals

        if not truncate_values:
            truncate_values = {
                "lower_bound": None,
                "upper_bound": None,
            }
        else:
            if not isinstance(truncate_values, str):
                truncate_values_keys: set = set(truncate_values.keys())
                if (
                    not truncate_values_keys
                    <= NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS
                ):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=f"""Unrecognized truncate_values key(s) in {self.__class__.__name__}:
"{str(truncate_values_keys - NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS)}" \
detected.
"""
                    )

        self._truncate_values = truncate_values
    def get_effective_column_names(
        self,
        batch_ids: Optional[List[str]] = None,
        validator: Optional["Validator"] = None,  # noqa: F821
        variables: Optional[ParameterContainer] = None,
    ) -> List[str]:
        """
        Apply the configured include/exclude directives and return the names of the columns that satisfy all of them.

        Directives are applied in this order: explicit column names, column name suffixes, inferred semantic types.
        Every directive is obtained from "rule state" (i.e., variables); from instance variable otherwise.

        Args:
            batch_ids: IDs of Batch objects; obtained through get_batch_ids() if omitted.  The last element is used
            as the "batch_id" for querying the "table.columns" metric.
            validator: Validator used to compute the "table.columns" metric; obtained through get_validator() if omitted.
            variables: part of the "rule state" available for "$variable"-style references.

        Returns:
            List of column names that remain after all filters have been applied.

        Raises:
            ProfilerExecutionError: if a column selected by the column name directives is absent from "table.columns".
            ValueError: if a column name suffixes directive is neither a string nor an iterable.

        Side effects:
            The instantiated semantic type filter is stored in "self._semantic_type_filter".
        """
        # Obtain include_column_names from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_column_names: Optional[
            List[str]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.include_column_names,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        # Obtain exclude_column_names from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        exclude_column_names: Optional[
            List[str]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.exclude_column_names,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        if batch_ids is None:
            batch_ids = self.get_batch_ids(variables=variables)

        if validator is None:
            validator = self.get_validator(variables=variables)

        table_columns: List[str] = validator.get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_ids[-1],  # active_batch_id
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        # Explicitly included names take precedence; all table columns are the fallback.
        effective_column_names: List[str] = include_column_names or table_columns

        if exclude_column_names is None:
            exclude_column_names = []

        column_name: str

        effective_column_names = [
            column_name
            for column_name in effective_column_names
            if column_name not in exclude_column_names
        ]

        # Explicitly included names are user input, hence they must be verified against the actual table columns.
        for column_name in effective_column_names:
            if column_name not in table_columns:
                raise ge_exceptions.ProfilerExecutionError(
                    message=f'Error: The column "{column_name}" in BatchData does not exist.'
                )

        # Obtain include_column_name_suffixes from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_column_name_suffixes: Optional[
            Union[str, Iterable, List[str]]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.include_column_name_suffixes,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        # Obtain exclude_column_name_suffixes from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        exclude_column_name_suffixes: Optional[
            Union[str, Iterable, List[str]]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.exclude_column_name_suffixes,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

        if include_column_name_suffixes:
            if isinstance(include_column_name_suffixes, str):
                include_column_name_suffixes = [include_column_name_suffixes]
            elif not isinstance(include_column_name_suffixes, (Iterable, list)):
                raise ValueError(
                    "Unrecognized include_column_name_suffixes directive -- must be a list or a string."
                )

            # Materialize the suffixes exactly once: a one-shot iterable would otherwise be exhausted after the
            # first column has been tested, causing every subsequent column to be compared against no suffixes.
            include_suffixes: tuple = tuple(include_column_name_suffixes)

            effective_column_names = [
                column_name
                for column_name in effective_column_names
                if column_name.endswith(include_suffixes)
            ]

        if exclude_column_name_suffixes:
            if isinstance(exclude_column_name_suffixes, str):
                exclude_column_name_suffixes = [exclude_column_name_suffixes]
            elif not isinstance(exclude_column_name_suffixes, (Iterable, list)):
                raise ValueError(
                    "Unrecognized exclude_column_name_suffixes directive -- must be a list or a string."
                )

            # Materialized once for the same reason as the "include" suffixes above.
            exclude_suffixes: tuple = tuple(exclude_column_name_suffixes)

            effective_column_names = [
                column_name
                for column_name in effective_column_names
                if not column_name.endswith(exclude_suffixes)
            ]

        # Obtain semantic_type_filter_module_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        semantic_type_filter_module_name: Optional[
            str
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.semantic_type_filter_module_name,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )
        if semantic_type_filter_module_name is None:
            semantic_type_filter_module_name = "great_expectations.rule_based_profiler.helpers.simple_semantic_type_filter"

        # Obtain semantic_type_filter_class_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        semantic_type_filter_class_name: Optional[
            str
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.semantic_type_filter_class_name,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )
        if semantic_type_filter_class_name is None:
            semantic_type_filter_class_name = "SimpleSemanticTypeFilter"

        # The filter is instantiated only for the columns that passed the name-based directives.
        semantic_type_filter: SemanticTypeFilter = instantiate_class_from_config(
            config={
                "module_name": semantic_type_filter_module_name,
                "class_name": semantic_type_filter_class_name,
            },
            runtime_environment={
                "batch_ids": batch_ids,
                "validator": validator,
                "column_names": effective_column_names,
            },
            config_defaults={},
        )
        self._semantic_type_filter = semantic_type_filter

        # Obtain include_semantic_types from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_semantic_types: Optional[
            Union[str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.include_semantic_types,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )
        include_semantic_types = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=include_semantic_types
            )
        )

        # Obtain exclude_semantic_types from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        exclude_semantic_types: Optional[
            Union[str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]]
        ] = get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=self.exclude_semantic_types,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )
        exclude_semantic_types = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=exclude_semantic_types
            )
        )

        if include_semantic_types:
            inferred_semantic_types = (
                self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map
            )
            effective_column_names = [
                column_name
                for column_name in effective_column_names
                if inferred_semantic_types[column_name] in include_semantic_types
            ]

        if exclude_semantic_types:
            inferred_semantic_types = (
                self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map
            )
            effective_column_names = [
                column_name
                for column_name in effective_column_names
                if inferred_semantic_types[column_name] not in exclude_semantic_types
            ]

        return effective_column_names
    def get_metrics(
        self,
        metric_name: str,
        metric_domain_kwargs: Optional[Union[str, dict]] = None,
        metric_value_kwargs: Optional[Union[str, dict]] = None,
        enforce_numeric_metric: Union[str, bool] = False,
        replace_nan_with_zero: Union[str, bool] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> MetricComputationResult:
        """
        Compute the specified metric once per Batch and return all measurements together with metadata.

        The metric may be multi-dimensional; a scalar measurement is wrapped into a one-element list, so that the
        measurements stack into an array whose most significant dimension is the number of Batch objects.

        :param metric_name: Name of metric of interest, being computed.
        :param metric_domain_kwargs: Metric Domain Kwargs is an essential parameter of the MetricConfiguration object.
        :param metric_value_kwargs: Metric Value Kwargs is an essential parameter of the MetricConfiguration object.
        :param enforce_numeric_metric: Flag controlling whether or not metric output must be numerically-valued.
        :param replace_nan_with_zero: Directive controlling how NaN metric values, if encountered, should be handled.
        :param domain: Domain object scoping "$variable"/"$parameter"-style references in configuration and runtime.
        :param variables: Part of the "rule state" available for "$variable"-style references.
        :param parameters: Part of the "rule state" available for "$parameter"-style references.
        :return: MetricComputationResult object, containing both: data samples in the format "N x R^m", where "N" (most
        significant dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the
        multi-dimensional metric, whose values are being estimated, and details (to be used for metadata purposes).
        :raises ProfilerExecutionError: if no batch identifiers are available.
        """
        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        if not batch_ids:
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"Utilizing a {self.__class__.__name__} requires a non-empty list of batch identifiers."
            )

        # Batch-independent domain kwargs; these are reported (unmodified) in the returned details.
        resolved_domain_kwargs = build_metric_domain_kwargs(
            batch_id=None,
            metric_domain_kwargs=metric_domain_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        # Working copy, whose "batch_id" entry is overwritten for every Batch in the loop below.
        per_batch_domain_kwargs: dict = copy.deepcopy(resolved_domain_kwargs)

        # Obtain value kwargs from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        resolved_value_kwargs = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=metric_value_kwargs,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # The Validator object used for metric calculation purposes.
        validator: "Validator" = self.get_validator(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        measurements: list = []
        for current_batch_id in batch_ids:
            per_batch_domain_kwargs["batch_id"] = current_batch_id
            metric_configuration = MetricConfiguration(
                metric_name=metric_name,
                metric_domain_kwargs=per_batch_domain_kwargs,
                metric_value_kwargs=resolved_value_kwargs,
                metric_dependencies=None,
            )
            measurement = validator.get_metric(metric=metric_configuration)
            # Scalars are wrapped, so that every row of the resulting array is a sequence.
            measurements.append(
                [measurement] if np.isscalar(measurement) else measurement
            )

        metric_values = np.array(measurements)

        # NOTE(review): the return value of this helper is not used here; presumably it validates and/or adjusts
        # "metric_values" in place -- confirm against the helper's implementation.
        self._sanitize_metric_computation(
            metric_name=metric_name,
            metric_values=metric_values,
            enforce_numeric_metric=enforce_numeric_metric,
            replace_nan_with_zero=replace_nan_with_zero,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        details = {
            "metric_configuration": {
                "metric_name": metric_name,
                "domain_kwargs": resolved_domain_kwargs,
                "metric_value_kwargs": resolved_value_kwargs,
                "metric_dependencies": None,
            },
            "num_batches": len(metric_values),
        }
        return MetricComputationResult(
            metric_values=metric_values,
            details=details,
        )
Ejemplo n.º 26
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.

         The algorithm operates according to the following steps:
         1. Obtain batch IDs of interest using BaseDataContext and BatchRequest (unless passed explicitly as argument).
         2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
         3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
            order to have access to all Batch objects, on each of which the specified metric_name will be computed.
         4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
         5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
         6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
         7. Convert the multi-dimensional metric computation results to a numpy array (for further computations).
         8. Compute [low, high] for the desired metric using the chosen estimator method.
         9. Return [low, high] for the desired metric as estimated by the specified sampling method.
        10. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".
        """
        # Obtain false_positive_rate from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.false_positive_rate,
            expected_return_type=(float, np.float64),
            variables=variables,
            parameters=parameters,
        )

        if not (0.0 <= false_positive_rate <= 1.0):
            raise ge_exceptions.ProfilerExecutionError(
                f"""false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1],
but {false_positive_rate} was provided.""")
        elif false_positive_rate <= NP_EPSILON:
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead.""")
            false_positive_rate = NP_EPSILON
        elif false_positive_rate >= (1.0 - NP_EPSILON):
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 1.
A false_positive_rate of {1.0-NP_EPSILON} has been selected instead.""")
            false_positive_rate = np.float64(1.0 - NP_EPSILON)

        parameter_reference: str
        if self.metric_multi_batch_parameter_builder_name:
            # Obtain metric_multi_batch_parameter_builder_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
            metric_multi_batch_parameter_builder_name: str = (
                get_parameter_value_and_validate_return_type(
                    domain=domain,
                    parameter_reference=self.
                    metric_multi_batch_parameter_builder_name,
                    expected_return_type=str,
                    variables=variables,
                    parameters=parameters,
                ))
            parameter_reference = (
                f"{RAW_PARAMETER_KEY}{metric_multi_batch_parameter_builder_name}"
            )
        else:
            # Compute metric value for each Batch object.
            super().build_parameters(
                domain=domain,
                variables=variables,
                parameters=parameters,
                parameter_computation_impl=super()._build_parameters,
                recompute_existing_parameter_values=
                recompute_existing_parameter_values,
            )
            parameter_reference = self.raw_fully_qualified_parameter_name

        # Retrieve metric values for all Batch objects.
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=parameter_reference,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        metric_values: MetricValues = parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Obtain estimator directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        estimator: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.estimator,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        if (estimator not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_SAMPLING_METHOD_NAMES):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "estimator" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{estimator}" was detected).
""")

        round_decimals: int

        # Obtain quantile_statistic_interpolation_method directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        quantile_statistic_interpolation_method: str = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                quantile_statistic_interpolation_method,
                expected_return_type=str,
                variables=variables,
                parameters=parameters,
            ))
        if (quantile_statistic_interpolation_method
                not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "quantile_statistic_interpolation_method" for {self.__class__.__name__} can \
be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS} \
("{quantile_statistic_interpolation_method}" was detected).
""")

        if integer_semantic_domain_type(domain=domain):
            round_decimals = 0
        else:
            round_decimals = self._get_round_decimals_using_heuristics(
                metric_values=metric_values,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )

        if quantile_statistic_interpolation_method == "auto":
            if round_decimals == 0:
                quantile_statistic_interpolation_method = "nearest"
            else:
                quantile_statistic_interpolation_method = "linear"

        estimator_func: Callable
        estimator_kwargs: dict
        if estimator == "bootstrap":
            estimator_func = self._get_bootstrap_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "random_seed": self.random_seed,
            }
        elif estimator == "kde":
            estimator_func = self._get_kde_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "bw_method": self.bw_method,
                "random_seed": self.random_seed,
            }
        else:
            estimator_func = self._get_deterministic_estimate
            estimator_kwargs = {
                "false_positive_rate":
                false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
            }

        numeric_range_estimation_result: NumericRangeEstimationResult = (
            self._estimate_metric_value_range(
                metric_values=metric_values,
                estimator_func=estimator_func,
                round_decimals=round_decimals,
                domain=domain,
                variables=variables,
                parameters=parameters,
                **estimator_kwargs,
            ))

        value_range: np.ndarray = numeric_range_estimation_result.value_range
        details: Dict[str, Any] = copy.deepcopy(
            parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY])

        # Obtain include_estimator_samples_histogram_in_details from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_estimator_samples_histogram_in_details: bool = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                include_estimator_samples_histogram_in_details,
                expected_return_type=bool,
                variables=variables,
                parameters=parameters,
            ))

        if include_estimator_samples_histogram_in_details:
            details[
                "estimation_histogram"] = numeric_range_estimation_result.estimation_histogram

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: value_range,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        })
Ejemplo n.º 27
0
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return domains matching the specified tolerance limits.

        Args:
            rule_name: name of Rule object, for which "Domain" objects are obtained.
            variables: Optional variables to substitute when evaluating.

        Returns:
            List of domains that match the desired tolerance limits.
        """
        batch_ids: List[str] = self.get_batch_ids(variables=variables)

        validator: "Validator" = self.get_validator(
            variables=variables)  # noqa: F821

        effective_column_names: List[str] = self.get_effective_column_names(
            batch_ids=batch_ids,
            validator=validator,
            variables=variables,
        )

        if not (effective_column_names and (len(effective_column_names) == 2)):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""Error: Columns specified for {self.__class__.__name__} in sorted order must correspond to \
"column_A" and "column_B" (in this exact order).
""")

        effective_column_names = sorted(effective_column_names)

        domain_kwargs: Dict[str, str] = dict(
            zip(
                [
                    "column_A",
                    "column_B",
                ],
                effective_column_names,
            ))

        column_name: str
        semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = {
            column_name: self.semantic_type_filter.
            table_column_name_to_inferred_semantic_domain_type_map[column_name]
            for column_name in effective_column_names
        }

        domains: List[Domain] = [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs=domain_kwargs,
                details={
                    INFERRED_SEMANTIC_TYPE_KEY: semantic_types_by_column_name,
                },
                rule_name=rule_name,
            ),
        ]

        return domains
Ejemplo n.º 28
0
    def get_effective_column_names(
        self,
        batch_ids: Optional[List[str]] = None,
        validator: Optional["Validator"] = None,  # noqa: F821
        variables: Optional[ParameterContainer] = None,
    ) -> List[str]:
        """
        Determine names of columns to be included as part of returned "Domain" objects.

        Starting from "include_column_names" (or from all columns reported by "table.columns" metric, if none are
        given), candidate columns are narrowed down by applying, in this order: "exclude_column_names",
        "include_column_name_suffixes", "exclude_column_name_suffixes", "include_semantic_types", and
        "exclude_semantic_types" directives.

        As a side effect, semantic type filter is instantiated and stored on this instance ("_semantic_type_filter").

        Args:
            batch_ids: Batch identifiers; obtained through "get_batch_ids()" if not supplied.
            validator: Validator used for computing metrics; obtained through "get_validator()" if not supplied.
            variables: Optional variables to substitute when evaluating.

        Returns:
            List of column names remaining after all directives have been applied.

        Raises:
            ProfilerExecutionError: if a column that survives name-based exclusion is absent from table columns.
        """
        include_column_names: List[str] = cast(
            List[str],
            self._resolve_list_type_property(
                property_name="include_column_names",
                property_value_type=list,
                variables=variables,
            ),
        )

        if batch_ids is None:
            batch_ids = self.get_batch_ids(variables=variables)

        if validator is None:
            validator = self.get_validator(variables=variables)

        table_columns_metric: MetricConfiguration = MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_ids[-1],  # active_batch_id
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
        table_columns: List[str] = validator.get_metric(
            metric=table_columns_metric)

        # Explicitly requested columns take precedence; otherwise, every column of the table is a candidate.
        effective_column_names: List[str] = (include_column_names
                                             or table_columns)

        exclude_column_names: List[str] = cast(
            List[str],
            self._resolve_list_type_property(
                property_name="exclude_column_names",
                property_value_type=list,
                variables=variables,
            ),
        )

        candidate: str
        effective_column_names = [
            candidate for candidate in effective_column_names
            if candidate not in exclude_column_names
        ]

        # Every column still under consideration must actually be present in the table.
        column_name: str
        for column_name in effective_column_names:
            if column_name in table_columns:
                continue

            raise ge_exceptions.ProfilerExecutionError(
                message=
                f'Error: The column "{column_name}" in BatchData does not exist.'
            )

        include_column_name_suffixes: List[str] = cast(
            List[str],
            self._resolve_list_type_property(
                property_name="include_column_name_suffixes",
                property_value_type=(str, Iterable, list),
                variables=variables,
            ),
        )
        if include_column_name_suffixes:
            # "str.endswith()" accepts tuple of alternatives.
            required_suffixes: tuple = tuple(include_column_name_suffixes)
            effective_column_names = [
                candidate for candidate in effective_column_names
                if candidate.endswith(required_suffixes)
            ]

        exclude_column_name_suffixes: List[str] = cast(
            List[str],
            self._resolve_list_type_property(
                property_name="exclude_column_name_suffixes",
                property_value_type=(str, Iterable, list),
                variables=variables,
            ),
        )
        if exclude_column_name_suffixes:
            forbidden_suffixes: tuple = tuple(exclude_column_name_suffixes)
            effective_column_names = [
                candidate for candidate in effective_column_names
                if not candidate.endswith(forbidden_suffixes)
            ]

        # Obtain semantic_type_filter_module_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        semantic_type_filter_module_name: Optional[str] = (
            get_parameter_value_and_validate_return_type(
                domain=None,
                parameter_reference=self.semantic_type_filter_module_name,
                expected_return_type=None,
                variables=variables,
                parameters=None,
            ))
        if semantic_type_filter_module_name is None:
            semantic_type_filter_module_name = "great_expectations.rule_based_profiler.helpers.simple_semantic_type_filter"

        # Obtain semantic_type_filter_class_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        semantic_type_filter_class_name: Optional[str] = (
            get_parameter_value_and_validate_return_type(
                domain=None,
                parameter_reference=self.semantic_type_filter_class_name,
                expected_return_type=None,
                variables=variables,
                parameters=None,
            ))
        if semantic_type_filter_class_name is None:
            semantic_type_filter_class_name = "SimpleSemanticTypeFilter"

        # Semantic type filter is given only those columns that survived all name-based directives above.
        filter_config: Dict[str, str] = {
            "module_name": semantic_type_filter_module_name,
            "class_name": semantic_type_filter_class_name,
        }
        filter_runtime_environment: Dict[str, Any] = {
            "batch_ids": batch_ids,
            "validator": validator,
            "column_names": effective_column_names,
        }
        self._semantic_type_filter = instantiate_class_from_config(
            config=filter_config,
            runtime_environment=filter_runtime_environment,
            config_defaults={},
        )

        include_semantic_types: List[Union[str, SemanticDomainTypes]] = cast(
            List[Union[str, SemanticDomainTypes]],
            self._resolve_list_type_property(
                property_name="include_semantic_types",
                property_value_type=(str, SemanticDomainTypes, list),
                variables=variables,
            ),
        )
        include_semantic_types = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=include_semantic_types))

        if include_semantic_types:
            inferred_semantic_types_map = (
                self.semantic_type_filter.
                table_column_name_to_inferred_semantic_domain_type_map)
            effective_column_names = [
                candidate for candidate in effective_column_names
                if inferred_semantic_types_map[candidate] in
                include_semantic_types
            ]

        exclude_semantic_types: List[Union[str, SemanticDomainTypes]] = cast(
            List[Union[str, SemanticDomainTypes]],
            self._resolve_list_type_property(
                property_name="exclude_semantic_types",
                property_value_type=(str, SemanticDomainTypes, list),
                variables=variables,
            ),
        )
        exclude_semantic_types = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=exclude_semantic_types))

        if exclude_semantic_types:
            inferred_semantic_types_map = (
                self.semantic_type_filter.
                table_column_name_to_inferred_semantic_domain_type_map)
            effective_column_names = [
                candidate for candidate in effective_column_names
                if inferred_semantic_types_map[candidate] not in
                exclude_semantic_types
            ]

        return effective_column_names
    def _sanitize_metric_computation(
        self,
        metric_name: str,
        metric_values: np.ndarray,
        enforce_numeric_metric: Union[str, bool] = False,
        replace_nan_with_zero: Union[str, bool] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> np.ndarray:
        """
        This method conditions (or "sanitizes") data samples in the format "N x R^m", where "N" (most significant
        dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the multi-dimensional
        metric, whose values are being estimated.  The "conditioning" operations are:
        1. If "enforce_numeric_metric" flag is set, raise an error if a non-numeric value is found in sample vectors.
        2. Further, if a NaN is encountered in a sample vectors and "replace_nan_with_zero" is True, then replace those
        NaN values with the 0.0 floating point number; if "replace_nan_with_zero" is False, then raise an error.
        """
        # Obtain enforce_numeric_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        enforce_numeric_metric = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=enforce_numeric_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain replace_nan_with_zero from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        replace_nan_with_zero = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=replace_nan_with_zero,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Outer-most dimension is data samples (e.g., one per Batch); the rest are dimensions of the actual metric.
        metric_value_shape: tuple = metric_values.shape[1:]

        # Generate all permutations of indexes for accessing every element of the multi-dimensional metric.
        metric_value_shape_idx: int
        axes: List[np.ndarray] = [
            np.indices(dimensions=(metric_value_shape_idx, ))[0]
            for metric_value_shape_idx in metric_value_shape
        ]
        metric_value_indices: List[tuple] = list(
            itertools.product(*tuple(axes)))

        # Generate all permutations of indexes for accessing estimates of every element of the multi-dimensional metric.
        # Prefixing multi-dimensional index with "(slice(None, None, None),)" is equivalent to "[:,]" access.
        metric_value_idx: tuple
        metric_value_vector_indices: List[tuple] = [
            (slice(None, None, None), ) + metric_value_idx
            for metric_value_idx in metric_value_indices
        ]

        # Traverse indices of sample vectors corresponding to every element of multi-dimensional metric.
        metric_value_vector: np.ndarray
        for metric_value_idx in metric_value_vector_indices:
            # Obtain "N"-element-long vector of samples for each element of multi-dimensional metric.
            metric_value_vector = metric_values[metric_value_idx]
            if enforce_numeric_metric:
                if not np.issubdtype(metric_value_vector.dtype, np.number):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=
                        f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(metric_value_vector.dtype)}" was computed).
""")

                if np.any(np.isnan(metric_value_vector)):
                    if not replace_nan_with_zero:
                        raise ValueError(
                            f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
""")

                    np.nan_to_num(metric_value_vector, copy=True, nan=0.0)

        return metric_values
Ejemplo n.º 30
0
    def get_metrics(
        self,
        batch_ids: List[str],
        validator: Validator,
        metric_name: str,
        metric_domain_kwargs: Optional[Union[str, dict]] = None,
        metric_value_kwargs: Optional[Union[str, dict]] = None,
        enforce_numeric_metric: Optional[Union[str, bool]] = False,
        replace_nan_with_zero: Optional[Union[str, bool]] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> Dict[str, Union[Union[np.ndarray, List[Union[Any, Number]]], Dict[
            str, Any]]]:
        domain_kwargs = build_metric_domain_kwargs(
            batch_id=None,
            metric_domain_kwargs=metric_domain_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        metric_domain_kwargs: dict = copy.deepcopy(domain_kwargs)

        # Obtain value kwargs from rule state (i.e., variables and parameters); from instance variable otherwise.
        metric_value_kwargs = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=metric_value_kwargs,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # Obtain enforce_numeric_metric from rule state (i.e., variables and parameters); from instance variable otherwise.
        enforce_numeric_metric = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=enforce_numeric_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain replace_nan_with_zero from rule state (i.e., variables and parameters); from instance variable otherwise.
        replace_nan_with_zero = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=replace_nan_with_zero,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        metric_values: List[Union[Any, Number]] = []

        metric_value: Union[Any, Number]
        batch_id: str
        for batch_id in batch_ids:
            metric_domain_kwargs["batch_id"] = batch_id
            metric_configuration_arguments: Dict[str, Any] = {
                "metric_name": metric_name,
                "metric_domain_kwargs": metric_domain_kwargs,
                "metric_value_kwargs": metric_value_kwargs,
                "metric_dependencies": None,
            }
            metric_value = validator.get_metric(metric=MetricConfiguration(
                **metric_configuration_arguments))
            if enforce_numeric_metric:
                if not is_numeric(value=metric_value):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=
                        f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(type(metric_value))}" was computed).
""")
                if np.isnan(metric_value):
                    if not replace_nan_with_zero:
                        raise ValueError(
                            f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
""")
                    metric_value = 0.0

            metric_values.append(metric_value)

        return {
            "metric_values": metric_values,
            "details": {
                "metric_configuration": {
                    "metric_name": metric_name,
                    "domain_kwargs": domain_kwargs,
                    "metric_value_kwargs": metric_value_kwargs,
                    "metric_dependencies": None,
                },
                "num_batches": len(metric_values),
            },
        }