Ejemplo n.º 1
0
    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        bucketize_data = configuration.kwargs.get(
            "bucketize_data", self.default_kwarg_values["bucketize_data"]
        )
        partition_object = configuration.kwargs.get(
            "partition_object", self.default_kwarg_values["partition_object"]
        )
        threshold = configuration.kwargs.get(
            "threshold", self.default_kwarg_values["threshold"]
        )
        tail_weight_holdout = configuration.kwargs.get(
            "tail_weight_holdout", self.default_kwarg_values["tail_weight_holdout"]
        )
        internal_weight_holdout = configuration.kwargs.get(
            "internal_weight_holdout",
            self.default_kwarg_values["internal_weight_holdout"],
        )
        if partition_object is None:
            if bucketize_data:
                # in this case, we have requested a partition, histogram using said partition, and nonnull count
                bins = list(metrics["column.partition"])
                weights = list(
                    np.array(metrics["column.histogram"])
                    / metrics["column_values.nonnull.count"]
                )
                tail_weights = (1 - sum(weights)) / 2
                partition_object = {
                    "bins": bins,
                    "weights": weights,
                    "tail_weights": [tail_weights, tail_weights],
                }
            else:
                partition_object = {
                    "values": list(metrics["column.value_counts"].index),
                    "weights": list(
                        np.array(metrics["column.value_counts"])
                        / metrics["column_values.nonnull.count"]
                    ),
                }

        if not is_valid_partition_object(partition_object):
            raise ValueError("Invalid partition object.")

        if threshold is not None and (
            (not isinstance(threshold, (int, float))) or (threshold < 0)
        ):
            raise ValueError(
                "Threshold must be specified, greater than or equal to zero."
            )

        if (
            (not isinstance(tail_weight_holdout, (int, float)))
            or (tail_weight_holdout < 0)
            or (tail_weight_holdout > 1)
        ):
            raise ValueError("tail_weight_holdout must be between zero and one.")

        if (
            (not isinstance(internal_weight_holdout, (int, float)))
            or (internal_weight_holdout < 0)
            or (internal_weight_holdout > 1)
        ):
            raise ValueError("internal_weight_holdout must be between zero and one.")

        if tail_weight_holdout != 0 and "tail_weights" in partition_object:
            raise ValueError(
                "tail_weight_holdout must be 0 when using tail_weights in partition object"
            )

        # TODO: add checks for duplicate values in is_valid_categorical_partition_object
        if is_valid_categorical_partition_object(partition_object):
            if internal_weight_holdout > 0:
                raise ValueError(
                    "Internal weight holdout cannot be used for discrete data."
                )

            # Data are expected to be discrete, use value_counts
            observed_weights = (
                metrics["column.value_counts"] / metrics["column_values.nonnull.count"]
            )
            expected_weights = pd.Series(
                partition_object["weights"],
                index=partition_object["values"],
                name="expected",
            )
            # Sort not available before pandas 0.23.0
            # test_df = pd.concat([expected_weights, observed_weights], axis=1, sort=True)
            test_df = pd.concat([expected_weights, observed_weights], axis=1)

            na_counts = test_df.isnull().sum()

            # Handle NaN: if we expected something that's not there, it's just not there.
            pk = test_df["count"].fillna(0)
            # Handle NaN: if something's there that was not expected,
            # substitute the relevant value for tail_weight_holdout
            if na_counts["expected"] > 0:
                # Scale existing expected values
                test_df["expected"] *= 1 - tail_weight_holdout
                # Fill NAs with holdout.
                qk = test_df["expected"].fillna(
                    tail_weight_holdout / na_counts["expected"]
                )
            else:
                qk = test_df["expected"]

            kl_divergence = stats.entropy(pk, qk)

            if np.isinf(kl_divergence) or np.isnan(kl_divergence):
                observed_value = None
            else:
                observed_value = kl_divergence

            if threshold is None:
                success = True
            else:
                success = kl_divergence <= threshold

            return_obj = {
                "success": success,
                "result": {
                    "observed_value": observed_value,
                    "details": {
                        "observed_partition": {
                            "values": test_df.index.tolist(),
                            "weights": pk.tolist(),
                        },
                        "expected_partition": {
                            "values": test_df.index.tolist(),
                            "weights": qk.tolist(),
                        },
                    },
                },
            }

        else:
            # Data are expected to be continuous; discretize first
            if bucketize_data is False:
                raise ValueError(
                    "KL Divergence cannot be computed with a continuous partition object and the bucketize_data "
                    "parameter set to false."
                )
            # Build the histogram first using expected bins so that the largest bin is >=
            nonnull_count = metrics["column_values.nonnull.count"]
            hist = np.array(metrics["column.histogram"])

            # Add in the frequencies observed above or below the provided partition
            below_partition = metrics["below_partition"]
            above_partition = metrics["above_partition"]

            # Observed Weights is just the histogram values divided by the total number of observations
            observed_weights = hist / nonnull_count

            # Adjust expected_weights to account for tail_weight and internal_weight
            if "tail_weights" in partition_object:
                partition_tail_weight_holdout = np.sum(partition_object["tail_weights"])
            else:
                partition_tail_weight_holdout = 0

            expected_weights = np.array(partition_object["weights"]) * (
                1 - tail_weight_holdout - internal_weight_holdout
            )

            # Assign internal weight holdout values if applicable
            if internal_weight_holdout > 0:
                zero_count = len(expected_weights) - np.count_nonzero(expected_weights)
                if zero_count > 0:
                    for index, value in enumerate(expected_weights):
                        if value == 0:
                            expected_weights[index] = (
                                internal_weight_holdout / zero_count
                            )

            # Assign tail weight holdout if applicable
            # We need to check cases to only add tail weight holdout if it makes sense based on the provided partition.
            if (partition_object["bins"][0] == -np.inf) and (
                partition_object["bins"][-1]
            ) == np.inf:
                if tail_weight_holdout > 0:
                    raise ValueError(
                        "tail_weight_holdout cannot be used for partitions with infinite endpoints."
                    )
                if "tail_weights" in partition_object:
                    raise ValueError(
                        "There can be no tail weights for partitions with one or both endpoints at infinity"
                    )

                # Remove -inf and inf
                expected_bins = partition_object["bins"][1:-1]

                comb_expected_weights = expected_weights
                # Set aside tail weights
                expected_tail_weights = np.concatenate(
                    ([expected_weights[0]], [expected_weights[-1]])
                )
                # Remove tail weights
                expected_weights = expected_weights[1:-1]

                comb_observed_weights = observed_weights
                # Set aside tail weights
                observed_tail_weights = np.concatenate(
                    ([observed_weights[0]], [observed_weights[-1]])
                )
                # Remove tail weights
                observed_weights = observed_weights[1:-1]

            elif partition_object["bins"][0] == -np.inf:

                if "tail_weights" in partition_object:
                    raise ValueError(
                        "There can be no tail weights for partitions with one or both endpoints at infinity"
                    )

                # Remove -inf
                expected_bins = partition_object["bins"][1:]

                comb_expected_weights = np.concatenate(
                    (expected_weights, [tail_weight_holdout])
                )
                # Set aside left tail weight and holdout
                expected_tail_weights = np.concatenate(
                    ([expected_weights[0]], [tail_weight_holdout])
                )
                # Remove left tail weight from main expected_weights
                expected_weights = expected_weights[1:]

                comb_observed_weights = np.concatenate(
                    (observed_weights, [above_partition / nonnull_count],)
                )
                # Set aside left tail weight and above partition weight
                observed_tail_weights = np.concatenate(
                    ([observed_weights[0]], [above_partition / nonnull_count],)
                )
                # Remove left tail weight from main observed_weights
                observed_weights = observed_weights[1:]

            elif partition_object["bins"][-1] == np.inf:

                if "tail_weights" in partition_object:
                    raise ValueError(
                        "There can be no tail weights for partitions with one or both endpoints at infinity"
                    )

                # Remove inf
                expected_bins = partition_object["bins"][:-1]

                comb_expected_weights = np.concatenate(
                    ([tail_weight_holdout], expected_weights)
                )
                # Set aside right tail weight and holdout
                expected_tail_weights = np.concatenate(
                    ([tail_weight_holdout], [expected_weights[-1]])
                )
                # Remove right tail weight from main expected_weights
                expected_weights = expected_weights[:-1]

                comb_observed_weights = np.concatenate(
                    ([below_partition / nonnull_count], observed_weights,)
                )
                # Set aside right tail weight and below partition weight
                observed_tail_weights = np.concatenate(
                    ([below_partition / nonnull_count], [observed_weights[-1]],)
                )
                # Remove right tail weight from main observed_weights
                observed_weights = observed_weights[:-1]
            else:
                # No need to remove -inf or inf
                expected_bins = partition_object["bins"]

                if "tail_weights" in partition_object:
                    tail_weights = partition_object["tail_weights"]
                    # Tack on tail weights
                    comb_expected_weights = np.concatenate(
                        ([tail_weights[0]], expected_weights, [tail_weights[1]])
                    )
                    # Tail weights are just tail_weights
                    expected_tail_weights = np.array(tail_weights)
                else:
                    comb_expected_weights = np.concatenate(
                        (
                            [tail_weight_holdout / 2],
                            expected_weights,
                            [tail_weight_holdout / 2],
                        )
                    )
                    # Tail weights are just tail_weight holdout divided equally to both tails
                    expected_tail_weights = np.concatenate(
                        ([tail_weight_holdout / 2], [tail_weight_holdout / 2])
                    )

                comb_observed_weights = np.concatenate(
                    (
                        [below_partition / nonnull_count],
                        observed_weights,
                        [above_partition / nonnull_count],
                    )
                )
                # Tail weights are just the counts on either side of the partition
                observed_tail_weights = (
                    np.concatenate(([below_partition], [above_partition]))
                    / nonnull_count
                )

                # Main expected_weights and main observed weights had no tail_weights, so nothing needs to be removed.

            # TODO: VERIFY THAT THIS STILL WORKS BASED ON CHANGE TO HIST
            # comb_expected_weights = np.array(comb_expected_weights).astype(float)
            # comb_observed_weights = np.array(comb_observed_weights).astype(float)

            kl_divergence = stats.entropy(comb_observed_weights, comb_expected_weights)

            if np.isinf(kl_divergence) or np.isnan(kl_divergence):
                observed_value = None
            else:
                observed_value = kl_divergence

            if threshold is None:
                success = True
            else:
                success = kl_divergence <= threshold

            return_obj = {
                "success": success,
                "result": {
                    "observed_value": observed_value,
                    "details": {
                        "observed_partition": {
                            # return expected_bins, since we used those bins to compute the observed_weights
                            "bins": expected_bins,
                            "weights": observed_weights.tolist(),
                            "tail_weights": observed_tail_weights.tolist(),
                        },
                        "expected_partition": {
                            "bins": expected_bins,
                            "weights": expected_weights.tolist(),
                            "tail_weights": expected_tail_weights.tolist(),
                        },
                    },
                },
            }

        return return_obj
Ejemplo n.º 2
0
 def get_validation_dependencies(
     self,
     configuration: Optional[ExpectationConfiguration] = None,
     execution_engine: Optional[ExecutionEngine] = None,
     runtime_configuration: Optional[dict] = None,
 ):
     all_dependencies = super().get_validation_dependencies(
         configuration, execution_engine, runtime_configuration
     )
     dependencies = all_dependencies["metrics"]
     partition_object = configuration.kwargs["partition_object"]
     domain_kwargs = configuration.get_domain_kwargs()
     is_categorical = None
     bins = None
     if partition_object is None:
         if configuration.kwargs.get(
             "bucketize_data", self.default_kwarg_values["bucketize_data"]
         ):
             is_categorical = False
             partition_metric_configuration = MetricConfiguration(
                 "column.partition",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={
                     "bins": "auto",
                     "allow_relative_error": False,
                 },
             )
             #
             # Note: 20201116 - JPC - the execution engine doesn't provide capability to evaluate
             # dependencies, so we use a validator
             #
             validator = Validator(execution_engine=execution_engine)
             graph = ValidationGraph()
             validator.build_metric_dependency_graph(
                 graph=graph,
                 child_node=partition_metric_configuration,
                 configuration=configuration,
                 execution_engine=execution_engine,
             )
             bins = validator.resolve_validation_graph(graph, metrics=dict())[
                 partition_metric_configuration.id
             ]
             hist_metric_configuration = MetricConfiguration(
                 "column.histogram",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={"bins": tuple(bins),},
             )
             nonnull_configuration = MetricConfiguration(
                 "column_values.nonnull.count",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs=dict(),
             )
             #
             # NOTE 20201117 - JPC - Would prefer not to include partition_metric_configuraiton here,
             # since we have already evaluated it, and its result is in the kwargs for the histogram.
             # However, currently the dependencies' configurations are not passed to the _validate method
             #
             dependencies["column.partition"] = partition_metric_configuration
             dependencies["column.histogram"] = hist_metric_configuration
             dependencies["column_values.nonnull.count"] = nonnull_configuration
         else:
             is_categorical = True
             counts_configuration = MetricConfiguration(
                 "column.value_counts",
                 metric_domain_kwargs=domain_kwargs,
                 metric_value_kwargs={"sort": "value",},
             )
             nonnull_configuration = MetricConfiguration(
                 "column_values.nonnull.count", metric_domain_kwargs=domain_kwargs,
             )
             dependencies["column.value_counts"] = counts_configuration
             dependencies["column_values.nonnull.count"] = nonnull_configuration
     if is_categorical is True or is_valid_categorical_partition_object(
         partition_object
     ):
         dependencies["column.value_counts"] = MetricConfiguration(
             "column.value_counts",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"sort": "value"},
         )
         dependencies["column_values.nonnull.count"] = MetricConfiguration(
             "column_values.nonnull.count", domain_kwargs
         )
     else:
         if (
             bins is None
         ):  # if the user did not supply a partition_object, so we just computed it
             if not is_valid_partition_object(partition_object):
                 raise ValueError("Invalid partition_object provided")
             bins = partition_object["bins"]
         hist_metric_configuration = MetricConfiguration(
             "column.histogram",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"bins": bins,},
         )
         nonnull_configuration = MetricConfiguration(
             "column_values.nonnull.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs=dict(),
         )
         dependencies["column.histogram"] = hist_metric_configuration
         dependencies["column_values.nonnull.count"] = nonnull_configuration
         below_partition = MetricConfiguration(
             "column_values.between.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"max_value": bins[0]},
         )
         above_partition = MetricConfiguration(
             "column_values.between.count",
             metric_domain_kwargs=domain_kwargs,
             metric_value_kwargs={"min_value": bins[-1], "strict_min": True},
         )
         dependencies["below_partition"] = below_partition
         dependencies["above_partition"] = above_partition
     return all_dependencies