# --- Example 1 ---
    def _get_dataset_column_histogram(self,
                                      column_profile: DatasetFieldProfileClass,
                                      column: str) -> None:
        """Attach a histogram to *column_profile* for *column*.

        Runs Great Expectations' KL-divergence expectation purely to obtain
        the observed partition (bins + weights); no divergence threshold is
        actually enforced. No-op unless ``include_field_histogram`` is set.
        """
        if not self.config.include_field_histogram:
            return

        # interactive evaluation must be on for the expectation to execute now
        self.dataset.set_config_value("interactive_evaluation", True)

        res = self.dataset.expect_column_kl_divergence_to_be_less_than(
            column,
            partition_object=None,
            threshold=None,
            result_format="COMPLETE",
        ).result

        details = res.get("details", {})
        if "observed_partition" not in details:
            return

        partition = details["observed_partition"]
        tails = partition["tail_weights"]
        # bin boundaries as strings; weights bracketed by the two tail weights
        column_profile.histogram = HistogramClass(
            [str(v) for v in partition["bins"]],
            [tails[0], *partition["weights"], tails[1]],
        )
# --- Example 2 ---
    def extract_table_profiles(
        self,
        analysis_metrics: DataFrame,
    ) -> None:
        """Populate ``self.profile.fieldProfiles`` from a deequ analysis result.

        Args:
            analysis_metrics: Spark DataFrame with the following columns:
                entity: "Column" for column profiles, "Table" for table profiles
                instance: name of column being profiled; "*" for table profiles
                name: metric name; histogram metrics are formatted as
                    "Histogram.<metric>.<value>"
                value: value of the metric
        """
        self.profile.fieldProfiles = []

        analysis_metrics = analysis_metrics.toPandas()

        # Take a copy: a boolean-mask selection may be a view, and assigning
        # new columns to a view is unreliable (SettingWithCopyWarning; under
        # pandas copy-on-write the assignment would silently not persist).
        column_metrics = analysis_metrics[analysis_metrics["entity"] ==
                                          "Column"].copy()

        # resolve histogram types for grouping
        column_metrics["kind"] = column_metrics["name"].apply(
            lambda x: "Histogram" if x.startswith("Histogram.") else x)

        column_histogram_metrics = column_metrics[column_metrics["kind"] ==
                                                  "Histogram"]
        column_nonhistogram_metrics = column_metrics[
            column_metrics["kind"] != "Histogram"]

        histogram_columns = set()

        if len(column_histogram_metrics) > 0:
            hist_prefix = "Histogram.abs."

            # we only want the absolute counts for each histogram for now;
            # copy again before adding the "bin" column (same view caveat)
            column_histogram_metrics = column_histogram_metrics[
                column_histogram_metrics["name"].apply(
                    lambda x: x.startswith(hist_prefix))].copy()
            # get the histogram bins by chopping off the "Histogram.abs." prefix
            column_histogram_metrics["bin"] = column_histogram_metrics[
                "name"].apply(lambda x: x[len(hist_prefix):])

            # reshape histogram counts for easier access:
            # Series indexed by (column name, bin label)
            histogram_counts = column_histogram_metrics.set_index(
                ["instance", "bin"])["value"]

            histogram_columns = set(histogram_counts.index.get_level_values(0))

        profiled_columns = set()

        if len(column_nonhistogram_metrics) > 0:
            # reshape other metrics for easier access:
            # Series indexed by (column name, metric name)
            nonhistogram_metrics = column_nonhistogram_metrics.set_index(
                ["instance", "name"])["value"]

            profiled_columns = set(
                nonhistogram_metrics.index.get_level_values(0))

        for column_spec in self.column_specs:
            column = column_spec.column
            column_profile = column_spec.column_profile

            # NOTE: nonhistogram_metrics is guaranteed to be bound here, since
            # profiled_columns is non-empty only when it was assigned above.
            if column not in profiled_columns:
                continue

            # convert to Dict so we can use .get
            deequ_column_profile = nonhistogram_metrics.loc[column].to_dict()

            # uniqueCount, uniqueProportion, nullCount, nullProportion,
            # sampleValues already set in TableWrapper
            column_profile.min = null_str(deequ_column_profile.get("Minimum"))
            column_profile.max = null_str(deequ_column_profile.get("Maximum"))
            column_profile.mean = null_str(deequ_column_profile.get("Mean"))
            column_profile.median = null_str(
                deequ_column_profile.get("ApproxQuantiles-0.5"))
            column_profile.stdev = null_str(
                deequ_column_profile.get("StandardDeviation"))

            # only emit quantiles if every configured quantile was computed
            if all(
                    deequ_column_profile.get(f"ApproxQuantiles-{quantile}")
                    is not None for quantile in QUANTILES):
                column_profile.quantiles = [
                    QuantileClass(
                        quantile=str(quantile),
                        value=str(
                            deequ_column_profile[f"ApproxQuantiles-{quantile}"]
                        ),
                    ) for quantile in QUANTILES
                ]

            if column in histogram_columns:
                # sort by bin so output is deterministic
                column_histogram = histogram_counts.loc[column].sort_index()

                if column_spec.histogram_distinct:
                    # iterate (bin, count) pairs directly instead of a .loc
                    # lookup per value: cheaper, and robust to duplicate bins
                    column_profile.distinctValueFrequencies = sorted(
                        (ValueFrequencyClass(value=value,
                                             frequency=int(count))
                         for value, count in column_histogram.items()),
                        key=lambda x: x.value)
                else:
                    column_profile.histogram = HistogramClass(
                        [str(x) for x in column_histogram.index],
                        [float(x) for x in column_histogram],
                    )

            # append the column profile to the dataset profile
            self.profile.fieldProfiles.append(column_profile)
# --- Example 3 ---
    def _handle_convert_column_evrs(  # noqa: C901 (complexity)
        self,
        profile: DatasetProfileClass,
        column: str,
        col_evrs: Iterable[ExpectationValidationResult],
        pretty_name: str,
        send_sample_values: bool,
    ) -> None:
        """Translate GE validation results for one column into a field profile.

        TRICKY: This method mutates *profile* directly, appending a new
        ``DatasetFieldProfileClass`` for *column* and filling it in from the
        per-expectation results.
        """
        column_profile = DatasetFieldProfileClass(fieldPath=column)

        profile.fieldProfiles = profile.fieldProfiles or []
        profile.fieldProfiles.append(column_profile)

        # expectations whose observed_value maps 1:1 onto a stringified field
        simple_stat_fields = {
            "expect_column_mean_to_be_between": "mean",
            "expect_column_min_to_be_between": "min",
            "expect_column_max_to_be_between": "max",
            "expect_column_median_to_be_between": "median",
            "expect_column_stdev_to_be_between": "stdev",
        }
        # expectations we deliberately skip:
        ignored_expectations = {
            # generally used for whitespace checks via regex r"^\s+|\s+$"
            "expect_column_values_to_not_match_regex",
            # we already know the types for each column via ingestion
            "expect_column_values_to_be_in_type_list",
            # generally covered by the unique value count test
            "expect_column_values_to_be_unique",
        }

        for evr in col_evrs:
            exp: str = evr.expectation_config.expectation_type
            res: dict = evr.result
            if not res:
                self.report.report_warning(f"profile of {pretty_name}",
                                           f"{exp} did not yield any results")
                continue

            if exp in ignored_expectations:
                continue

            if exp in simple_stat_fields:
                setattr(column_profile, simple_stat_fields[exp],
                        str(res["observed_value"]))
            elif exp == "expect_column_unique_value_count_to_be_between":
                column_profile.uniqueCount = res["observed_value"]
            elif exp == "expect_column_proportion_of_unique_values_to_be_between":
                column_profile.uniqueProportion = res["observed_value"]
            elif exp == "expect_column_values_to_not_be_null":
                column_profile.nullCount = res["unexpected_count"]
                pct = res.get("unexpected_percent")
                if pct is not None:
                    column_profile.nullProportion = pct / 100
            elif exp == "expect_column_quantile_values_to_be_between":
                if "observed_value" in res:
                    observed = res["observed_value"]
                    column_profile.quantiles = [
                        QuantileClass(quantile=str(q), value=str(v))
                        for q, v in zip(observed["quantiles"],
                                        observed["values"])
                    ]
            elif exp == "expect_column_values_to_be_in_set":
                # build unconditionally (preserves the key access), but only
                # keep the samples when the caller allowed sending them
                samples = [str(v) for v in res["partial_unexpected_list"]]
                column_profile.sampleValues = (samples
                                               if send_sample_values else [])
            elif exp == "expect_column_kl_divergence_to_be_less_than":
                if "details" in res and "observed_partition" in res["details"]:
                    partition = res["details"]["observed_partition"]
                    tails = partition["tail_weights"]
                    column_profile.histogram = HistogramClass(
                        [str(v) for v in partition["bins"]],
                        [tails[0], *partition["weights"], tails[1]],
                    )
            elif exp == "expect_column_distinct_values_to_be_in_set":
                if "details" in res and "value_counts" in res["details"]:
                    # This can be used to produce a bar chart since it includes
                    # values and frequencies. As such, it is handled differently
                    # from expect_column_values_to_be_in_set, which is
                    # nonexhaustive.
                    frequencies = [
                        ValueFrequencyClass(value=str(value), frequency=count)
                        for value, count in
                        res["details"]["value_counts"].items()
                    ]
                    column_profile.distinctValueFrequencies = (
                        frequencies if send_sample_values else [])
            else:
                self.report.report_warning(
                    f"profile of {pretty_name}",
                    f"warning: unknown column mapper {exp} in col {column}",
                )