Example #1
0
    def generate_dataset_profile(  # noqa: C901 (complexity)
            self, ) -> DatasetProfileClass:
        self.dataset.set_default_expectation_argument(
            "catch_exceptions", self.config.catch_exceptions)

        profile = DatasetProfileClass(timestampMillis=get_sys_time())
        if self.partition:
            profile.partitionSpec = PartitionSpecClass(
                partition=self.partition)
        profile.fieldProfiles = []
        self._get_dataset_rows(profile)

        all_columns = self.dataset.get_table_columns()
        profile.columnCount = len(all_columns)
        columns_to_profile = set(self._get_columns_to_profile())

        logger.debug(
            f"profiling {self.dataset_name}: flushing stage 1 queries")
        self.query_combiner.flush()

        columns_profiling_queue: List[_SingleColumnSpec] = []
        for column in all_columns:
            column_profile = DatasetFieldProfileClass(fieldPath=column)
            profile.fieldProfiles.append(column_profile)

            if column in columns_to_profile:
                column_spec = _SingleColumnSpec(column, column_profile)
                columns_profiling_queue.append(column_spec)

                self._get_column_type(column_spec, column)
                self._get_column_cardinality(column_spec, column)

        logger.debug(
            f"profiling {self.dataset_name}: flushing stage 2 queries")
        self.query_combiner.flush()

        assert profile.rowCount is not None
        row_count: int = profile.rowCount

        telemetry.telemetry_instance.ping(
            "profile_sql_table",
            # bucket by taking floor of log of the number of rows scanned
            {
                "rows_profiled": 10**int(log10(row_count + 1)),
            },
        )

        for column_spec in columns_profiling_queue:
            column = column_spec.column
            column_profile = column_spec.column_profile
            type_ = column_spec.type_
            cardinality = column_spec.cardinality

            non_null_count = column_spec.nonnull_count
            unique_count = column_spec.unique_count

            if self.config.include_field_null_count and non_null_count is not None:
                null_count = row_count - non_null_count
                if null_count < 0:
                    null_count = 0

                column_profile.nullCount = null_count
                if row_count > 0:
                    column_profile.nullProportion = null_count / row_count
                    # Sometimes this value is bigger than 1 because of the approx queries
                    if column_profile.nullProportion > 1:
                        column_profile.nullProportion = 1

            if unique_count is not None:
                column_profile.uniqueCount = unique_count
                if non_null_count is not None and non_null_count > 0:
                    column_profile.uniqueProportion = unique_count / non_null_count
                    # Sometimes this value is bigger than 1 because of the approx queries
                    if column_profile.uniqueProportion > 1:
                        column_profile.uniqueProportion = 1

            self._get_dataset_column_sample_values(column_profile, column)

            if (type_ == ProfilerDataType.INT
                    or type_ == ProfilerDataType.FLOAT
                    or type_ == ProfilerDataType.NUMERIC):
                if cardinality == Cardinality.UNIQUE:
                    pass
                elif cardinality in [
                        Cardinality.ONE,
                        Cardinality.TWO,
                        Cardinality.VERY_FEW,
                        Cardinality.FEW,
                        Cardinality.MANY,
                        Cardinality.VERY_MANY,
                        Cardinality.UNIQUE,
                ]:
                    self._get_dataset_column_min(column_profile, column)
                    self._get_dataset_column_max(column_profile, column)
                    self._get_dataset_column_mean(column_profile, column)
                    self._get_dataset_column_median(column_profile, column)

                    if type_ == ProfilerDataType.INT:
                        self._get_dataset_column_stdev(column_profile, column)

                    self._get_dataset_column_quantiles(column_profile, column)
                    self._get_dataset_column_histogram(column_profile, column)
                    if cardinality in [
                            Cardinality.ONE,
                            Cardinality.TWO,
                            Cardinality.VERY_FEW,
                            Cardinality.FEW,
                    ]:
                        self._get_dataset_column_distinct_value_frequencies(
                            column_profile,
                            column,
                        )
                else:  # unknown cardinality - skip
                    pass

            elif type_ == ProfilerDataType.STRING:
                if cardinality in [
                        Cardinality.ONE,
                        Cardinality.TWO,
                        Cardinality.VERY_FEW,
                        Cardinality.FEW,
                ]:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )

            elif type_ == ProfilerDataType.DATETIME:
                self._get_dataset_column_min(column_profile, column)
                self._get_dataset_column_max(column_profile, column)

                # FIXME: Re-add histogram once kl_divergence has been modified to support datetimes

                if cardinality in [
                        Cardinality.ONE,
                        Cardinality.TWO,
                        Cardinality.VERY_FEW,
                        Cardinality.FEW,
                ]:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )

            else:
                if cardinality in [
                        Cardinality.ONE,
                        Cardinality.TWO,
                        Cardinality.VERY_FEW,
                        Cardinality.FEW,
                ]:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )

        logger.debug(
            f"profiling {self.dataset_name}: flushing stage 3 queries")
        self.query_combiner.flush()
        return profile
Example #2
0
    def __init__(
        self,
        dataframe: DataFrame,
        spark: SparkSession,
        profiling_config: DataLakeProfilerConfig,
        report: DataLakeSourceReport,
        file_path: str,
    ):
        self.spark = spark
        self.dataframe = dataframe
        self.analyzer = AnalysisRunner(spark).onData(dataframe)
        self.column_specs = []
        self.row_count = dataframe.count()
        self.profiling_config = profiling_config
        self.file_path = file_path
        self.columns_to_profile = []
        self.ignored_columns = []
        self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
        self.report = report

        self.profile.rowCount = self.row_count
        self.profile.columnCount = len(dataframe.columns)

        column_types = {x.name: x.dataType for x in dataframe.schema.fields}

        if self.profiling_config.profile_table_level_only:

            return

        # get column distinct counts
        for column in dataframe.columns:

            if not self.profiling_config.allow_deny_patterns.allowed(column):
                self.ignored_columns.append(column)
                continue

            self.columns_to_profile.append(column)
            # Normal CountDistinct is ridiculously slow
            self.analyzer.addAnalyzer(ApproxCountDistinct(column))

        if self.profiling_config.max_number_of_fields_to_profile is not None:
            if (len(self.columns_to_profile) >
                    self.profiling_config.max_number_of_fields_to_profile):
                columns_being_dropped = self.columns_to_profile[
                    self.profiling_config.max_number_of_fields_to_profile:]
                self.columns_to_profile = self.columns_to_profile[:self.
                                                                  profiling_config
                                                                  .
                                                                  max_number_of_fields_to_profile]

                self.report.report_file_dropped(
                    f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
                )

        analysis_result = self.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsJson(
            self.spark, analysis_result)

        # reshape distinct counts into dictionary
        column_distinct_counts = {
            x["instance"]: int(x["value"])
            for x in analysis_metrics if x["name"] == "ApproxCountDistinct"
        }

        select_numeric_null_counts = [
            count(when(
                isnan(c) | col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if column_types[column] in [DoubleType, FloatType]
        ]

        # PySpark doesn't support isnan() on non-float/double columns
        select_nonnumeric_null_counts = [
            count(when(
                col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if column_types[column] not in [DoubleType, FloatType]
        ]

        null_counts = dataframe.select(select_numeric_null_counts +
                                       select_nonnumeric_null_counts)
        column_null_counts = null_counts.toPandas().T[0].to_dict()
        column_null_fractions = {
            c: column_null_counts[c] / self.row_count
            for c in self.columns_to_profile
        }
        column_nonnull_counts = {
            c: self.row_count - column_null_counts[c]
            for c in self.columns_to_profile
        }

        column_unique_proportions = {
            c:
            (column_distinct_counts[c] /
             column_nonnull_counts[c] if column_nonnull_counts[c] > 0 else 0)
            for c in self.columns_to_profile
        }

        if self.profiling_config.include_field_sample_values:
            # take sample and convert to Pandas DataFrame
            if self.row_count < NUM_SAMPLE_ROWS:
                # if row count is less than number to sample, just take all rows
                rdd_sample = dataframe.rdd.take(self.row_count)
            else:
                rdd_sample = dataframe.rdd.takeSample(False,
                                                      NUM_SAMPLE_ROWS,
                                                      seed=0)

        # init column specs with profiles
        for column in self.columns_to_profile:
            column_profile = DatasetFieldProfileClass(fieldPath=column)

            column_spec = _SingleColumnSpec(column, column_profile)

            column_profile.uniqueCount = column_distinct_counts.get(column)
            column_profile.uniqueProportion = column_unique_proportions.get(
                column)
            column_profile.nullCount = column_null_counts.get(column)
            column_profile.nullProportion = column_null_fractions.get(column)
            if self.profiling_config.include_field_sample_values:
                column_profile.sampleValues = [
                    str(x[column]) for x in rdd_sample
                ]

            column_spec.type_ = column_types[column]
            column_spec.cardinality = _convert_to_cardinality(
                column_distinct_counts[column],
                column_null_fractions[column],
            )

            self.column_specs.append(column_spec)
    def _handle_convert_column_evrs(  # noqa: C901 (complexity)
        self,
        profile: DatasetProfileClass,
        column: str,
        col_evrs: Iterable[ExpectationValidationResult],
        pretty_name: str,
        send_sample_values: bool,
    ) -> None:
        # TRICKY: This method mutates the profile directly.

        column_profile = DatasetFieldProfileClass(fieldPath=column)

        profile.fieldProfiles = profile.fieldProfiles or []
        profile.fieldProfiles.append(column_profile)

        for evr in col_evrs:
            exp: str = evr.expectation_config.expectation_type
            res: dict = evr.result
            if not res:
                self.report.report_warning(f"profile of {pretty_name}",
                                           f"{exp} did not yield any results")
                continue

            if exp == "expect_column_unique_value_count_to_be_between":
                column_profile.uniqueCount = res["observed_value"]
            elif exp == "expect_column_proportion_of_unique_values_to_be_between":
                column_profile.uniqueProportion = res["observed_value"]
            elif exp == "expect_column_values_to_not_be_null":
                column_profile.nullCount = res["unexpected_count"]
                if ("unexpected_percent" in res
                        and res["unexpected_percent"] is not None):
                    column_profile.nullProportion = res[
                        "unexpected_percent"] / 100
            elif exp == "expect_column_values_to_not_match_regex":
                # ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
                pass
            elif exp == "expect_column_mean_to_be_between":
                column_profile.mean = str(res["observed_value"])
            elif exp == "expect_column_min_to_be_between":
                column_profile.min = str(res["observed_value"])
            elif exp == "expect_column_max_to_be_between":
                column_profile.max = str(res["observed_value"])
            elif exp == "expect_column_median_to_be_between":
                column_profile.median = str(res["observed_value"])
            elif exp == "expect_column_stdev_to_be_between":
                column_profile.stdev = str(res["observed_value"])
            elif exp == "expect_column_quantile_values_to_be_between":
                if "observed_value" in res:
                    column_profile.quantiles = [
                        QuantileClass(quantile=str(quantile), value=str(value))
                        for quantile, value in zip(
                            res["observed_value"]["quantiles"],
                            res["observed_value"]["values"],
                        )
                    ]
            elif exp == "expect_column_values_to_be_in_set":
                column_profile.sampleValues = [
                    str(v) for v in res["partial_unexpected_list"]
                ]
                if not send_sample_values:
                    column_profile.sampleValues = []
            elif exp == "expect_column_kl_divergence_to_be_less_than":
                if "details" in res and "observed_partition" in res["details"]:
                    partition = res["details"]["observed_partition"]
                    column_profile.histogram = HistogramClass(
                        [str(v) for v in partition["bins"]],
                        [
                            partition["tail_weights"][0],
                            *partition["weights"],
                            partition["tail_weights"][1],
                        ],
                    )
            elif exp == "expect_column_distinct_values_to_be_in_set":
                if "details" in res and "value_counts" in res["details"]:
                    # This can be used to produce a bar chart since it includes values and frequencies.
                    # As such, it is handled differently from expect_column_values_to_be_in_set, which
                    # is nonexhaustive.
                    column_profile.distinctValueFrequencies = [
                        ValueFrequencyClass(value=str(value), frequency=count)
                        for value, count in res["details"]
                        ["value_counts"].items()
                    ]
                    if not send_sample_values:
                        column_profile.distinctValueFrequencies = []
            elif exp == "expect_column_values_to_be_in_type_list":
                # ignore; we already know the types for each column via ingestion
                pass
            elif exp == "expect_column_values_to_be_unique":
                # ignore; this is generally covered by the unique value count test
                pass
            else:
                self.report.report_warning(
                    f"profile of {pretty_name}",
                    f"warning: unknown column mapper {exp} in col {column}",
                )