def Compliance(self, instance, predicate, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Compliance(instance, predicate, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
 def ApproxQuantiles(self, column, quantiles):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(ApproxQuantiles(column, quantiles)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
 def UniqueValueRatio(self, columns, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(UniqueValueRatio(columns, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
 def test_PatternMatch(self):
     result = (
         self.AnalysisRunner.onData(self.df).addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)")).run()
     )
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     self.assertEqual(result_df.select("value").collect(), [Row(value=0.0)])
 def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
     result = (
         self.AnalysisRunner.onData(self.df).addAnalyzer(Histogram(column, binningUdf, maxDetailBins, where)).run()
     )
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
Beispiel #6
0
    def __init__(
        self,
        dataframe: DataFrame,
        spark: SparkSession,
        profiling_config: DataLakeProfilerConfig,
        report: DataLakeSourceReport,
        file_path: str,
    ):
        self.spark = spark
        self.dataframe = dataframe
        self.analyzer = AnalysisRunner(spark).onData(dataframe)
        self.column_specs = []
        self.row_count = dataframe.count()
        self.profiling_config = profiling_config
        self.file_path = file_path
        self.columns_to_profile = []
        self.ignored_columns = []
        self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
        self.report = report

        self.profile.rowCount = self.row_count
        self.profile.columnCount = len(dataframe.columns)

        column_types = {x.name: x.dataType for x in dataframe.schema.fields}

        if self.profiling_config.profile_table_level_only:

            return

        # get column distinct counts
        for column in dataframe.columns:

            if not self.profiling_config.allow_deny_patterns.allowed(column):
                self.ignored_columns.append(column)
                continue

            self.columns_to_profile.append(column)
            # Normal CountDistinct is ridiculously slow
            self.analyzer.addAnalyzer(ApproxCountDistinct(column))

        if self.profiling_config.max_number_of_fields_to_profile is not None:
            if (len(self.columns_to_profile) >
                    self.profiling_config.max_number_of_fields_to_profile):
                columns_being_dropped = self.columns_to_profile[
                    self.profiling_config.max_number_of_fields_to_profile:]
                self.columns_to_profile = self.columns_to_profile[:self.
                                                                  profiling_config
                                                                  .
                                                                  max_number_of_fields_to_profile]

                self.report.report_file_dropped(
                    f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
                )

        analysis_result = self.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsJson(
            self.spark, analysis_result)

        # reshape distinct counts into dictionary
        column_distinct_counts = {
            x["instance"]: int(x["value"])
            for x in analysis_metrics if x["name"] == "ApproxCountDistinct"
        }

        select_numeric_null_counts = [
            count(when(
                isnan(c) | col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if column_types[column] in [DoubleType, FloatType]
        ]

        # PySpark doesn't support isnan() on non-float/double columns
        select_nonnumeric_null_counts = [
            count(when(
                col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if column_types[column] not in [DoubleType, FloatType]
        ]

        null_counts = dataframe.select(select_numeric_null_counts +
                                       select_nonnumeric_null_counts)
        column_null_counts = null_counts.toPandas().T[0].to_dict()
        column_null_fractions = {
            c: column_null_counts[c] / self.row_count
            for c in self.columns_to_profile
        }
        column_nonnull_counts = {
            c: self.row_count - column_null_counts[c]
            for c in self.columns_to_profile
        }

        column_unique_proportions = {
            c:
            (column_distinct_counts[c] /
             column_nonnull_counts[c] if column_nonnull_counts[c] > 0 else 0)
            for c in self.columns_to_profile
        }

        if self.profiling_config.include_field_sample_values:
            # take sample and convert to Pandas DataFrame
            if self.row_count < NUM_SAMPLE_ROWS:
                # if row count is less than number to sample, just take all rows
                rdd_sample = dataframe.rdd.take(self.row_count)
            else:
                rdd_sample = dataframe.rdd.takeSample(False,
                                                      NUM_SAMPLE_ROWS,
                                                      seed=0)

        # init column specs with profiles
        for column in self.columns_to_profile:
            column_profile = DatasetFieldProfileClass(fieldPath=column)

            column_spec = _SingleColumnSpec(column, column_profile)

            column_profile.uniqueCount = column_distinct_counts.get(column)
            column_profile.uniqueProportion = column_unique_proportions.get(
                column)
            column_profile.nullCount = column_null_counts.get(column)
            column_profile.nullProportion = column_null_fractions.get(column)
            if self.profiling_config.include_field_sample_values:
                column_profile.sampleValues = [
                    str(x[column]) for x in rdd_sample
                ]

            column_spec.type_ = column_types[column]
            column_spec.cardinality = _convert_to_cardinality(
                column_distinct_counts[column],
                column_null_fractions[column],
            )

            self.column_specs.append(column_spec)
 def Correlation(self, column1, column2, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Correlation(column1, column2, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     AnalyzerContext.successMetricsAsJson(self.spark, result)
     return result_df.select("value").collect()