def Compliance(self, instance, predicate, where=None):
    """Run a ``Compliance`` analyzer over ``self.df`` and return its metric values.

    The success metrics are exported both as a DataFrame and as JSON; the
    JSON is loaded back into a DataFrame and the ``value`` columns of the
    two exports are asserted to be equal before the rows are returned.

    Args:
        instance: Forwarded as the first argument of the ``Compliance`` analyzer.
        predicate: Forwarded as the second argument of the analyzer.
        where: Optional third argument forwarded to the analyzer.

    Returns:
        The collected ``value`` rows of the DataFrame export.
    """
    builder = self.AnalysisRunner.onData(self.df)
    run_result = builder.addAnalyzer(Compliance(instance, predicate, where)).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    # Round-trip the JSON export through Spark so both exports can be compared.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    json_values = json_df.select("value").collect()
    df_values = metrics_df.select("value").collect()

    self.assertEqual(json_values, df_values)
    return df_values
def ApproxQuantiles(self, column, quantiles):
    """Run an ``ApproxQuantiles`` analyzer over ``self.df`` and return its metric values.

    The metrics are exported as a DataFrame and as JSON; the JSON export is
    read back into a DataFrame and its ``value`` column must match the
    DataFrame export's ``value`` column.

    Args:
        column: Forwarded as the first argument of the analyzer.
        quantiles: Forwarded as the second argument of the analyzer.

    Returns:
        The collected ``value`` rows of the DataFrame export.
    """
    builder = self.AnalysisRunner.onData(self.df)
    run_result = builder.addAnalyzer(ApproxQuantiles(column, quantiles)).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    # Round-trip the JSON export through Spark so both exports can be compared.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    json_values = json_df.select("value").collect()
    df_values = metrics_df.select("value").collect()

    self.assertEqual(json_values, df_values)
    return df_values
def UniqueValueRatio(self, columns, where=None):
    """Run a ``UniqueValueRatio`` analyzer over ``self.df`` and return its metric values.

    The metrics are exported as a DataFrame and as JSON; the JSON export is
    read back into a DataFrame and its ``value`` column must match the
    DataFrame export's ``value`` column.

    Args:
        columns: Forwarded as the first argument of the analyzer.
        where: Optional second argument forwarded to the analyzer.

    Returns:
        The collected ``value`` rows of the DataFrame export.
    """
    builder = self.AnalysisRunner.onData(self.df)
    run_result = builder.addAnalyzer(UniqueValueRatio(columns, where)).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    # Round-trip the JSON export through Spark so both exports can be compared.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    json_values = json_df.select("value").collect()
    df_values = metrics_df.select("value").collect()

    self.assertEqual(json_values, df_values)
    return df_values
def test_PatternMatch(self):
    """Check ``PatternMatch`` on column ``a`` with the regex ``ba(r|z)``.

    Verifies that the JSON and DataFrame exports of the success metrics
    carry the same ``value`` column, and that the metric value is ``0.0``.
    """
    analyzer = PatternMatch(column="a", pattern_regex="ba(r|z)")
    run_result = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    # Round-trip the JSON export through Spark so both exports can be compared.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    json_values = json_df.select("value").collect()
    df_values = metrics_df.select("value").collect()

    self.assertEqual(json_values, df_values)
    self.assertEqual(df_values, [Row(value=0.0)])
def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
    """Run a ``Histogram`` analyzer over ``self.df`` and return its metric values.

    The metrics are exported as a DataFrame and as JSON; the JSON export is
    read back into a DataFrame and its ``value`` column must match the
    DataFrame export's ``value`` column.

    Args:
        column: Forwarded as the first argument of the analyzer.
        binningUdf: Optional second argument forwarded to the analyzer.
        maxDetailBins: Optional third argument forwarded to the analyzer.
        where: Optional fourth argument forwarded to the analyzer.

    Returns:
        The collected ``value`` rows of the DataFrame export.
    """
    builder = self.AnalysisRunner.onData(self.df)
    histogram = Histogram(column, binningUdf, maxDetailBins, where)
    run_result = builder.addAnalyzer(histogram).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    # Round-trip the JSON export through Spark so both exports can be compared.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    json_values = json_df.select("value").collect()
    df_values = metrics_df.select("value").collect()

    self.assertEqual(json_values, df_values)
    return df_values
def __init__(
    self,
    dataframe: DataFrame,
    spark: SparkSession,
    profiling_config: DataLakeProfilerConfig,
    report: DataLakeSourceReport,
    file_path: str,
):
    """Collect table-level and per-column base statistics for ``dataframe``.

    Table-level row and column counts are always recorded on
    ``self.profile``. Unless ``profile_table_level_only`` is set, the
    constructor also computes, for every column allowed by
    ``allow_deny_patterns`` (truncated to
    ``max_number_of_fields_to_profile`` when configured): an approximate
    distinct count, null count, null fraction, unique proportion and,
    optionally, sample values. One ``_SingleColumnSpec`` per profiled
    column is appended to ``self.column_specs``.

    Args:
        dataframe: Spark DataFrame to profile.
        spark: Spark session used to run the analyzers.
        profiling_config: Profiling options read by this constructor.
        report: Report object notified when columns are dropped.
        file_path: Path used in the dropped-columns report message.
    """
    self.spark = spark
    self.dataframe = dataframe
    self.analyzer = AnalysisRunner(spark).onData(dataframe)
    self.column_specs = []
    self.row_count = dataframe.count()
    self.profiling_config = profiling_config
    self.file_path = file_path
    self.columns_to_profile = []
    self.ignored_columns = []
    self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
    self.report = report

    self.profile.rowCount = self.row_count
    self.profile.columnCount = len(dataframe.columns)

    column_types = {x.name: x.dataType for x in dataframe.schema.fields}

    if self.profiling_config.profile_table_level_only:
        return

    # get column distinct counts
    # NOTE(review): analyzers are registered before the field-count limit
    # below is applied, so distinct counts are also computed for columns
    # that end up dropped — confirm whether that is intended.
    for column in dataframe.columns:
        if not self.profiling_config.allow_deny_patterns.allowed(column):
            self.ignored_columns.append(column)
            continue

        self.columns_to_profile.append(column)
        # Normal CountDistinct is ridiculously slow
        self.analyzer.addAnalyzer(ApproxCountDistinct(column))

    max_fields = self.profiling_config.max_number_of_fields_to_profile
    if max_fields is not None and len(self.columns_to_profile) > max_fields:
        columns_being_dropped = self.columns_to_profile[max_fields:]
        self.columns_to_profile = self.columns_to_profile[:max_fields]

        self.report.report_file_dropped(
            f"The max_number_of_fields_to_profile={max_fields} reached. "
            f"Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
        )

    analysis_result = self.analyzer.run()
    analysis_metrics = AnalyzerContext.successMetricsAsJson(
        self.spark, analysis_result
    )

    # reshape distinct counts into dictionary
    column_distinct_counts = {
        x["instance"]: int(x["value"])
        for x in analysis_metrics
        if x["name"] == "ApproxCountDistinct"
    }

    # Schema fields carry DataType *instances*, so the float/double check
    # must use isinstance on the type of the column being iterated (``c``).
    float_like_types = (DoubleType, FloatType)

    select_numeric_null_counts = [
        count(
            when(
                isnan(c) | col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if isinstance(column_types[c], float_like_types)
    ]

    # PySpark doesn't support isnan() on non-float/double columns
    select_nonnumeric_null_counts = [
        count(
            when(
                col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if not isinstance(column_types[c], float_like_types)
    ]

    null_counts = dataframe.select(
        select_numeric_null_counts + select_nonnumeric_null_counts
    )
    column_null_counts = null_counts.toPandas().T[0].to_dict()

    # Guard against an empty dataframe: with zero rows there is nothing to
    # be null, so report a fraction of 0.0 instead of dividing by zero.
    column_null_fractions = {
        c: (column_null_counts[c] / self.row_count if self.row_count > 0 else 0.0)
        for c in self.columns_to_profile
    }
    column_nonnull_counts = {
        c: self.row_count - column_null_counts[c] for c in self.columns_to_profile
    }

    column_unique_proportions = {
        c: (
            column_distinct_counts[c] / column_nonnull_counts[c]
            if column_nonnull_counts[c] > 0
            else 0
        )
        for c in self.columns_to_profile
    }

    if self.profiling_config.include_field_sample_values:
        # take sample and convert to Pandas DataFrame
        if self.row_count < NUM_SAMPLE_ROWS:
            # if row count is less than number to sample, just take all rows
            rdd_sample = dataframe.rdd.take(self.row_count)
        else:
            rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0)

    # init column specs with profiles
    for column in self.columns_to_profile:
        column_profile = DatasetFieldProfileClass(fieldPath=column)

        column_spec = _SingleColumnSpec(column, column_profile)

        column_profile.uniqueCount = column_distinct_counts.get(column)
        column_profile.uniqueProportion = column_unique_proportions.get(column)
        column_profile.nullCount = column_null_counts.get(column)
        column_profile.nullProportion = column_null_fractions.get(column)
        if self.profiling_config.include_field_sample_values:
            column_profile.sampleValues = [str(x[column]) for x in rdd_sample]

        column_spec.type_ = column_types[column]
        column_spec.cardinality = _convert_to_cardinality(
            column_distinct_counts[column],
            column_null_fractions[column],
        )

        self.column_specs.append(column_spec)
def Correlation(self, column1, column2, where=None):
    """Run a ``Correlation`` analyzer over ``self.df`` and return its metric values.

    The JSON export of the metrics is also invoked, but its result is
    discarded; no comparison against the DataFrame export is made here.

    Args:
        column1: Forwarded as the first argument of the analyzer.
        column2: Forwarded as the second argument of the analyzer.
        where: Optional third argument forwarded to the analyzer.

    Returns:
        The collected ``value`` rows of the DataFrame export.
    """
    builder = self.AnalysisRunner.onData(self.df)
    run_result = builder.addAnalyzer(Correlation(column1, column2, where)).run()

    metrics_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, run_result)
    # Only exercised for its side effects / to surface export errors; the
    # returned JSON is intentionally unused.
    AnalyzerContext.successMetricsAsJson(self.spark, run_result)

    value_rows = metrics_df.select("value").collect()
    return value_rows