def Compliance(self, instance, predicate, where=None):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Compliance(instance, predicate, where)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def UniqueValueRatio(self, columns, where=None):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(UniqueValueRatio(columns, where)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def ApproxQuantiles(self, column, quantiles):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(ApproxQuantiles(column, quantiles)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def test_PatternMatch(self):
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)")).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    self.assertEqual(result_df.select("value").collect(), [Row(value=0.0)])
def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(Histogram(column, binningUdf, maxDetailBins, where)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def test_KLLSketch(self):
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2))).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df.show()
    return result_df.select("value").collect()
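# The test helpers above assume a unittest-style fixture exposing self.spark,
# self.sc, self.df, and self.AnalysisRunner. The sketch below shows one way
# such a fixture could be wired up; the class name and the three-row sample
# DataFrame are illustrative assumptions (test_Size below expects exactly
# three rows), not the suite's actual setup code.
import unittest

import pydeequ
from pyspark.sql import Row, SparkSession
from pydeequ.analyzers import AnalysisRunner


class _ExampleAnalyzersFixture(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Spark session with the Deequ jars on the classpath, as pydeequ requires
        cls.spark = (
            SparkSession.builder.config("spark.jars.packages", pydeequ.deequ_maven_coord)
            .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
            .getOrCreate()
        )
        cls.sc = cls.spark.sparkContext
        cls.AnalysisRunner = AnalysisRunner(cls.spark)
        # assumed sample data: one string column and two numeric columns
        cls.df = cls.spark.createDataFrame(
            [Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]
        )

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()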
def __init__(
    self,
    dataframe: DataFrame,
    spark: SparkSession,
    profiling_config: DataLakeProfilerConfig,
    report: DataLakeSourceReport,
    file_path: str,
):
    self.spark = spark
    self.dataframe = dataframe
    self.analyzer = AnalysisRunner(spark).onData(dataframe)
    self.column_specs = []
    self.row_count = dataframe.count()
    self.profiling_config = profiling_config
    self.file_path = file_path
    self.columns_to_profile = []
    self.ignored_columns = []
    self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
    self.report = report

    self.profile.rowCount = self.row_count
    self.profile.columnCount = len(dataframe.columns)

    column_types = {x.name: x.dataType for x in dataframe.schema.fields}

    if self.profiling_config.profile_table_level_only:
        return

    # get column distinct counts
    for column in dataframe.columns:
        if not self.profiling_config.allow_deny_patterns.allowed(column):
            self.ignored_columns.append(column)
            continue

        self.columns_to_profile.append(column)
        # Normal CountDistinct is ridiculously slow
        self.analyzer.addAnalyzer(ApproxCountDistinct(column))

    if self.profiling_config.max_number_of_fields_to_profile is not None:
        if len(self.columns_to_profile) > self.profiling_config.max_number_of_fields_to_profile:
            columns_being_dropped = self.columns_to_profile[
                self.profiling_config.max_number_of_fields_to_profile :
            ]
            self.columns_to_profile = self.columns_to_profile[
                : self.profiling_config.max_number_of_fields_to_profile
            ]

            self.report.report_file_dropped(
                f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
            )

    analysis_result = self.analyzer.run()
    analysis_metrics = AnalyzerContext.successMetricsAsJson(self.spark, analysis_result)

    # reshape distinct counts into dictionary
    column_distinct_counts = {
        x["instance"]: int(x["value"])
        for x in analysis_metrics
        if x["name"] == "ApproxCountDistinct"
    }

    # note: the type lookup must use the comprehension variable c, not the
    # leftover loop variable from the distinct-count loop above
    select_numeric_null_counts = [
        count(
            when(
                isnan(c) | col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if isinstance(column_types[c], (DoubleType, FloatType))
    ]

    # PySpark doesn't support isnan() on non-float/double columns
    select_nonnumeric_null_counts = [
        count(
            when(
                col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if not isinstance(column_types[c], (DoubleType, FloatType))
    ]

    null_counts = dataframe.select(select_numeric_null_counts + select_nonnumeric_null_counts)
    column_null_counts = null_counts.toPandas().T[0].to_dict()
    column_null_fractions = {
        c: column_null_counts[c] / self.row_count for c in self.columns_to_profile
    }
    column_nonnull_counts = {
        c: self.row_count - column_null_counts[c] for c in self.columns_to_profile
    }

    column_unique_proportions = {
        c: (
            column_distinct_counts[c] / column_nonnull_counts[c]
            if column_nonnull_counts[c] > 0
            else 0
        )
        for c in self.columns_to_profile
    }

    if self.profiling_config.include_field_sample_values:
        # take sample and convert to Pandas DataFrame
        if self.row_count < NUM_SAMPLE_ROWS:
            # if row count is less than number to sample, just take all rows
            rdd_sample = dataframe.rdd.take(self.row_count)
        else:
            rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0)

    # init column specs with profiles
    for column in self.columns_to_profile:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        column_spec = _SingleColumnSpec(column, column_profile)

        column_profile.uniqueCount = column_distinct_counts.get(column)
        column_profile.uniqueProportion = column_unique_proportions.get(column)
        column_profile.nullCount = column_null_counts.get(column)
        column_profile.nullProportion = column_null_fractions.get(column)
        if self.profiling_config.include_field_sample_values:
            column_profile.sampleValues = [str(x[column]) for x in rdd_sample]

        column_spec.type_ = column_types[column]
        column_spec.cardinality = _convert_to_cardinality(
            column_distinct_counts[column],
            column_null_fractions[column],
        )

        self.column_specs.append(column_spec)
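# For reference, AnalyzerContext.successMetricsAsJson returns one record per
# computed metric with "entity", "instance", "name", and "value" fields, which
# is what the dictionary comprehension above filters on. A small illustration
# with made-up column names and values (not output from a real run):
_example_metrics = [
    {"entity": "Column", "instance": "year", "name": "ApproxCountDistinct", "value": 3.0},
    {"entity": "Column", "instance": "name", "name": "ApproxCountDistinct", "value": 12.0},
]
_example_distinct_counts = {
    x["instance"]: int(x["value"]) for x in _example_metrics if x["name"] == "ApproxCountDistinct"
}
assert _example_distinct_counts == {"year": 3, "name": 12}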
def ingest_table(self, full_path: str, relative_path: str, is_aws: bool) -> Iterable[MetadataWorkUnit]:
    table_name = self.get_table_name(relative_path, full_path)

    # yield the table schema first
    logger.debug(
        f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
    )
    yield from self.get_table_schema(full_path, table_name, is_aws)

    # If profiling is not enabled, skip the rest
    if not self.source_config.profiling.enabled:
        return

    # read in the whole table with Spark for profiling
    table = self.read_file_spark(full_path, is_aws)

    # if table is not readable, skip
    if table is None:
        self.report.report_warning(
            table_name, f"unable to read table {table_name} from file {full_path}"
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            full_path,
        )

        logger.debug(
            f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()

        logger.info(f"Finished profiling {full_path}; took {time_taken:.3f} seconds")

        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=make_dataset_urn(self.source_config.platform, table_name, self.source_config.env),
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp)
    self.report.report_workunit(wu)
    yield wu
def get_table_profile(self, table_data: TableData, dataset_urn: str) -> Iterable[MetadataWorkUnit]:
    # read in the whole table with Spark for profiling
    table = None
    try:
        table = self.read_file_spark(
            table_data.table_path, os.path.splitext(table_data.full_path)[1]
        )
    except Exception as e:
        logger.error(e)

    # if table is not readable, skip
    if table is None:
        self.report.report_warning(
            table_data.display_name,
            f"unable to read table {table_data.display_name} from file {table_data.full_path}",
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            table_data.full_path,
        )

        logger.debug(
            f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()

        logger.info(
            f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
        )

        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(
        id=f"profile-{self.source_config.platform}-{table_data.table_path}", mcp=mcp
    )
    self.report.report_workunit(wu)
    yield wu
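# read_file_spark (not shown here, and called with a different second argument
# by the two methods above) is expected to dispatch on the file format and
# return None when the table cannot be read, which is why both callers guard on
# "table is None". A minimal sketch of that contract, assuming only
# parquet/csv/json support; the real method handles more formats and reader
# options than this.
from typing import Optional

from pyspark.sql import DataFrame, SparkSession


def _read_file_by_extension(spark: SparkSession, path: str, extension: str) -> Optional[DataFrame]:
    if extension == ".parquet":
        return spark.read.parquet(path)
    if extension == ".csv":
        # assume a header row; the real source exposes such options as configuration
        return spark.read.option("header", "true").csv(path)
    if extension == ".json":
        return spark.read.json(path)
    return None  # unsupported format: the caller skips profiling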
def test_Size(self):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
    # result_df = result.select('value').collect()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df_row = result_df.select("value").collect()
    self.assertEqual(result_df_row, [Row(value=3.0)])
def Correlation(self, column1, column2, where=None):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Correlation(column1, column2, where)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    AnalyzerContext.successMetricsAsJson(self.spark, result)
    return result_df.select("value").collect()
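# A hedged usage sketch for the parametrized helpers above (Compliance,
# UniqueValueRatio, ApproxQuantiles, Histogram_maxBins, Correlation). Column
# names refer to the fixture assumed earlier; only the shape of the returned
# rows is checked, since the exact metric values are not verified here.
def _exercise_parametrized_helpers(test_case):
    corr = test_case.Correlation("b", "c")           # Pearson correlation of two numeric columns
    ratios = test_case.UniqueValueRatio(["a"])       # fraction of distinct values occurring exactly once
    medians = test_case.ApproxQuantiles("b", [0.5])  # approximate median of column "b"
    for rows in (corr, ratios, medians):
        assert len(rows) >= 1 and hasattr(rows[0], "value")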