def Compliance(self, instance, predicate, where=None):
    # Run the Compliance analyzer, check that the JSON and DataFrame metric
    # outputs agree, and return the collected metric values.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(Compliance(instance, predicate, where)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
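These helpers and the tests below lean on a shared unittest fixture: self.spark, self.sc, self.df and self.AnalysisRunner. A minimal fixture sketch follows; the Spark configuration matches the pydeequ README, while the three-row DataFrame and its column names a, b, c are illustrative assumptions (the row count of three is at least consistent with test_Size further down).

import pydeequ
from pydeequ.analyzers import AnalysisRunner
from pyspark.sql import Row, SparkSession

def setUp(self):
    # SparkSession wired with the deequ jar, as the pydeequ README recommends
    self.spark = (
        SparkSession.builder
        .config("spark.jars.packages", pydeequ.deequ_maven_coord)
        .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
        .getOrCreate()
    )
    self.sc = self.spark.sparkContext
    # illustrative three-row frame (column names and values are assumptions)
    self.df = self.spark.createDataFrame(
        [Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]
    )
    self.AnalysisRunner = AnalysisRunner(self.spark)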
def UniqueValueRatio(self, columns, where=None):
    # Run the UniqueValueRatio analyzer over the given columns and return the
    # collected metric values after cross-checking the JSON output.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(UniqueValueRatio(columns, where)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def ApproxQuantiles(self, column, quantiles):
    # Run the ApproxQuantiles analyzer for the requested quantiles and return
    # the collected metric values after cross-checking the JSON output.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(ApproxQuantiles(column, quantiles)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def test_KLLSketch(self):
    # Run the KLLSketch analyzer on column "b" and show the resulting metrics.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2))).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df.show()
    return result_df.select("value").collect()
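The three numeric arguments to KLLParameters appear to correspond, positionally, to the sketch size, the shrinking factor and the number of detail buckets. Treating them that way, a standalone sketch outside the test class with more conventional values (2048 / 0.64 / 10; these particular numbers are an assumption, not taken from the test) would look like:

result = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(KLLSketch("b", KLLParameters(spark, 2048, 0.64, 10)))
    .run()
)
AnalyzerContext.successMetricsAsDataFrame(spark, result).show(truncate=False)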
def test_PatternMatch(self):
    # Run the PatternMatch analyzer on column "a", check the JSON round-trip,
    # and assert on the expected match ratio.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)")).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    self.assertEqual(result_df.select("value").collect(), [Row(value=0.0)])
def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
    # Run the Histogram analyzer with an optional binning UDF, bin limit and
    # row filter, and return the collected metric values.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(Histogram(column, binningUdf, maxDetailBins, where)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
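Several of these wrappers also expose deequ's optional where argument, a SQL predicate that restricts which rows feed the metric. A hedged usage sketch against the fixture above (the predicates, instance label and column names are illustrative):

# compliance computed only over rows where c is populated
self.Compliance("b is positive", "b > 0", where="c IS NOT NULL")

# unique-value ratio over a filtered subset of rows
self.UniqueValueRatio(["a"], where="b < 3")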
Example #7
    def ingest_table(self, full_path: str, relative_path: str,
                     is_aws: bool) -> Iterable[MetadataWorkUnit]:

        table_name = self.get_table_name(relative_path, full_path)

        # yield the table schema first
        logger.debug(
            f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        yield from self.get_table_schema(full_path, table_name, is_aws)

        # If profiling is not enabled, skip the rest
        if not self.source_config.profiling.enabled:
            return

        # read in the whole table with Spark for profiling
        table = self.read_file_spark(full_path, is_aws)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_name,
                f"unable to read table {table_name} from file {full_path}")
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                full_path,
            )

            logger.debug(
                f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=make_dataset_urn(self.source_config.platform, table_name,
                                       self.source_config.env),
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
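The comments in the profiling block above capture the key design choice: every requested metric is registered first, and deequ computes them all in a single analyzer.run() pass so scans and intermediate results are shared. Outside of DataHub's _SingleTableProfiler, the same pattern in plain pydeequ looks roughly like this (the analyzers and column names are illustrative, and spark / table stand in for the session and DataFrame used above):

from pydeequ.analyzers import (
    AnalysisRunner,
    AnalyzerContext,
    ApproxCountDistinct,
    Completeness,
    Mean,
    Size,
)

# register everything up front, then trigger a single pass over the table
analysis_result = (
    AnalysisRunner(spark)
    .onData(table)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("col_a"))
    .addAnalyzer(ApproxCountDistinct("col_a"))
    .addAnalyzer(Mean("col_b"))
    .run()
)
analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(spark, analysis_result)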
Example #8
    def get_table_profile(self, table_data: TableData,
                          dataset_urn: str) -> Iterable[MetadataWorkUnit]:
        # read in the whole table with Spark for profiling
        table = None
        try:
            table = self.read_file_spark(
                table_data.table_path,
                os.path.splitext(table_data.full_path)[1])
        except Exception as e:
            logger.error(e)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_data.display_name,
                f"unable to read table {table_data.display_name} from file {table_data.full_path}",
            )
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                table_data.full_path,
            )

            logger.debug(
                f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{table_data.table_path}",
            mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
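extract_table_profiles then maps the analysis metrics onto the datasetProfile aspect. The DataFrame returned by AnalyzerContext.successMetricsAsDataFrame carries entity, instance, name and value columns, so one straightforward way to index it (a sketch, not DataHub's actual implementation) is:

# index metrics by (column instance, metric name); deequ reports dataset-level
# metrics such as Size against the instance "*"
metrics = {
    (row["instance"], row["name"]): row["value"]
    for row in analysis_metrics.collect()
}
row_count = metrics.get(("*", "Size"))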
def test_Size(self):
    # Size counts rows; the fixture DataFrame has three.
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df_row = result_df.select("value").collect()
    self.assertEqual(result_df_row, [Row(value=3.0)])
def Correlation(self, column1, column2, where=None):
    # Run the Correlation analyzer on the two columns and return the collected
    # metric values.
    result = (
        self.AnalysisRunner.onData(self.df).addAnalyzer(Correlation(column1, column2, where)).run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    return result_df.select("value").collect()
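A concrete test built on this wrapper would call it and assert on the collected values. A hedged example (the self-correlation property holds for any numeric column with non-zero variance; the column name b assumes the fixture sketched earlier):

def test_Correlation(self):
    # Pearson correlation of a column with itself is 1.0, up to float rounding
    value = self.Correlation("b", "b")[0]["value"]
    self.assertAlmostEqual(value, 1.0)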