def test_yield_correct_results(self, df_with_numeric_values):
    df = df_with_numeric_values
    base_check = Check(CheckLevel.EXCEPTION, description="a description")
    analyzers = [
        Minimum("att1"),
        Maximum("att1"),
        Mean("att1"),
        StandardDeviation("att1"),
        Sum("att1"),
        Quantile("att1", 0.5),
    ]

    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    context_numeric = do_analysis_run(engine, repo, analyzers)

    assert is_success(base_check.has_min("att1", lambda v: v == 1.0), context_numeric)
    assert is_success(base_check.has_max("att1", lambda v: v == 6.0), context_numeric)
    assert is_success(base_check.has_mean("att1", lambda v: v == 3.5), context_numeric)
    assert is_success(
        base_check.has_standard_deviation("att1", lambda v: v == 1.870829),
        context_numeric,
    )
    assert is_success(base_check.has_sum("att1", lambda v: v == 21.0), context_numeric)
    assert is_success(
        base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
        context_numeric,
    )
def test_return_result_for_configured_analyzers(self, df_full):
    analyzers = [
        Size(),
        Minimum("item"),
        Completeness("item"),
    ]

    engine = PandasEngine(df_full)
    repo = InMemoryMetadataRepository()
    ac = do_analysis_run(engine, repo, analyzers)
    sm = AnalyzerContext.success_metrics_as_dataframe(ac)

    expected = pd.DataFrame(
        [
            ("DATASET", "*", "Size", 4.0),
            ("COLUMN", "item", "Minimum", 1.0),
            ("COLUMN", "item", "Completeness", 1.0),
        ],
        columns=("entity", "instance", "name", "value"),
    )
    ConnectionHandler.close_connections()
    assert_frame_equal(sm, expected, check_like=True)
def run_checks(data, *checks) -> AnalyzerContext:
    """Run every analyzer required by the given checks against `data`,
    using an in-memory metadata repository."""
    analyzers = tuple(a for check in checks for a in check.required_analyzers())
    engine = PandasEngine(data)
    repo = InMemoryMetadataRepository()
    result = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()
    return result
def run_checks(data, *checks) -> AnalyzerContext:
    """Variant of `run_checks` that stores metadata in an in-memory
    DuckDB-backed repository instead of the plain in-memory one."""
    analyzers = tuple(a for check in checks for a in check.required_analyzers())
    engine = PandasEngine(data)
    repo = SQLMetadataRepositoryFactory.create_sql_metadata_repository(
        "duckdb://:memory:"
    )
    repo.set_dataset("data", "1")
    result = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()
    return result
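# Illustrative only: a minimal sketch of driving `run_checks` directly. It
# assumes the names already used elsewhere in these tests (pd, Check,
# CheckLevel, is_success) are imported at module level; the DataFrame
# contents and the asserted minimum are made up for this example.
def example_run_checks_usage():
    df = pd.DataFrame({"att1": [1.0, 2.0, 3.0]})
    check = Check(CheckLevel.EXCEPTION, description="example").has_min(
        "att1", lambda v: v == 1.0
    )
    ctx = run_checks(df, check)
    assert is_success(check, ctx)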
def run(
    self,
    data: DataFrame,
    dataset_id: str = None,
    partition_id: str = None,
) -> VerificationResult:
    # TODO: maybe drop this function
    """
    Runs all check groups and returns the verification result.

    The verification result includes all metrics computed during the run.

    Parameters
    ----------
    data:
        Tabular data on which the checks should be verified.
    dataset_id:
        Optional identifier of the dataset being verified.
    partition_id:
        Optional identifier of the partition being verified.
    """
    engine = PandasEngine(data)
    repo = InMemoryMetadataRepository()
    repo.set_dataset(dataset_id, partition_id)
    return self.do_verification_run(
        engine, repo, self._checks, self._required_analyzers
    )
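# A minimal usage sketch for `run`. Only `run` itself appears here, so the
# surrounding class and the way checks get registered are assumptions: the
# owner is taken to be a VerificationSuite-style class, and `add_check` is a
# hypothetical method that populates `self._checks`:
#
#   suite = VerificationSuite()
#   suite.add_check(check)  # hypothetical registration method
#   result = suite.run(df, dataset_id="sales", partition_id="2021-01")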
def test_return_basic_statistics(self, df_with_numeric_values):
    df = df_with_numeric_values
    analyzers = [
        Mean("att1"),
        StandardDeviation("att1"),
        Minimum("att1"),
        Maximum("att1"),
        ApproxDistinctness("att1"),
        ApproxDistinctness("att2"),
    ]

    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    result_metrics = do_analysis_run(engine, repo, analyzers).all_metrics()
    ConnectionHandler.close_connections()

    assert len(result_metrics) == len(analyzers)
    assert DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5)) in result_metrics
    assert (
        DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "ApproxDistinctness", "att1", Success(1.0))
        in result_metrics
    )
    assert (
        DoubleMetric(
            Entity.COLUMN, "ApproxDistinctness", "att2", Success(0.6666666716337205)
        )
        in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "StandardDeviation", "att1", Success(1.870829))
        in result_metrics
    )
def test_run_individual_analyzer_only_once(self, df_full):
    analyzers = [
        Minimum("item"),
        Minimum("item"),
        Minimum("item"),
    ]

    engine = PandasEngine(df_full)
    repo = InMemoryMetadataRepository()
    ac = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()

    assert len(ac.all_metrics()) == 1
    metric = ac.metric(Minimum("item"))
    assert metric is not None
    assert metric.value.get() == 1
def test_run_analyzers_with_different_where_conditions_separately(
    self, df_with_numeric_values
):
    df = df_with_numeric_values
    analyzers = [
        Maximum("att1"),
        Maximum("att1", where="att1 > att2"),
    ]

    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    ctx = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()

    assert ctx.metric(analyzers[0]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(6.0)
    )
    assert ctx.metric(analyzers[1]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(3.0)
    )
def test_multiple_quantiles_are_computed(self, df_with_numeric_values):
    df = df_with_numeric_values
    analyzers = [
        Quantile("att1", 0.1),
        Quantile("att1", 0.5),
        Quantile("att1", 0.9),
    ]

    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    context_numeric = do_analysis_run(engine, repo, analyzers)

    assert len(context_numeric.metric_map) == 3

    base_check = Check(CheckLevel.EXCEPTION, description="a description")
    assert is_success(
        base_check.has_approx_quantile("att1", 0.1, lambda v: v == 1.0),
        context_numeric,
    )
    assert is_success(
        base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
        context_numeric,
    )
    assert is_success(
        base_check.has_approx_quantile("att1", 0.9, lambda v: v == 6.0),
        context_numeric,
    )
def on_data(self, data: DataFrame, dataset_id: str = None, partition_id: str = None):
    """Start building a verification run on `data`."""
    engine = PandasEngine(data)
    return VerificationRunBuilder(engine, dataset_id, partition_id)
def on_data_no_sharing(
    self, data: DataFrame, dataset_id: str = None, partition_id: str = None
):
    """Same as `on_data`, but constructs the engine with `no_sharing=True`."""
    engine = PandasEngine(data, no_sharing=True)
    return VerificationRunBuilder(engine, dataset_id, partition_id)
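# Sketch of the builder flow these entry points enable. The methods on the
# returned VerificationRunBuilder (e.g. `add_check`, `run`) are assumed from
# the builder pattern and are not shown in this file:
#
#   result = (
#       VerificationSuite()
#       .on_data(df, dataset_id="sales", partition_id="2021-01")
#       .add_check(check)
#       .run()
#   )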