def test_compute_correct_metrics(self, df_with_numeric_values):
    df = df_with_numeric_values
    assert Compliance("rule1", "att1 > 3").calculate(df) == DoubleMetric(
        Entity.COLUMN, "Compliance", "rule1", Success(3.0 / 6.0))
    assert Compliance("rule2", "att1 > 2").calculate(df) == DoubleMetric(
        Entity.COLUMN, "Compliance", "rule2", Success(4.0 / 6.0))

def test_computes_correct_metrics(self, df_missing):
    assert (len(Completeness("some_missing_column").preconditions()) == 1
            ), "should check column name"
    assert Completeness("att1").calculate(df_missing) == DoubleMetric(
        Entity.COLUMN, "Completeness", "att1", Success(0.5))
    assert Completeness("att2").calculate(df_missing) == DoubleMetric(
        Entity.COLUMN, "Completeness", "att2", Success(0.75))

def metric_from_value(value: Union[float, Dict[str, str]], name: str,
                      instance: str,
                      entity: Entity) -> Union[DoubleMetric, SchemaMetric]:
    if isinstance(value, (float, int)):
        return DoubleMetric(entity, name, instance, Success(value))
    elif isinstance(value, dict):
        return SchemaMetric(entity, name, instance, Success(value))
    else:
        raise NoMetricForValueException(
            f"Can not create a Metric for value type {value.__class__.__name__}"
        )

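# Illustrative sketch (not part of the test suite): how metric_from_value above
# dispatches on the type of `value`. It assumes the same DoubleMetric,
# SchemaMetric, Entity, Success and NoMetricForValueException names used
# elsewhere in this file; the metric names and the "att1" instance are
# placeholders chosen for the example.
def example_metric_from_value_dispatch():
    # a numeric value produces a DoubleMetric
    numeric = metric_from_value(0.5, "Completeness", "att1", Entity.COLUMN)
    assert numeric == DoubleMetric(Entity.COLUMN, "Completeness", "att1",
                                   Success(0.5))

    # a dict value produces a SchemaMetric
    schema = metric_from_value({"att1": "int64"}, "Schema", "att1",
                               Entity.COLUMN)
    assert isinstance(schema, SchemaMetric)

    # any other value type falls through to NoMetricForValueException
    try:
        metric_from_value("not-a-metric", "Schema", "att1", Entity.COLUMN)
    except NoMetricForValueException:
        pass
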
def test_filtered_uniqueness(sample_data):
    df = pd.DataFrame(
        [("1", "unique"), ("2", "unique"), ("3", "duplicate"),
         ("3", "duplicate"), ("4", "unique")],
        columns=("value", "type"))

    uniq = Uniqueness(["value"])
    uniq_with_filter = Uniqueness(["value"], "type=='unique'")

    assert uniq.calculate(df) == DoubleMetric(
        Entity.COLUMN, "Uniqueness", ",".join(["value"]), Success(0.6))
    assert uniq_with_filter.calculate(df) == DoubleMetric(
        Entity.COLUMN, "Uniqueness", ",".join(["value"]), Success(1.0))

def test_run_analyzers_with_different_where_conditions_separately(
        self, df_with_numeric_values):
    df = df_with_numeric_values
    analyzers = [
        Maximum("att1"),
        Maximum("att1", where="att1 > att2"),
    ]
    ctx = do_analysis_run(df, analyzers)

    assert ctx.metric(analyzers[0]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(6.0))
    assert ctx.metric(analyzers[1]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(3.0))

def test_filter_should_accept_custom_exception_and_message(self):
    class DummyException(Exception):
        pass

    failure = Success(1).filter(lambda x: False, DummyException, "dummy")
    self.assertRaises(DummyException, failure.get)
    self.assertEqual(repr(failure), "Failure(DummyException('dummy',))")

def test_equality_of_failure_should_be_based_on_a_type_and_args(self):
    self.assertEqual(Failure(Exception("e")), Failure(Exception("e")))
    self.assertNotEqual(Failure(Exception("foo")), Failure(Exception("bar")))
    self.assertNotEqual(Failure(ZeroDivisionError()), Failure(TypeError()))
    self.assertNotEqual(Failure(Exception("e")), Failure(TypeError("e")))
    self.assertNotEqual(Failure(Exception()), Success(1))

def test_computes_max_value_with_predicate_correctly(
        self, df_with_numeric_values):
    data = df_with_numeric_values
    col = "att1"
    a = Maximum(col, where="item != '6'")
    value = a.calculate(data).value
    assert value == Success(5.0)

def test_uniqueness_should_be_correct_for_multiple_fields(sample_data):
    df = sample_data
    # "Address Line 1" is unique on its own, so the column combination
    # must be unique as well; this should also work when the columns
    # contain None
    cols = ["Address Line 1", "Address Line 3"]
    assert Uniqueness(cols).calculate(df) == DoubleMetric(
        Entity.MULTICOLUMN, "Uniqueness", ",".join(cols), Success(1.0))

def test_computes_std_value_with_predicate_correctly(
        self, df_with_numeric_values):
    data = df_with_numeric_values
    col = "att1"
    a = StandardDeviation(col, where="item != '6'")
    value = a.calculate(data).value
    assert value == Success(1.4142135623730951)

def test_computes_correct_metrics(self):
    df = pd.DataFrame({"col": ["miguel", "benjamin", "miguelito"]})
    assert PatternMatch("col", r"^miguel").calculate(df) == DoubleMetric(
        entity=Entity.COLUMN,
        name="PatternMatch",
        instance="col",
        value=Success(0.6666666666666666),
    )

def test_generator_with_argument(self):
    def f():
        x = None
        while True:
            x = yield x

    g = f()
    g.send(None)  # prime the generator before passing it to Try
    self.assertEqual(Try(g, 41).map(lambda x: x + 1), Success(42))

def test_fail_with_unhashable_value(self):
    with pytest.raises(TypeError):
        hash(Success([1]))

    class UnhashableException(Exception):
        def __hash__(self):
            raise TypeError()

    with pytest.raises(TypeError):
        hash(Failure(UnhashableException()))

def test_double_metric_should_flatten():
    metric = DoubleMetric(Entity.COLUMN, "metric-name", "instance-name",
                          Success(50))
    assert metric.flatten() == (metric, )

    metric = DoubleMetric(Entity.COLUMN, "metric-name", "instance-name",
                          Failure(Exception("sample")))
    assert metric.flatten() == (metric, )

def test_return_basic_statistics(self, df_with_numeric_values):
    df = df_with_numeric_values
    analyzers = [
        Mean("att1"),
        StandardDeviation("att1"),
        Minimum("att1"),
        Maximum("att1"),
        # CountDistinct("att1")
    ]

    result_metrics = do_analysis_run(df, analyzers).all_metrics()

    assert len(result_metrics) == len(analyzers)
    assert (DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5))
            in result_metrics)
    assert (DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0))
            in result_metrics)
    assert (DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0))
            in result_metrics)
    assert (DoubleMetric(Entity.COLUMN, "StandardDeviation", "att1",
                         Success(1.707825127659933)) in result_metrics)

def test_run_analyzers_with_different_where_conditions_separately(
    self, df_with_numeric_values
):
    df = df_with_numeric_values
    analyzers = [
        Maximum("att1"),
        Maximum("att1", where="att1 > att2"),
    ]
    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    ctx = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()

    assert ctx.metric(analyzers[0]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(6.0)
    )
    assert ctx.metric(analyzers[1]) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(3.0)
    )

def test_match_urls(self):
    maybe_urls = [
        "http://foo.com/blah_blah",
        "http://foo.com/blah_blah_(wikipedia)",
        "http://foo.bar/?q=Test%20URL-encoded%20stuff",
        "http://➡.ws/䨹",
        "http://⌘.ws/",
        "http://☺.damowmow.com/",
        "http://例子.测试",
        "https://foo_bar.example.com/",
        "http://[email protected]:8080",
        "http://foo.com/blah_(wikipedia)#cite-1",
        "http://../",  # not really a valid URL
        "h://test",  # not really a valid URL
        "http://.www.foo.bar/",  # not really a valid URL
    ]

    df = pd.DataFrame({"some": maybe_urls})
    result = PatternMatch("some", hpatterns.URL).calculate(df)
    assert result.value == Success(10 / 13.0)

def test_match_credit_card_numbers(self):
    maybe_cc_numbers = [
        "378282246310005",  # AMEX
        "6011111111111117",  # Discover
        "6011 1111 1111 1117",  # Discover spaced
        "6011-1111-1111-1117",  # Discover dashed
        "5555555555554444",  # MasterCard
        "5555 5555 5555 4444",  # MasterCard spaced
        "5555-5555-5555-4444",  # MasterCard dashed
        "4111111111111111",  # Visa
        "4111 1111 1111 1111",  # Visa spaced
        "4111-1111-1111-1111",  # Visa dashed
        "0000111122223333",  # not really a CC number
        "000011112222333",  # not really a CC number
        "00001111222233",  # not really a CC number
    ]

    df = pd.DataFrame({"some": maybe_cc_numbers})
    result = PatternMatch("some", hpatterns.CREDITCARD).calculate(df)
    assert result.value == Success(10.0 / 13.0)

def test_return_basic_statistics(self, df_with_numeric_values):
    df = df_with_numeric_values
    analyzers = [
        Mean("att1"),
        StandardDeviation("att1"),
        Minimum("att1"),
        Maximum("att1"),
        ApproxDistinctness("att1"),
        ApproxDistinctness("att2"),
    ]
    engine = PandasEngine(df_with_numeric_values)
    repo = InMemoryMetadataRepository()
    result_metrics = do_analysis_run(engine, repo, analyzers).all_metrics()
    ConnectionHandler.close_connections()

    assert len(result_metrics) == len(analyzers)
    assert (
        DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0))
        in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "ApproxDistinctness", "att1", Success(1.0))
        in result_metrics
    )
    assert (
        DoubleMetric(
            Entity.COLUMN, "ApproxDistinctness", "att2", Success(0.6666666716337205)
        )
        in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0))
        in result_metrics
    )
    assert (
        DoubleMetric(
            Entity.COLUMN, "StandardDeviation", "att1", Success(1.870829)
        )
        in result_metrics
    )

def metric_from_value(value: float, name: str, instance: str,
                      entity: Entity) -> DoubleMetric:
    return DoubleMetric(entity, name, instance, Success(value))

def test_flatmap_should_fail_if_f_doesnt_return_try(self):
    self.assertRaises(TypeError, Success(1).flatMap, lambda x: x)

def test_match_email_addresses(self):
    col = "some"
    df = pd.DataFrame({col: ["*****@*****.**", "someone@else"]})
    assert PatternMatch(
        col, hpatterns.EMAIL).calculate(df).value == Success(0.5)

def test_computes_correct_metrics(self, data):
    a = Size()
    metric = a.calculate(data)
    assert metric == DoubleMetric(Entity.DATASET, "Size", "*",
                                  Success(len(data)))

def test_compute_correct_metric_with_filtering(self, df_with_numeric_values):
    df = df_with_numeric_values
    result = Compliance("rule1", "att2 == 0", "att1 < 4").calculate(df)
    assert result == DoubleMetric(Entity.COLUMN, "Compliance", "rule1",
                                  Success(1.0))

def test_map_on_success_should_return_value_depending_on_a_function(self):
    success = Success(1).map(lambda x: -x)
    self.assertTrue(success.isSuccess)
    self.assertEqual(success.get(), -1)

def test_filter_on_success_should_return_value_depending_on_a_predicate(
        self):
    self.assertTrue(Success(1).filter(lambda x: x > 0).isSuccess)
    self.assertTrue(Success(-1).filter(lambda x: x > 0).isFailure)

def test_works_with_filtering(self, df_missing):
    result = Completeness("att1", "item==1 or item==2").calculate(df_missing)
    assert result == DoubleMetric(Entity.COLUMN, "Completeness", "att1",
                                  Success(1.0))

def test_recover_on_success_should_return_identity(self):
    success = Success(1)
    self.assertEqual(success.recover(lambda x: 1 / 0), success)

def test_recover_with_on_success_should_return_identity(self):
    success = Success(1)
    self.assertEqual(success.recoverWith(lambda x: Try(lambda x: -1)), success)

def test_uniqueness_should_be_correct_for_a_single_column(sample_data):
    df = sample_data
    col = "Address Line 1"
    assert Uniqueness([col]).calculate(df) == DoubleMetric(
        Entity.COLUMN, "Uniqueness", col, Success(1.0))