def test_return_basic_statistics(self, df_with_numeric_values):
    """Run the basic numeric analyzers on ``att1`` and check each metric."""
    column = "att1"
    analyzers = [
        Mean(column),
        StandardDeviation(column),
        Minimum(column),
        Maximum(column),
        # CountDistinct("att1") is currently disabled.
    ]

    result_metrics = do_analysis_run(df_with_numeric_values, analyzers).all_metrics()

    # One metric is expected per analyzer.
    assert len(result_metrics) == len(analyzers)

    expected_values = (
        ("Mean", 3.5),
        ("Minimum", 1.0),
        ("Maximum", 6.0),
        ("StandardDeviation", 1.707825127659933),
    )
    for metric_name, value in expected_values:
        expected = DoubleMetric(Entity.COLUMN, metric_name, column, Success(value))
        assert expected in result_metrics
def test_match_urls(self):
    """Check the fraction of candidate strings matched by the URL pattern."""
    column = "some"
    candidates = [
        "http://foo.com/blah_blah",
        "http://foo.com/blah_blah_(wikipedia)",
        "http://foo.bar/?q=Test%20URL-encoded%20stuff",
        "http://➡.ws/䨹",
        "http://⌘.ws/",
        "http://☺.damowmow.com/",
        "http://例子.测试",
        "https://foo_bar.example.com/",
        # NOTE(review): the next entry looks like it was altered by e-mail
        # obfuscation somewhere upstream -- confirm the intended literal.
        "http://[email protected]:8080",
        "http://foo.com/blah_(wikipedia)#cite-1",
        "http://../",  # not really a valid URL
        "h://test",  # not really a valid URL
        "http://.www.foo.bar/"  # not really a valid URL
    ]
    frame = pd.DataFrame({column: candidates})

    metric = PatternMatch(column, hpatterns.URL).calculate(frame)

    # 10 of the 13 candidates are expected to match.
    expected_ratio = 10 / 13.0
    assert metric.value == Success(expected_ratio)
def test_match_credit_card_numbers(self):
    """Check the fraction of candidates matched by the credit-card pattern."""
    column = "some"
    candidates = [
        "378282246310005",  # AMEX
        "6011111111111117",  # Discover
        "6011 1111 1111 1117",  # Discover spaced
        "6011-1111-1111-1117",  # Discover dashed
        "5555555555554444",  # MasterCard
        "5555 5555 5555 4444",  # MasterCard spaced
        "5555-5555-5555-4444",  # MasterCard dashed
        "4111111111111111",  # Visa
        "4111 1111 1111 1111",  # Visa spaced
        "4111-1111-1111-1111",  # Visa dashed
        "0000111122223333",  # not really a CC number
        "000011112222333",  # not really a CC number
        "00001111222233",  # not really a CC number
    ]
    frame = pd.DataFrame({column: candidates})

    metric = PatternMatch(column, hpatterns.CREDITCARD).calculate(frame)

    # 10 of the 13 candidates are expected to match.
    expected_ratio = 10.0 / 13.0
    assert metric.value == Success(expected_ratio)
def test_return_basic_statistics(self, df_with_numeric_values):
    """Run numeric analyzers via a ``PandasEngine`` and check every metric.

    The run uses an ``InMemoryMetadataRepository``; connections are closed
    right after the metrics are collected, before any assertion runs.
    """
    analyzers = [
        Mean("att1"),
        StandardDeviation("att1"),
        Minimum("att1"),
        Maximum("att1"),
        ApproxDistinctness("att1"),
        ApproxDistinctness("att2"),
    ]
    engine = PandasEngine(df_with_numeric_values)
    repository = InMemoryMetadataRepository()

    result_metrics = do_analysis_run(engine, repository, analyzers).all_metrics()
    ConnectionHandler.close_connections()

    # One metric is expected per analyzer.
    assert len(result_metrics) == len(analyzers)

    # (metric name, column, expected value), checked in this order.
    expectations = (
        ("Mean", "att1", 3.5),
        ("Minimum", "att1", 1.0),
        ("ApproxDistinctness", "att1", 1.0),
        ("ApproxDistinctness", "att2", 0.6666666716337205),
        ("Maximum", "att1", 6.0),
        ("StandardDeviation", "att1", 1.870829),
    )
    for metric_name, column, value in expectations:
        expected = DoubleMetric(Entity.COLUMN, metric_name, column, Success(value))
        assert expected in result_metrics
def test_recover_with_on_success_should_return_identity(self):
    """``recoverWith`` on a ``Success`` must compare equal to that ``Success``."""
    original = Success(1)
    recovered = original.recoverWith(lambda x: Try(lambda x: -1))
    self.assertEqual(recovered, original)
def test_uniqunes_should_be_correct_for_a_single_column(sample_data):
    """``Uniqueness`` over the single column ``Address Line 1`` should be 1.0."""
    column = "Address Line 1"
    actual = Uniqueness([column]).calculate(sample_data)
    expected = DoubleMetric(Entity.COLUMN, "Uniqueness", column, Success(1.0))
    assert actual == expected
def test_or_else_on_success_should_return_identity(self):
    """``orElse`` on a ``Success`` must compare equal to that ``Success``."""
    original = Success(1)
    outcome = original.orElse(lambda: 1)
    self.assertEqual(outcome, original)
def test_recover_on_success_should_return_identity(self):
    """``recover`` on a ``Success`` must compare equal to that ``Success``.

    The recovery callback would raise ``ZeroDivisionError`` if it were called.
    """
    original = Success(1)
    outcome = original.recover(lambda x: 1 / 0)
    self.assertEqual(outcome, original)
def test_generator_without_arguments(self):
    """A ``Try`` built from a generator yielding 1, mapped with ``+ 1``, equals ``Success(2)``."""

    def single_item():
        yield 1

    mapped = Try(single_item()).map(lambda x: x + 1)
    self.assertEqual(mapped, Success(2))
def metric_from_value(
    value: float,
    name: str,
    instance: str,
    entity: Entity,
) -> DoubleMetric:
    """Build a ``DoubleMetric`` whose value is ``Success(value)``.

    The metric is constructed from ``entity``, ``name`` and ``instance``,
    in that order, followed by the wrapped value.
    """
    wrapped_value = Success(value)
    return DoubleMetric(entity, name, instance, wrapped_value)
def test_flatmap_on_failure_should_return_failure(self):
    """``flatMap`` applied to a ``Failure`` must still report ``isFailure``."""
    failure = Failure(Exception(""))
    outcome = failure.flatMap(lambda x: Success(1))
    self.assertTrue(outcome.isFailure)
def test_match_email_addresses(self):
    """Half of the two candidate strings should match the e-mail pattern."""
    col = "some"
    # NOTE(review): the first entry looks redacted rather than a real
    # address -- confirm the intended literal against upstream.
    candidates = ["*****@*****.**", "someone@else"]
    frame = pd.DataFrame({col: candidates})

    metric = PatternMatch(col, hpatterns.EMAIL).calculate(frame)

    assert metric.value == Success(0.5)
def test_equality_of_success_should_be_based_on_the_equality_of_values(self):
    """Two ``Success`` objects compare equal exactly when their values do."""
    # Separate instances, so the comparison cannot pass on identity alone.
    first_one = Success(1)
    second_one = Success(1)
    self.assertEqual(first_one, second_one)

    third_one = Success(1)
    two = Success(2)
    self.assertNotEqual(third_one, two)
def test_truthness(self):
    """A ``Failure`` must be falsy and a ``Success`` must be truthy."""
    failure = Failure(Exception("e"))
    self.assertFalse(failure)

    success = Success(1)
    self.assertTrue(success)
def test__try_identity_if_try_or_raise(self):
    """``Try_._identity_if_try_or_raise`` rejects non-Try input, else returns it."""
    success = Success(1)
    failure = Failure(Exception("e"))

    # A plain int must be rejected with TypeError.
    self.assertRaises(TypeError, Try_._identity_if_try_or_raise, 1)

    # Success and Failure instances must come back equal to the input.
    for attempt in (success, failure):
        self.assertEqual(Try_._identity_if_try_or_raise(attempt), attempt)
def test_failed_on_success_should_throw_type_error_exception(self):
    """Calling ``failed`` on a ``Success`` must raise ``TypeError``."""
    # Look the attribute up outside the guarded block so that only the call
    # itself is expected to raise.
    failed = Success(1).failed
    with self.assertRaises(TypeError):
        failed()
def test_or_else_on_failure_should_return_else(self):
    """``orElse`` on a ``Failure`` must compare equal to the supplied alternative."""
    fallback = Success(1)
    outcome = Failure(Exception("e")).orElse(fallback)
    self.assertEqual(outcome, fallback)
def test_works_with_filtering(self, df_missing):
    """``Completeness`` of ``att1`` under the item filter should be 1.0."""
    analyzer = Completeness("att1", "item==1 or item==2")
    actual = analyzer.calculate(df_missing)
    expected = DoubleMetric(Entity.COLUMN, "Completeness", "att1", Success(1.0))
    assert actual == expected
def test_flatmap_on_success_should_return_value_depending_on_a_function(self):
    """``flatMap`` on a ``Success`` reflects whatever the callback returns."""
    # Callback returning a Success -> the outcome reports isSuccess.
    to_success = Success(1).flatMap(lambda x: Success(1))
    self.assertTrue(to_success.isSuccess)

    # Callback returning a Failure -> the outcome reports isFailure.
    to_failure = Success(1).flatMap(lambda x: Failure(Exception()))
    self.assertTrue(to_failure.isFailure)
def test_compute_correct_metric_with_filtering(self, df_with_numeric_values):
    """``Compliance`` for ``rule1`` with the ``att1 < 4`` argument should be 1.0."""
    analyzer = Compliance("rule1", "att2 == 0", "att1 < 4")
    actual = analyzer.calculate(df_with_numeric_values)
    expected = DoubleMetric(Entity.COLUMN, "Compliance", "rule1", Success(1.0))
    assert actual == expected
def test_failed_on_success_should_be_a_failure(self):
    """The value returned by ``Success.failed()`` must report ``isFailure``."""
    success = Success(1)
    self.assertTrue(success.failed().isFailure)
def test_computes_correct_metrics(self, data):
    """``Size`` should report ``len(data)`` as a dataset-level metric."""
    actual = Size().calculate(data)
    expected = DoubleMetric(Entity.DATASET, "Size", "*", Success(len(data)))
    assert actual == expected
def test_map_on_success_should_return_value_depending_on_a_function(self):
    """``map`` on a ``Success`` applies the callback to the wrapped value."""
    negated = Success(1).map(lambda x: -x)

    self.assertTrue(negated.isSuccess)
    self.assertEqual(negated.get(), -1)
def test_hashable(self):
    """Check the hashes of ``Success`` and ``Failure`` objects.

    Verifies that two ``Success(1)`` instances hash equally, that
    ``hash(Success(1))`` is ``1``, and that two ``Failure`` objects wrapping
    the same exception instance hash equally.
    """
    # assertEqual instead of assertTrue(a == b): same pass/fail condition,
    # but a failing run reports both operands.
    self.assertEqual(hash(Success(1)), hash(Success(1)))
    self.assertEqual(hash(Success(1)), 1)

    # Both Failure objects wrap the very same exception instance.
    error = Exception("e")
    self.assertEqual(hash(Failure(error)), hash(Failure(error)))
def test_filter_on_success_should_return_value_depending_on_a_predicate(self):
    """``filter`` keeps a ``Success`` only when the predicate holds."""
    def is_positive(x):
        return x > 0

    # Predicate holds for 1 -> the outcome reports isSuccess.
    kept = Success(1).filter(is_positive)
    self.assertTrue(kept.isSuccess)

    # Predicate fails for -1 -> the outcome reports isFailure.
    rejected = Success(-1).filter(is_positive)
    self.assertTrue(rejected.isFailure)
def test_flatmap_should_fail_if_f_doesnt_return_try(self):
    """``flatMap`` must raise ``TypeError`` when the callback returns a plain value."""
    # Bind the method outside the guarded block so that only the call itself
    # is expected to raise.
    flat_map = Success(1).flatMap
    with self.assertRaises(TypeError):
        flat_map(lambda x: x)
def test_get_or_else_with_success_should_return_this_value(self):
    """``getOrElse`` on a ``Success`` returns the wrapped value, not the default."""
    success = Success(1)
    value = success.getOrElse(lambda: -1)
    self.assertEqual(value, 1)