def test_text_output(self):
    """Check the plain-text classification report for a simple phone case.

    Two detected phone filths are merged with matching known filths and two
    further known filths have no detection. The expected table reports a
    precision of 1.00, a recall of 0.50 and an f1-score of 0.67 with a
    support of 4.
    """
    filths = [
        MergedFilth(
            PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
            KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
        ),
        KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        MergedFilth(
            PhoneFilth(beg=5, end=9, text='1234', detector_name='phone'),
            KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
        ),
        KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
    ]

    text = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=False,
    ).strip()
    # Echo the rendered report to stdout.
    print(text)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        text,
        " precision recall f1-score support\n"
        "\n"
        "phone phone 1.00 0.50 0.67 4\n"
        "\n"
        " micro avg 1.00 0.50 0.67 4\n"
        " macro avg 1.00 0.50 0.67 4\n"
        " weighted avg 1.00 0.50 0.67 4\n".strip(),
    )
def test_filth_type_equality(self):
    """FilthTypePositions are equal only when they hold the same filths."""
    first = PhoneFilth(
        beg=0, end=4, text='1234', detector_name='phone_a',
        locale='en_GB', document_name='test.txt',
    )
    overlapping = PhoneFilth(
        beg=2, end=6, text='1234', detector_name='phone_b',
        locale='en_GB', document_name='test.txt',
    )
    distant = PhoneFilth(
        beg=10, end=14, text='1234', detector_name='phone_b',
        locale='en_GB', document_name='test.txt',
    )

    def build(*members):
        # Fresh container filled with the given filths, in the given order.
        container = FilthTypePositions(
            grouping_function=FilthGrouper.grouping_default,
            filth_type='phone',
        )
        for member in members:
            container.add_filth(member)
        return container

    reference = build(distant, first, overlapping)

    # Same filths added in the same order compare equal.
    same_content = build(distant, first, overlapping)
    self.assertTrue(reference == same_content)

    # Leaving one filth out makes the containers differ.
    one_missing = build(distant, first)
    self.assertTrue(reference != one_missing)
def test_comparison(self):
    """Dict report for two matched and two unmatched tagged phone filths."""
    matched_first = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone'),
    )
    matched_second = MergedFilth(
        PhoneFilth(beg=12, end=16, text='1234', detector_name='phone'),
        TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone'),
    )
    filths = [
        matched_first,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        matched_second,
        TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone'),
    ]

    # Every row of the expected report carries the same scores.
    scores = {
        'f1-score': 0.6666666666666666,
        'precision': 1.0,
        'recall': 0.5,
        'support': 4,
    }
    expected = {
        label: dict(scores)
        for label in ('macro avg', 'micro avg', 'phone:phone:None', 'weighted avg')
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_false_positive(self):
    """Dict report when one detected phone filth has no tagged counterpart."""
    unmatched_detection = PhoneFilth(beg=0, end=4, text='1234', detector_name='phone_v1')
    matched = MergedFilth(
        PhoneFilth(beg=12, end=16, text='1234', detector_name='phone_v1'),
        TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone'),
    )
    filths = [
        unmatched_detection,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        matched,
        TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone'),
    ]

    scores = {
        'precision': 0.5,
        'recall': 0.3333333333333333,
        'f1-score': 0.4,
        'support': 3,
    }
    expected = {
        'phone:phone_v1:None': dict(scores),
        'micro avg': dict(scores),
        'macro avg': dict(scores),
        # The weighted average f1-score differs in the last float digit.
        'weighted avg': {**scores, 'f1-score': 0.4000000000000001},
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_text_position_merge_ranges(self):
    """Merging TextPositions covering 0-4 and 10-14 raises ValueError."""
    shared = {'text': '1234', 'locale': 'en_GB', 'document_name': 'test.txt'}
    left = TextPosition(
        PhoneFilth(beg=0, end=4, detector_name='phone_a', **shared),
        FilthGrouper.grouping_default,
    )
    right = TextPosition(
        PhoneFilth(beg=10, end=14, detector_name='phone_b', **shared),
        FilthGrouper.grouping_default,
    )

    with self.assertRaises(ValueError):
        left.merge(right)
def test_filth_grouper(self):
    """Exercise FilthGrouper: grouping function choice, merging and counts."""
    gb_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone', locale='en_GB'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone', locale='en_GB'),
    )
    us_match = MergedFilth(
        PhoneFilth(beg=12, end=16, text='1234', detector_name='phone', locale='en_US'),
        TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone', locale='en_US'),
    )
    filths = [
        gb_match,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone', locale='en_GB'),
        us_match,
        TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone', locale='en_US'),
        TaggedEvaluationFilth(beg=30, end=35, text='12345', comparison_type='name', locale='en_US'),
    ]

    # The constructor flags select the grouping function.
    combined = FilthGrouper(combine_detectors=True, groupby_documents=False, filth_types=None)
    self.assertEqual(combined.grouping_function, FilthGrouper.grouping_combined)

    grouper = FilthGrouper(combine_detectors=False, groupby_documents=False, filth_types=None)
    self.assertEqual(grouper.grouping_function, FilthGrouper.grouping_default)

    grouper.add_filths(filths)
    print(grouper)

    # Position counts before merging.
    self.assertEqual(['phone', 'name'], list(grouper.types.keys()))
    self.assertEqual(1, len(grouper.types['name'].positions))
    self.assertEqual(6, len(grouper.types['phone'].positions))

    # Position counts after merging.
    grouper.merge_positions()
    self.assertEqual(1, len(grouper.types['name'].positions))
    self.assertEqual(4, len(grouper.types['phone'].positions))

    # The from_filth_list constructor yields the same filth types.
    from_list = FilthGrouper.from_filth_list(filths)
    self.assertEqual(list(grouper.types.keys()), list(from_list.types.keys()))

    counts = grouper.get_counts()
    print(counts)
    self.assertEqual(['filth', 'detector', 'locale'], counts.columns.names)

    # Expected columns, in order, with the expected values of each column.
    expected_columns = [
        (('name', 'tagged', 'en_US'), [0, 0, 0, 0, 1]),
        (('phone', 'phone', 'en_GB'), [1, 0, 0, 0, 0]),
        (('phone', 'phone', 'en_US'), [0, 0, 1, 0, 0]),
        (('phone', 'tagged', 'en_GB'), [1, 1, 0, 0, 0]),
        (('phone', 'tagged', 'en_US'), [0, 0, 1, 1, 0]),
    ]
    self.assertEqual(
        [column for column, _ in expected_columns],
        counts.columns.values.tolist(),
    )
    for column, values in expected_columns:
        self.assertEqual(values, counts[column].values.tolist())
def test_text_position_repr(self):
    """The repr of a TextPosition lists its range, groups and document."""
    phone = PhoneFilth(
        beg=0,
        end=4,
        text='1234',
        detector_name='phone_a',
        locale='en_GB',
        document_name='test.txt',
    )
    position = TextPosition(phone, FilthGrouper.grouping_default)

    expected = (
        "<TextPosition beg=0 end=4 tagged=set() detected={('phone', 'phone_a', 'en_GB')} document_name='test.txt'>"
    )
    self.assertEqual(expected, repr(position))
def test_text_position_function(self):
    """A custom grouping function determines the contents of ``detected``."""
    def fixed_grouping(x):
        # Ignores the filth and always returns the same mapping.
        return {1: 1, 2: 2, 3: 3}

    phone = PhoneFilth(
        beg=0,
        end=4,
        text='1234',
        detector_name='phone',
        locale='en_GB',
        document_name='test.txt',
    )
    position = TextPosition(phone, fixed_grouping)

    self.assertEqual({(1, 2, 3)}, position.detected)
def test_filth_type(self):
    """FilthTypePositions keeps insertion order, then merges overlaps."""
    shared = {'text': '1234', 'locale': 'en_GB', 'document_name': 'test.txt'}
    first = PhoneFilth(beg=0, end=4, detector_name='phone_a', **shared)
    overlapping = PhoneFilth(beg=2, end=6, detector_name='phone_b', **shared)
    distant = PhoneFilth(beg=10, end=14, detector_name='phone_b', **shared)

    holder = FilthTypePositions(
        grouping_function=FilthGrouper.grouping_default,
        filth_type='phone',
    )
    for member in (distant, first, overlapping):
        holder.add_filth(member)

    # Before merging, positions appear in the order they were added.
    self.assertEqual(3, len(holder.positions))
    self.assertEqual(10, holder.positions[0].beg)
    self.assertEqual(0, holder.positions[1].beg)
    self.assertEqual(2, holder.positions[2].beg)
    self.assertEqual(['filth', 'detector', 'locale'], holder.column_names)

    # After merging, the 0-4 and 2-6 filths form one position listed first.
    holder.merge_positions()
    self.assertEqual(2, len(holder.positions))
    self.assertEqual(10, holder.positions[1].beg)
    self.assertEqual(0, holder.positions[0].beg)
    self.assertEqual(6, holder.positions[0].end)

    key_a = ('phone', 'phone_a', 'en_GB')
    key_b = ('phone', 'phone_b', 'en_GB')
    self.assertEqual({key_a, key_b}, holder.positions[0].detected)

    counts = holder.get_counts()
    self.assertEqual(['filth', 'detector', 'locale'], counts.columns.names)
    self.assertEqual({key_b, key_a}, set(counts.columns.values.tolist()))
    self.assertEqual([1, 0], counts[key_a].values.tolist())
    self.assertEqual([1, 1], counts[key_b].values.tolist())
def test_filth_type_touching(self):
    """Filths at 0-4, 2-6 and 6-10 leave two positions after merging."""
    shared = {'text': '1234', 'locale': 'en_GB', 'document_name': 'test.txt'}
    first = PhoneFilth(beg=0, end=4, detector_name='phone_a', **shared)
    overlapping = PhoneFilth(beg=2, end=6, detector_name='phone_b', **shared)
    # Starts exactly where the overlapping filth ends.
    touching = PhoneFilth(beg=6, end=10, detector_name='phone_b', **shared)

    holder = FilthTypePositions(
        grouping_function=FilthGrouper.grouping_default,
        filth_type='phone',
    )
    for member in (touching, first, overlapping):
        holder.add_filth(member)

    holder.merge_positions()
    self.assertEqual(2, len(holder.positions))
def test_dataframe(self):
    """Check the columns of the dataframe built from merged and known filths.

    The input has one exact match, one partial match (detected 4-9 against
    known 5-9) and two known filths without a detection. The assertions pin
    the resulting four rows column by column.
    """
    # NOTE(review): another ``test_dataframe`` is defined later in this file;
    # if both live in the same class the later definition replaces this one,
    # so this test would never run -- confirm and rename if needed.
    filths = [
        MergedFilth(
            PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
            KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
        ),
        KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        MergedFilth(
            PhoneFilth(beg=4, end=9, text=' 1234', detector_name='phone'),
            KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
        ),
        KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
    ]

    dataframe = scrubadub.comparison.get_filth_dataframe(filths, )

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(dataframe.shape[0], 4)

    # Missing values are replaced with 'none' so they can be compared.
    self.assertEqual(
        dataframe['filth_type'].fillna('none').values.tolist(),
        ['phone', 'phone', 'none', 'none'])
    self.assertEqual(
        dataframe['beg'].fillna('none').values.tolist(),
        [0, 4, 'none', 'none'])
    self.assertEqual(
        dataframe['end'].fillna('none').values.tolist(),
        [4, 9, 'none', 'none'])
    self.assertEqual(
        dataframe['known_beg'].fillna('none').values.tolist(),
        [0, 5, 5, 15])
    self.assertEqual(
        dataframe['known_end'].fillna('none').values.tolist(),
        [4, 9, 10, 20])

    self.assertEqual(
        dataframe['exact_match'].fillna('none').values.tolist(),
        [True, False, False, False])
    self.assertEqual(
        dataframe['partial_match'].fillna('none').values.tolist(),
        [True, True, False, False])
    self.assertEqual(
        dataframe['true_positive'].fillna('none').values.tolist(),
        [True, True, False, False])
    self.assertEqual(
        dataframe['false_positive'].fillna('none').values.tolist(),
        [False, False, False, False])
    self.assertEqual(
        dataframe['false_negative'].fillna('none').values.tolist(),
        [False, False, True, True])
def test_text_equality(self):
    """TextPositions differ when locale, detector, range or document differ."""
    base = {
        'beg': 0,
        'end': 4,
        'text': '1234',
        'detector_name': 'phone',
        'locale': 'en_GB',
        'document_name': 'test.txt',
    }

    # Two positions built from the very same filth are equal.
    original = PhoneFilth(**base)
    reference = TextPosition(original, FilthGrouper.grouping_default)
    twin = TextPosition(original, FilthGrouper.grouping_default)
    self.assertTrue(reference == twin)

    # Each variation changes one aspect of the filth and must break equality.
    variations = (
        {'locale': 'fr_FR'},
        {'detector_name': 'phone2'},
        {'end': 5, 'text': '12345'},
        {'document_name': 'test2.txt'},
    )
    for changes in variations:
        altered = PhoneFilth(**{**base, **changes})
        other = TextPosition(altered, FilthGrouper.grouping_default)
        self.assertTrue(reference != other)
def test_text_position(self):
    """A TextPosition mirrors the range, grouping and document of its filth."""
    phone = PhoneFilth(
        beg=0,
        end=4,
        text='1234',
        detector_name='phone',
        locale='en_GB',
        document_name='test.txt',
    )
    position = TextPosition(phone, FilthGrouper.grouping_default)

    self.assertEqual(phone.beg, position.beg)
    self.assertEqual(phone.end, position.end)

    # A detected filth lands in ``detected`` and leaves ``tagged`` empty.
    expected_detected = {('phone', 'phone', 'en_GB')}
    self.assertEqual(expected_detected, position.detected)
    self.assertEqual(set(), position.tagged)

    self.assertEqual(phone.document_name, position.document_name)
def test_with_irrelevant_filth(self):
    """Report is unaffected by an unmatched name tag and a custom filth."""
    class TempFilth(Filth):
        type = 'temp'

    phone_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
        TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
    )
    filths = [
        phone_match,
        TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='name'),
        TempFilth(beg=100, end=103, text='123', detector_name='temp'),
    ]

    # Only the phone row and the averages are expected, all with full scores.
    perfect = {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1}
    expected = {
        label: dict(perfect)
        for label in ('phone:phone:None', 'micro avg', 'macro avg', 'weighted avg')
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_text_position_merge(self):
    """Merging overlapping TextPositions widens the range and unions groups."""
    shared = {'text': '1234', 'locale': 'en_GB', 'document_name': 'test.txt'}
    target = TextPosition(
        PhoneFilth(beg=0, end=4, detector_name='phone_a', **shared),
        FilthGrouper.grouping_default,
    )
    incoming = TextPosition(
        PhoneFilth(beg=3, end=6, detector_name='phone_b', **shared),
        FilthGrouper.grouping_default,
    )

    # The merge is applied in place to the target position.
    target.merge(incoming)

    self.assertEqual(0, target.beg)
    self.assertEqual(6, target.end)

    both_detectors = {
        ('phone', 'phone_a', 'en_GB'),
        ('phone', 'phone_b', 'en_GB'),
    }
    self.assertEqual(both_detectors, target.detected)
    self.assertEqual(set(), target.tagged)
def test_other_predefined_types(self):
    """Report is unaffected by a tagged filth of comparison type 'word'."""
    phone_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
        TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
    )
    filths = [
        phone_match,
        TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='word'),
    ]

    # Only the phone row and the averages are expected, all with full scores.
    perfect = {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1}
    expected = {
        label: dict(perfect)
        for label in ('phone:phone:None', 'micro avg', 'macro avg', 'weighted avg')
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_filth_grouper_equality(self):
    """FilthGrouper equality, plus counts grouped by document and locale."""
    gb_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone', locale='en_GB', document_name='gb.txt'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone', locale='en_GB', document_name='gb.txt'),
    )
    us_match = MergedFilth(
        PhoneFilth(beg=12, end=16, text='1234', detector_name='phone', locale='en_US', document_name='us.txt'),
        TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone', locale='en_US', document_name='us.txt'),
    )
    filths = [
        gb_match,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone', locale='en_GB', document_name='gb.txt'),
        us_match,
        TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone', locale='en_US', document_name='us.txt'),
        TaggedEvaluationFilth(beg=30, end=35, text='12345', comparison_type='name', locale='en_US', document_name='us.txt'),
    ]

    def combined_grouper(members):
        # Phone-only grouper that combines detectors and groups by document.
        built = FilthGrouper(combine_detectors=True, groupby_documents=True, filth_types=['phone'])
        built.add_filths(members)
        return built

    grouper = combined_grouper(filths)

    # Same configuration and same filths compare equal.
    other = combined_grouper(filths)
    self.assertTrue(grouper == other)

    # Same configuration but the first filth is left out.
    other = combined_grouper(filths[1:])
    self.assertTrue(grouper != other)

    # Different grouping function, first filth left out.
    other = FilthGrouper(grouping_function=FilthGrouper.grouping_default, filth_types=['phone'])
    other.add_filths(filths[1:])
    self.assertTrue(grouper != other)

    # Only the requested filth type is kept; merging reduces the positions.
    self.assertEqual(['phone'], list(grouper.types.keys()))
    self.assertEqual(6, len(grouper.types['phone'].positions))
    grouper.merge_positions()
    self.assertEqual(4, len(grouper.types['phone'].positions))

    from_list = FilthGrouper.from_filth_list(
        filths,
        filth_types=['phone'],
        combine_detectors=True,
        groupby_documents=True,
    )
    self.assertEqual(list(grouper.types.keys()), list(from_list.types.keys()))

    counts = grouper.get_counts()
    self.assertEqual(['filth', 'document_name', 'detector', 'locale'], counts.columns.names)

    # Expected columns, in order, with the expected values of each column.
    expected_columns = [
        (('phone', 'gb.txt', 'combined', 'en_GB'), [1, 0, 0, 0]),
        (('phone', 'gb.txt', 'tagged', 'en_GB'), [1, 1, 0, 0]),
        (('phone', 'us.txt', 'combined', 'en_US'), [0, 0, 1, 0]),
        (('phone', 'us.txt', 'tagged', 'en_US'), [0, 0, 1, 1]),
    ]
    self.assertEqual(
        [column for column, _ in expected_columns],
        counts.columns.values.tolist(),
    )
    for column, values in expected_columns:
        self.assertEqual(values, counts[column].values.tolist())
def test_grouper(self):
    """Inspect FilthGrouper positions before and after merge_positions."""
    filths = [
        MergedFilth(
            PhoneFilth(beg=0, end=4, text='John', detector_name='phone_det'),
            TaggedEvaluationFilth(beg=0, end=4, text='John', comparison_type='phone'),
        ),
        TaggedEvaluationFilth(beg=5, end=10, text='Hello', comparison_type='name'),
        AddressFilth(beg=100, end=103, text='123', detector_name='address_det'),
    ]
    grouper = scrubadub.comparison.FilthGrouper()
    grouper.add_filths(filths)

    phone_detected = {('phone', 'phone_det', 'None')}
    phone_tagged = {('phone', 'tagged', 'None')}

    def check_name_and_address():
        # These expectations are identical before and after merging.
        self.assertEqual(1, len(grouper.types['name'].positions))
        name_position = grouper.types['name'].positions[0]
        self.assertEqual(5, name_position.beg)
        self.assertEqual(10, name_position.end)
        self.assertEqual({('name', 'tagged', 'None')}, name_position.tagged)
        self.assertEqual(set(), name_position.detected)

        self.assertEqual(1, len(grouper.types['address'].positions))
        address_position = grouper.types['address'].positions[0]
        self.assertEqual(100, address_position.beg)
        self.assertEqual(103, address_position.end)
        self.assertEqual(set(), address_position.tagged)
        self.assertEqual({('address', 'address_det', 'None')}, address_position.detected)

    # Before merging, the merged phone filth gives two separate positions.
    self.assertEqual(3, len(grouper.types))
    self.assertEqual(2, len(grouper.types['phone'].positions))
    detected_position, tagged_position = grouper.types['phone'].positions
    self.assertEqual(0, detected_position.beg)
    self.assertEqual(4, detected_position.end)
    self.assertEqual(phone_detected, detected_position.detected)
    self.assertEqual(set(), detected_position.tagged)
    self.assertEqual(set(), tagged_position.detected)
    self.assertEqual(phone_tagged, tagged_position.tagged)
    check_name_and_address()

    grouper.merge_positions()

    # After merging, one phone position carries both sets.
    self.assertEqual(3, len(grouper.types))
    self.assertEqual(1, len(grouper.types['phone'].positions))
    merged_position = grouper.types['phone'].positions[0]
    self.assertEqual(0, merged_position.beg)
    self.assertEqual(4, merged_position.end)
    self.assertEqual(phone_detected, merged_position.detected)
    self.assertEqual(phone_tagged, merged_position.tagged)
    check_name_and_address()
def test_two_comparisons(self):
    """Dict report with two filth types, each matched once and missed once."""
    class TempFilth(Filth):
        type = 'temp'

    # TempDetector is not referenced again in this test.
    class TempDetector(Detector):
        filth_cls = TempFilth

    phone_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone'),
    )
    temp_match = MergedFilth(
        TempFilth(beg=5, end=9, text='1234', detector_name='temp'),
        TaggedEvaluationFilth(beg=5, end=9, text='1234', comparison_type='temp'),
    )
    filths = [
        phone_match,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        temp_match,
        TaggedEvaluationFilth(beg=15, end=20, text='12345', comparison_type='temp'),
    ]

    scores = {'precision': 1.0, 'recall': 0.5, 'f1-score': 0.6666666666666666}
    expected = {
        # Rows for a single filth type have a support of 2.
        'phone:phone:None': {**scores, 'support': 2},
        'temp:temp:None': {**scores, 'support': 2},
        # Averages cover all four tagged filths.
        'micro avg': {**scores, 'support': 4},
        'macro avg': {**scores, 'support': 4},
        'weighted avg': {**scores, 'support': 4},
        'samples avg': {'precision': 0.5, 'recall': 0.5, 'f1-score': 0.5, 'support': 4},
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_overall(self):
    """Dict report with combine_detectors=True yields one row per locale."""
    gb_match = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone1', locale='en_GB'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone', locale='en_GB'),
    )
    us_match = MergedFilth(
        PhoneFilth(beg=12, end=16, text='1234', detector_name='phone2', locale='en_US'),
        TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone', locale='en_US'),
    )
    filths = [
        gb_match,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone', locale='en_GB'),
        us_match,
        TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone', locale='en_US'),
    ]

    scores = {'f1-score': 0.6666666666666666, 'precision': 1.0, 'recall': 0.5}
    expected = {
        'macro avg': {**scores, 'support': 4},
        'micro avg': {**scores, 'support': 4},
        # The detectors phone1 and phone2 are reported as 'combined'.
        'phone:combined:en_GB': {**scores, 'support': 2},
        'phone:combined:en_US': {**scores, 'support': 2},
        'samples avg': {'f1-score': 0.5, 'precision': 0.5, 'recall': 0.5, 'support': 4},
        'weighted avg': {**scores, 'support': 4},
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        combine_detectors=True,
        output_dict=True,
    )
    self.assertEqual(expected, report)
def test_groupby_document(self):
    """Dict report with groupby_documents=True yields one row per document."""
    def document_filths(document):
        # The same four entries are used for every document.
        return [
            PhoneFilth(beg=0, end=4, text='1234', detector_name='phone_v1', document_name=document),
            TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone', document_name=document),
            MergedFilth(
                PhoneFilth(beg=12, end=16, text='1234', detector_name='phone_v1', document_name=document),
                TaggedEvaluationFilth(beg=12, end=16, text='1234', comparison_type='phone', document_name=document),
            ),
            TaggedEvaluationFilth(beg=20, end=25, text='12345', comparison_type='phone', document_name=document),
        ]

    filths = document_filths('1.txt') + document_filths('2.txt')

    scores = {'precision': 0.5, 'recall': 0.3333333333333333, 'f1-score': 0.4}
    expected = {
        'phone:1.txt:phone_v1:None': {**scores, 'support': 3},
        'phone:2.txt:phone_v1:None': {**scores, 'support': 3},
        'micro avg': {**scores, 'support': 6},
        'macro avg': {**scores, 'support': 6},
        'samples avg': {'f1-score': 0.25, 'precision': 0.25, 'recall': 0.25, 'support': 6},
        # The weighted average f1-score differs in the last float digit.
        'weighted avg': {**scores, 'f1-score': 0.4000000000000001, 'support': 6},
    }

    report = scrubadub.comparison.get_filth_classification_report(
        filths,
        output_dict=True,
        groupby_documents=True,
    )
    self.assertEqual(expected, report)
def test_dataframe(self):
    """Check get_filth_dataframe on empty input and on a mixed filth list."""
    # An empty list of filths must not raise.
    scrubadub.comparison.get_filth_dataframe([])

    # One exact match, one partial match (detected 4-9 against tagged 5-9)
    # and two tagged filths without a detection.
    exact = MergedFilth(
        PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
        TaggedEvaluationFilth(beg=0, end=4, text='1234', comparison_type='phone'),
    )
    partial = MergedFilth(
        PhoneFilth(beg=4, end=9, text=' 1234', detector_name='phone'),
        TaggedEvaluationFilth(beg=5, end=9, text='1234', comparison_type='phone'),
    )
    filths = [
        exact,
        TaggedEvaluationFilth(beg=5, end=10, text='12345', comparison_type='phone'),
        partial,
        TaggedEvaluationFilth(beg=15, end=20, text='12345', comparison_type='phone'),
    ]

    dataframe = scrubadub.comparison.get_filth_dataframe(filths, )
    self.assertEqual(4, dataframe.shape[0])

    # Column name paired with its expected values; missing entries are
    # replaced with 'none' before comparing.
    expected_by_column = [
        ('filth_type', ['phone', 'phone', 'none', 'none']),
        ('beg', [0, 4, 'none', 'none']),
        ('end', [4, 9, 'none', 'none']),
        ('known_beg', [0, 5, 5, 15]),
        ('known_end', [4, 9, 10, 20]),
        ('exact_match', [True, False, False, False]),
        ('partial_match', [True, True, False, False]),
        ('true_positive', [True, True, False, False]),
        ('false_positive', [False, False, False, False]),
        ('false_negative', [False, False, True, True]),
    ]
    for column, expected in expected_by_column:
        actual = dataframe[column].fillna('none').values.tolist()
        self.assertEqual(expected, actual)