Exemple #1
0
 def test_text_output(self):
     """Check the plain-text form of the classification report.

     The input holds two MergedFilth entries (a PhoneFilth paired with a
     KnownFilth) and two stand-alone KnownFilth entries, all of type
     'phone'. The expected report shows precision 1.00 and recall 0.50
     over a support of 4.
     """
     filths = [
         MergedFilth(
             PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
             KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
         MergedFilth(
             PhoneFilth(beg=5, end=9, text='1234', detector_name='phone'),
             KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
     ]
     text = scrubadub.comparison.get_filth_classification_report(
         filths,
         output_dict=False,
     ).strip()

     # Both sides are stripped so that leading/trailing whitespace of the
     # rendered table does not influence the comparison.
     expected = (
         "                 precision    recall  f1-score   support\n"
         "\n"
         "phone     phone       1.00      0.50      0.67         4\n"
         "\n"
         "      micro avg       1.00      0.50      0.67         4\n"
         "      macro avg       1.00      0.50      0.67         4\n"
         "   weighted avg       1.00      0.50      0.67         4\n"
     ).strip()

     # assertEqual is used because the assertEquals alias is deprecated and
     # no longer exists in Python 3.12+.
     self.assertEqual(text, expected)
Exemple #2
0
    def test_with_irrelevant_filth(self):
        """Check the report when filths without a matching counterpart exist.

        Besides one PhoneFilth merged with a KnownFilth of type 'phone', the
        input contains a KnownFilth of type 'name' and a locally defined
        TempFilth. The expected report contains only the 'phone:phone'
        class with perfect scores.
        """
        class TempFilth(Filth):
            type = 'temp'

        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
                KnownFilth(beg=0, end=4, text='John',
                           comparison_type='phone'),
            ),
            # There is no detected filth of type 'name' and no KnownFilth of
            # type 'temp' in this list; the expected report below does not
            # mention either of these two entries.
            KnownFilth(beg=5, end=10, text='Hello', comparison_type='name'),
            TempFilth(beg=100, end=103, text='123', detector_name='temp'),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'phone:phone': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'micro avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'macro avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'weighted avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                }
            },
        )
Exemple #3
0
    def test_comparison(self):
        """Check the dict form of the report for a single filth type.

        The input holds two MergedFilth entries (a PhoneFilth paired with a
        KnownFilth) and two stand-alone KnownFilth entries, all of type
        'phone'. The expected report gives precision 1.0 and recall 0.5
        with a support of 4 for the class and for every average.
        """
        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
                KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
            ),
            KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
            MergedFilth(
                PhoneFilth(beg=5, end=9, text='1234', detector_name='phone'),
                KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
            ),
            KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'macro avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                },
                'micro avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                },
                'phone:phone': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                },
                'weighted avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                }
            },
        )
Exemple #4
0
    def test_false_positive(self):
        """Check the report when a detection has no known counterpart.

        The input holds one stand-alone PhoneFilth (no KnownFilth merged
        with it), one MergedFilth pairing a PhoneFilth with a KnownFilth,
        and two stand-alone KnownFilth entries. The expected report gives
        precision 0.5 and recall 1/3 with a support of 3 for
        'phone:phone_v1'.
        """
        filths = [
            PhoneFilth(beg=0, end=4, text='1234', detector_name='phone_v1'),
            KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
            MergedFilth(
                PhoneFilth(beg=5, end=9, text='1234',
                           detector_name='phone_v1'),
                KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
            ),
            KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'phone:phone_v1': {
                    'precision': 0.5,
                    'recall': 0.3333333333333333,
                    'f1-score': 0.4,
                    'support': 3
                },
                'micro avg': {
                    'precision': 0.5,
                    'recall': 0.3333333333333333,
                    'f1-score': 0.4,
                    'support': 3
                },
                'macro avg': {
                    'precision': 0.5,
                    'recall': 0.3333333333333333,
                    'f1-score': 0.4,
                    'support': 3
                },
                'weighted avg': {
                    'precision': 0.5,
                    'recall': 0.3333333333333333,
                    # NOTE(review): differs from the other f1-scores in the
                    # last digit; presumably floating-point rounding in the
                    # weighted average - confirm before "normalising" it.
                    'f1-score': 0.4000000000000001,
                    'support': 3
                }
            },
        )
Exemple #5
0
 def test_dataframe(self):
     """Check the per-filth dataframe built from a list of filths.

     The input holds an exactly matching MergedFilth, a MergedFilth whose
     PhoneFilth starts one character before its KnownFilth, and two
     stand-alone KnownFilth entries. The dataframe is expected to have
     four rows; missing values are replaced by the string 'none' before
     each column is compared.
     """
     filths = [
         MergedFilth(
             PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
             KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
         MergedFilth(
             PhoneFilth(beg=4, end=9, text=' 1234', detector_name='phone'),
             KnownFilth(beg=5, end=9, text='1234', comparison_type='phone'),
         ),
         KnownFilth(beg=15, end=20, text='12345', comparison_type='phone'),
     ]
     dataframe = scrubadub.comparison.get_filth_dataframe(filths)

     # assertEqual is used throughout because the assertEquals alias is
     # deprecated and no longer exists in Python 3.12+.
     self.assertEqual(dataframe.shape[0], 4)

     # Expected content of each column, in the order the original
     # assertions were made.
     expected_columns = (
         ('filth_type', ['phone', 'phone', 'none', 'none']),
         ('beg', [0, 4, 'none', 'none']),
         ('end', [4, 9, 'none', 'none']),
         ('known_beg', [0, 5, 5, 15]),
         ('known_end', [4, 9, 10, 20]),
         ('exact_match', [True, False, False, False]),
         ('partial_match', [True, True, False, False]),
         ('true_positive', [True, True, False, False]),
         ('false_positive', [False, False, False, False]),
         ('false_negative', [False, False, True, True]),
     )
     for column, expected in expected_columns:
         self.assertEqual(
             dataframe[column].fillna('none').values.tolist(),
             expected,
             msg="unexpected values in column '{}'".format(column),
         )
Exemple #6
0
    def test_other_predefined_types(self):
        """Check the report when a known filth has a type nothing detected.

        The input holds one PhoneFilth merged with a KnownFilth of type
        'phone' and one stand-alone KnownFilth of type 'word'. The expected
        report contains only the 'phone:phone' class with perfect scores.
        """
        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='John', detector_name='phone'),
                KnownFilth(beg=0, end=4, text='John',
                           comparison_type='phone'),
            ),
            KnownFilth(beg=5, end=10, text='Hello', comparison_type='word'),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'phone:phone': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'micro avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'macro avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                },
                'weighted avg': {
                    'precision': 1.0,
                    'recall': 1.0,
                    'f1-score': 1.0,
                    'support': 1
                }
            },
        )
Exemple #7
0
    def test_two_comparisons(self):
        """Check the report when two filth types are compared at once.

        For each of the types 'phone' and 'temp' the input holds one
        MergedFilth (detected filth paired with a KnownFilth) and one
        stand-alone KnownFilth. The expected report has one class per type
        ('phone:phone' and 'temp:temp') and additionally a 'samples avg'
        entry.
        """
        class TempFilth(Filth):
            type = 'temp'

        # NOTE(review): TempDetector is not referenced anywhere below. It is
        # kept in case defining a Detector subclass has side effects -
        # confirm before removing.
        class TempDetector(Detector):
            filth_cls = TempFilth

        filths = [
            MergedFilth(
                PhoneFilth(beg=0, end=4, text='1234', detector_name='phone'),
                KnownFilth(beg=0, end=4, text='1234', comparison_type='phone'),
            ),
            KnownFilth(beg=5, end=10, text='12345', comparison_type='phone'),
            MergedFilth(
                TempFilth(beg=5, end=9, text='1234', detector_name='temp'),
                KnownFilth(beg=5, end=9, text='1234', comparison_type='temp'),
            ),
            KnownFilth(beg=15, end=20, text='12345', comparison_type='temp'),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'phone:phone': {
                    'precision': 1.0,
                    'recall': 0.5,
                    'f1-score': 0.6666666666666666,
                    'support': 2
                },
                'temp:temp': {
                    'precision': 1.0,
                    'recall': 0.5,
                    'f1-score': 0.6666666666666666,
                    'support': 2
                },
                'micro avg': {
                    'precision': 1.0,
                    'recall': 0.5,
                    'f1-score': 0.6666666666666666,
                    'support': 4
                },
                'macro avg': {
                    'precision': 1.0,
                    'recall': 0.5,
                    'f1-score': 0.6666666666666666,
                    'support': 4
                },
                'weighted avg': {
                    'precision': 1.0,
                    'recall': 0.5,
                    'f1-score': 0.6666666666666666,
                    'support': 4
                },
                'samples avg': {
                    'precision': 0.5,
                    'recall': 0.5,
                    'f1-score': 0.5,
                    'support': 4
                }
            },
        )
Exemple #8
0
    def test_locales(self):
        """Check that the report is split by locale.

        For each of the locales 'en_GB' and 'en_US' the input holds one
        MergedFilth (a PhoneFilth paired with a KnownFilth) and one
        stand-alone KnownFilth. The expected report has one class per
        locale ('phone:phone:en_GB' and 'phone:phone:en_US'), each with a
        support of 2.
        """
        filths = [
            MergedFilth(
                PhoneFilth(
                    beg=0, end=4, text='1234',
                    detector_name='phone', locale='en_GB',
                ),
                KnownFilth(
                    beg=0, end=4, text='1234',
                    comparison_type='phone', locale='en_GB',
                ),
            ),
            KnownFilth(
                beg=5, end=10, text='12345',
                comparison_type='phone', locale='en_GB',
            ),
            MergedFilth(
                PhoneFilth(
                    beg=5, end=9, text='1234',
                    detector_name='phone', locale='en_US',
                ),
                KnownFilth(
                    beg=5, end=9, text='1234',
                    comparison_type='phone', locale='en_US',
                ),
            ),
            KnownFilth(
                beg=15, end=20, text='12345',
                comparison_type='phone', locale='en_US',
            ),
        ]

        report = scrubadub.comparison.get_filth_classification_report(
            filths,
            output_dict=True,
        )

        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(
            report,
            {
                'macro avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                },
                'micro avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                },
                'phone:phone:en_GB': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 2
                },
                'phone:phone:en_US': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 2
                },
                'samples avg': {
                    'f1-score': 0.5,
                    'precision': 0.5,
                    'recall': 0.5,
                    'support': 4
                },
                'weighted avg': {
                    'f1-score': 0.6666666666666666,
                    'precision': 1.0,
                    'recall': 0.5,
                    'support': 4
                }
            },
        )