Esempio n. 1
0
 def test_list_filth_documents_list(self):
     """iter_filth_documents on a list yields the same filth with and without post-processing."""
     type_replacer = scrubadub.post_processors.FilthTypeReplacer()
     scrubber = scrubadub.Scrubber(post_processor_list=[type_replacer])
     docs = [
         "This is a test message for [email protected]",
         "Hello @Jane call me on +33 4 41 26 62 36.",
     ]
     # Documents passed as a list are named by their position ('0', '1').
     expected = [
         scrubadub.filth.EmailFilth(
             text='*****@*****.**', document_name='0',
             detector_name='email', beg=27, end=46, locale='en_US',
         ),
         scrubadub.filth.TwitterFilth(
             text='@Jane', document_name='1',
             detector_name='twitter', beg=6, end=11, locale='en_US',
         ),
         scrubadub.filth.PhoneFilth(
             text='+33 4 41 26 62 36', document_name='1',
             detector_name='phone', beg=23, end=40, locale='en_US',
         ),
     ]
     # Collect both result lists first, then compare each to the expectation.
     results = [
         list(scrubber.iter_filth_documents(docs, run_post_processors=flag))
         for flag in (True, False)
     ]
     for found in results:
         self.assertEqual(found, expected)
Esempio n. 2
0
 def test_add_non_detector(self):
     """Adding a class that is not a Detector must raise TypeError."""
     class NotDetector(object):
         pass

     scrubber = scrubadub.Scrubber()
     self.assertRaises(TypeError, scrubber.add_detector, NotDetector)
Esempio n. 3
0
    def __init__(self, uids_for_initials=True, initials_placeholder='someone'):
        """
        Build a scrubber that uses CustomNameDetector and InitialsDetector
        in place of the built-in "name" detector.

        Params:
            uids_for_initials: Whether or not unique identifiers should be used to replace initials. If `False`, `initials_placeholder` will be used instead.
            initials_placeholder: Placeholder assigned to the filth classes when `uids_for_initials` is `False`.
        """
        super().__init__()
        self._uids_for_initials = uids_for_initials

        self.scrubber = scrubadub.Scrubber()
        # The built-in name detector is swapped for one that doesn't delete keywords.
        self.scrubber.remove_detector("name")

        replacement_detectors = (CustomNameDetector, InitialsDetector)

        if not uids_for_initials:
            # NOTE: the attributes below are set on the filth *classes*, so the
            # change is visible to every user of those classes, not only this
            # scrubber.
            # noinspection PyProtectedMember
            remaining_detectors = list(self.scrubber._detectors.values())
            for det in list(replacement_detectors) + remaining_detectors:
                det.filth_cls.placeholder = initials_placeholder
                # No '{{' / '}}' wrapping around the placeholder
                det.filth_cls.prefix = ''
                det.filth_cls.suffix = ''
                # Merging is disabled because it doesn't respect the placeholder
                det.filth_cls.merge = lambda _self, other: _self

        for det in replacement_detectors:
            self.scrubber.add_detector(det)
Esempio n. 4
0
    def test_remove_post_processor(self):
        """Post-processors can be removed by name, by class and by instance."""
        post_processor = scrubadub.post_processors.FilthTypeReplacer(
            name='typeinator')
        scrubber = scrubadub.Scrubber(post_processor_list=[post_processor])
        scrubber.add_post_processor(scrubadub.post_processors.HashReplacer)
        scrubber.add_post_processor('filth_type_replacer')

        self.assertEqual(len(scrubber._post_processors), 3)
        self.assertEqual(
            [x.name for x in scrubber._post_processors],
            ['typeinator', 'hash_replacer', 'filth_type_replacer'])

        # (what to remove, names expected to remain afterwards)
        removal_steps = [
            ('filth_type_replacer', ['typeinator', 'hash_replacer']),
            (scrubadub.post_processors.HashReplacer, ['typeinator']),
            (post_processor, []),
        ]
        for target, remaining in removal_steps:
            scrubber.remove_post_processor(target)
            self.assertEqual(len(scrubber._post_processors), len(remaining))
            self.assertEqual([x.name for x in scrubber._post_processors],
                             remaining)
Esempio n. 5
0
    def test_add_duplicate_post_processor(self):
        """Adding the same post-processor class twice must raise KeyError."""
        hash_replacer_cls = scrubadub.post_processors.HashReplacer
        scrubber = scrubadub.Scrubber()
        scrubber.add_post_processor(hash_replacer_cls)

        self.assertRaises(
            KeyError, scrubber.add_post_processor, hash_replacer_cls)
Esempio n. 6
0
 def test_filth_merge(self):
     """Overlapping detections in one text come back as a single filth."""
     # 'me at john.doe' looks like an email address, and the text also
     # mentions skype.
     scrubber = scrubadub.Scrubber()
     found = list(scrubber.iter_filth("you can skype me at john.doe"))
     self.assertEqual(len(found), 1)
Esempio n. 7
0
def generate_and_scrub(locale, filth_list, detectors, n_docs: int = 50):
    """Generate fake documents containing known filth and scrub them.

    Args:
        locale: Locale passed to the document generator, to the
            KnownFilthDetector and to the Scrubber.
        filth_list: Filth type names to generate; two paragraphs are
            requested per filth type.
        detectors: Detectors to run. They are joined with ', ' for the
            progress message, so they are expected to be strings. The
            caller's list is not modified.
        n_docs: Number of documents to generate.

    Returns:
        The list of filth yielded by ``Scrubber.iter_filth_documents`` over
        the generated documents.
    """
    documents = []
    known_pii = []

    click.echo("Generating {} docs with filth: {}".format(
        locale, ", ".join(filth_list)))
    start_time = time.time()

    for _ in range(n_docs):
        new_doc, new_known_pii = make_fake_document(
            paragraphs=2 * len(filth_list),
            locale=locale,
            seed=None,
            filth_types=filth_list)
        documents.append(new_doc)
        known_pii += new_known_pii

    scrubber_time = time.time()
    click.echo("Scrubbing with detectors: {}".format(', '.join(detectors)))

    # Build a fresh list instead of appending to `detectors`: appending would
    # leak the KnownFilthDetector into the caller's list, and into every later
    # call that reuses the same list.
    detector_list = list(detectors) + [
        scrubadub.detectors.KnownFilthDetector(locale=locale,
                                               known_filth_items=known_pii)
    ]
    scrubber = scrubadub.Scrubber(locale=locale, detector_list=detector_list)
    found_filth = list(scrubber.iter_filth_documents(documents))

    end_time = time.time()
    click.echo("Documents generated in {:.2f}s".format(scrubber_time -
                                                       start_time))
    click.echo("Scrubbed documents in  {:.2f}s".format(end_time -
                                                       scrubber_time))

    return found_filth
 def test_not_implemented_locale(self):
     """Adding PostalCodeDetector to an 'fr_FR' scrubber raises a UserWarning."""
     french_scrubber = scrubadub.Scrubber(locale='fr_FR')
     with warnings.catch_warnings():
         # Turn warnings into exceptions so assertRaises can observe them.
         warnings.simplefilter("error")
         self.assertRaises(
             UserWarning,
             french_scrubber.add_detector,
             scrubadub.detectors.PostalCodeDetector,
         )
Esempio n. 9
0
 def test_list_filth_documents_list(self):
     """iter_filth_documents on a list yields the same filth with and without post-processing."""
     type_replacer = scrubadub.post_processors.FilthTypeReplacer()
     scrubber = scrubadub.Scrubber(post_processor_list=[type_replacer])
     docs = [
         "This is a test message for [email protected]",
         "Hello Jane, I am Tom.",
     ]
     # Documents passed as a list are named by their position ('0', '1').
     expected = [
         scrubadub.filth.EmailFilth(
             text='*****@*****.**', document_name='0',
             detector_name='email', beg=27, end=46,
         ),
         scrubadub.filth.NameFilth(
             text='Jane', document_name='1',
             detector_name='name', beg=6, end=10,
         ),
         scrubadub.filth.NameFilth(
             text='Tom', document_name='1',
             detector_name='name', beg=17, end=20,
         ),
     ]
     # Collect both result lists first, then compare each to the expectation.
     results = [
         list(scrubber.iter_filth_documents(docs, run_post_processors=flag))
         for flag in (True, False)
     ]
     for found in results:
         self.assertEqual(found, expected)
Esempio n. 10
0
 def test_filth_merge_placeholder(self):
     """Merged filth carries the placeholders of every filth it was built from."""
     text = "you can skype me at john.doe"
     scrubber = scrubadub.Scrubber()
     filths = list(scrubber.iter_filth(text))
     # Without this check the loop below would pass vacuously if the
     # scrubber detected nothing at all.
     self.assertTrue(filths, "no filth was detected in the text")
     for filth in filths:
         self.assertIsInstance(filth, MergedFilth)
         # assertIn reports both operands on failure, so no custom message
         # is needed.
         self.assertIn('SKYPE', filth.placeholder)
         self.assertIn('EMAIL', filth.placeholder)
Esempio n. 11
0
def clean_if(x):
    """Scrub ``x`` when it is a string; return any other value unchanged.

    Strings are cleaned with a default ``scrubadub.Scrubber`` from which the
    'name' and 'url' detectors have been removed.

    Args:
        x: Any value. Only ``str`` instances are scrubbed.

    Returns:
        The cleaned string for ``str`` input, otherwise ``x`` itself.
    """
    # Guard first: building and configuring a Scrubber is wasted work for
    # values that are passed through untouched.
    if not isinstance(x, str):
        return x

    scrubber = scrubadub.Scrubber()
    scrubber.remove_detector('name')
    scrubber.remove_detector('url')
    return scrubber.clean(x)
Esempio n. 12
0
 def test_scrubber_clean(self):
     """The module-level clean() replaces names and emails with placeholders."""
     # NOTE(review): this local scrubber is configured but never used for the
     # cleaning below. Presumably it checks that removing a detector from one
     # Scrubber does not affect scrubadub.clean(); confirm the intent.
     local_scrubber = scrubadub.Scrubber()
     local_scrubber.remove_detector('email')
     expected = "contact {{NAME}} {{NAME}} at {{EMAIL}}"
     cleaned = scrubadub.clean("contact Joe Duffy at [email protected]")
     self.assertEqual(cleaned, expected)
Esempio n. 13
0
def main():
    """Scrub 50 generated documents and print timings and a classification report.

    Each document is produced by ``make_fake_document`` seeded with its index.
    The documents are scrubbed with a default ``Scrubber`` plus a
    ``KnownFilthDetector`` holding the generated PII. Generation and scrubbing
    times are printed, followed by ``get_filth_classification_report``. The
    process then exits with status 0.
    """
    # The filth types are identical for every document, so the list is built
    # once rather than on each loop iteration.
    filth_types = [
        'email', 'name', 'phone', 'postalcode', 'ssn', 'twitter', 'url'
    ]
    general_docs = []
    # address_docs = []
    # uk_phone_docs = []
    known_general_pii = []
    # known_address_pii = []
    # known_uk_phone_pii = []
    start_time = time.time()
    for i_doc in range(50):
        new_doc, new_known_pii = make_fake_document(paragraphs=2 *
                                                    len(filth_types),
                                                    seed=i_doc,
                                                    filth_types=filth_types)
        general_docs.append(new_doc)
        known_general_pii += new_known_pii

        # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
        # address_docs.append(new_doc)
        # known_address_pii += new_known_pii

        # new_doc, new_known_pii = make_fake_document(
        #     faker=faker.Faker(locale='en_gb'), paragraphs=2, seed=i_doc, filth_types=['phone']
        # )
        # uk_phone_docs.append(new_doc)
        # known_uk_phone_pii += new_known_pii

    scrubber_time = time.time()
    scrubber = scrubadub.Scrubber()
    # scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector())
    scrubber.add_detector(
        scrubadub.detectors.KnownFilthDetector(
            known_filth_items=known_general_pii))
    filth_list = list(scrubber.iter_filth_documents(general_docs))

    # scrubber = scrubadub.Scrubber(detector_list=[
    #     scrubadub.detectors.address.GBAddressDetector(),
    #     scrubadub.detectors.address.USAddressDetector(),
    #     scrubadub.detectors.KnownFilthDetector(predefined_pii=known_address_pii),
    # ])
    # filth_list += list(scrubber.iter_filth_documents(address_docs))

    # scrubber = scrubadub.Scrubber(detector_list=[
    #     scrubadub.detectors.PhoneDetector(region='gb', name='gb_phone'),
    #     scrubadub.detectors.KnownFilthDetector(predefined_pii=known_uk_phone_pii),
    # ])
    # filth_list += list(scrubber.iter_filth_documents(uk_phone_docs))

    end_time = time.time()
    print("Documents generated in {:.2f}s".format(scrubber_time - start_time))
    print("Scrubbed documents in  {:.2f}s".format(end_time - scrubber_time))
    print(get_filth_classification_report(filth_list))

    sys.exit(0)
 def test_disable_name(self):
     """
     BEFORE: My nino is AB 123456 C
     AFTER:  My nino is {{NINO}}
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     nameless_scrubber = scrubadub.Scrubber()
     nameless_scrubber.remove_detector('name')
     cleaned = nameless_scrubber.clean(before)
     self.check_equal(after, cleaned)
Esempio n. 15
0
 def test_filth_ordering(self):
     """Filth begin/end positions are yielded in ascending order."""
     scrubber = scrubadub.Scrubber()
     text = ("Alan can be reached by email [email protected] or "
             "phone +1.312.456.6421")
     # Flatten to beg, end, beg, end, ... in the order the filth is yielded.
     positions = [
         position
         for filth in scrubber.iter_filth(text)
         for position in (filth.beg, filth.end)
     ]
     self.assertEqual(sorted(positions), positions)
Esempio n. 16
0
 def test_disable_passport(self):
     """
     BEFORE: My credit card is 4111111111111111.
     AFTER:  My credit card is {{CREDITCARD}}.
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     passportless_scrubber = scrubadub.Scrubber()
     passportless_scrubber.remove_detector('passport')
     cleaned = passportless_scrubber.clean(before)
     self.check_equal(after, cleaned)
 def test_vehicle_disable_name(self):
     """
     BEFORE: My number plate is AB12 ABC.
     AFTER:  My number plate is {{VEHICLE}}.
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     nameless_scrubber = scrubadub.Scrubber()
     nameless_scrubber.remove_detector('name')
     cleaned = nameless_scrubber.clean(before)
     self.check_equal(after, cleaned)
Esempio n. 18
0
    def test_list_document(self):
        """clean_documents leaves clean text alone and scrubs an email in a list."""
        already_clean = [
            'the apple was eaten by a shark',
            'the apple was not eaten by the fish',
        ]
        first_scrubber = scrubadub.Scrubber()
        self.assertEqual(
            first_scrubber.clean_documents(already_clean), already_clean)

        # Only the first document holds filth; the second must come back
        # unchanged.
        dirty_docs = [
            'shark sent [email protected] a complaint',
            'the fish swam on by',
        ]
        expected_docs = [
            'shark sent {{EMAIL}} a complaint',
            'the fish swam on by',
        ]
        second_scrubber = scrubadub.Scrubber()
        self.assertEqual(
            second_scrubber.clean_documents(dirty_docs), expected_docs)
Esempio n. 19
0
 def test_disable_email(self):
     """
     BEFORE: contact me at [email protected]
     AFTER:  contact me at [email protected]
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     emailless_scrubber = scrubadub.Scrubber()
     emailless_scrubber.remove_detector('email')
     cleaned = emailless_scrubber.clean(before)
     self.check_equal(after, cleaned)
 def _get_scrubber():
     """Return a default scrubber minus the email, name and phone detectors, plus TINDetector."""
     scrubber = scrubadub.Scrubber()
     # Names and emails are not examined for now; these detectors are removed
     # to prevent false positives. The phone detector is removed as well.
     for unwanted in ('email', 'name', 'phone'):
         scrubber.remove_detector(unwanted)
     scrubber.add_detector(TINDetector)
     return scrubber
Esempio n. 21
0
 def test_disable_name(self):
     """
     BEFORE: My passport number is 5333800068GBR8812049F2509286.
     AFTER:  My passport number is {{PASSPORT}}.
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     ninoless_scrubber = scrubadub.Scrubber()
     ninoless_scrubber.remove_detector('nino')
     cleaned = ninoless_scrubber.clean(before)
     self.check_equal(after, cleaned)
Esempio n. 22
0
    def test_add_non_post_processor(self):
        """Adding a non-post-processor class or an unknown name must fail."""
        class NotPostProcessor(object):
            pass

        scrubber = scrubadub.Scrubber()
        # A class of the wrong type is rejected with TypeError ...
        self.assertRaises(
            TypeError, scrubber.add_post_processor, NotPostProcessor)
        # ... and an unknown name with ValueError.
        self.assertRaises(
            ValueError,
            scrubber.add_post_processor,
            'not_really_the_name_of_a_detector',
        )
Esempio n. 23
0
 def test_add_post_processor_instance(self):
     """Post-processor instances added to a scrubber are applied by clean().

     A HashReplacer followed by a PrefixSuffixReplacer must produce the
     hashed email wrapped in '<<' and '>>'.
     """
     scrubber = scrubadub.Scrubber()
     scrubber.add_post_processor(
         scrubadub.post_processors.HashReplacer(salt='example_salt',
                                                include_filth_type=False))
     scrubber.add_post_processor(
         scrubadub.post_processors.PrefixSuffixReplacer(prefix='<<',
                                                        suffix='>>'))
     # The leftover debug print of scrubber._post_processors was dropped; it
     # only added noise to the test output.
     text = scrubber.clean("hello from [email protected]")
     self.assertEqual(text, "hello from <<5A337A5C25F9D260>>")
Esempio n. 24
0
    def test_iter_not_return_filth(self):
        """A detector that yields something other than Filth raises TypeError."""
        class BadDetector(scrubadub.detectors.Detector):
            name = 'bad_detector'

            # TODO: investigate below
            def iter_filth(self, text, **kwargs):
                yield 'Non-filth'

        scrubber = scrubadub.Scrubber(detector_list=[BadDetector()])
        # The iterator is consumed inside the callable so the error is raised
        # under assertRaises.
        self.assertRaises(
            TypeError,
            lambda: list(scrubber.iter_filth('A fake document with no pii')),
        )
Esempio n. 25
0
 def test_add_post_processor_instance_with_name(self):
     """A second post-processor of the same class can be added under a new name."""
     pp = scrubadub.post_processors
     initial_processors = [
         pp.FilthTypeReplacer(name='typeinator'),
         pp.PrefixSuffixReplacer(name='prefixor'),
     ]
     scrubber = scrubadub.Scrubber(post_processor_list=initial_processors)
     scrubber.add_post_processor(
         pp.PrefixSuffixReplacer(name='prefixor_two'))
     self.assertEqual(len(scrubber._post_processors), 3)

     found = list(scrubber.iter_filth('hello [email protected]'))
     self.assertEqual(len(found), 1)
     # Two PrefixSuffixReplacers ran, so the placeholder is wrapped twice.
     self.assertEqual(found[0].replacement_string, '{{{{EMAIL}}}}')
Esempio n. 26
0
    def test_add_detector_instance(self):
        """A detector can be added by class and by name to an empty scrubber."""
        scrubber = scrubadub.Scrubber(detector_list=[])

        def assert_only_email():
            # Exactly one detector is registered, under the key 'email'.
            self.assertEqual(len(scrubber._detectors), 1)
            self.assertEqual(list(scrubber._detectors.keys()), ['email'])

        # Add by class
        scrubber.add_detector(scrubadub.detectors.email.EmailDetector)
        assert_only_email()

        scrubber.remove_detector('email')
        self.assertEqual(len(scrubber._detectors), 0)

        # Add by name
        scrubber.add_detector('email')
        assert_only_email()
Esempio n. 27
0
 def test_clean_customize_filth_identification(self):
     """
     BEFORE: contact Joe Duffy at [email protected]
     AFTER:  contact <b>NAME</b> <b>NAME</b> at <b>EMAIL</b>
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     prefix = scrubadub.filth.base.Filth.prefix
     suffix = scrubadub.filth.base.Filth.suffix
     scrubadub.filth.base.Filth.prefix = u'<b>'
     scrubadub.filth.base.Filth.suffix = u'</b>'
     try:
         scrubber = scrubadub.Scrubber()
         self.check_equal(after, scrubber.clean(before))
     finally:
         # Filth.prefix/suffix are class attributes. Restore them even when
         # the check fails, otherwise the '<b>' markers leak into every test
         # that runs afterwards.
         scrubadub.filth.base.Filth.prefix = prefix
         scrubadub.filth.base.Filth.suffix = suffix
Esempio n. 28
0
def scrub(df, column):
    """Run one column of ``df`` through a customised scrubadub scrubber.

    The url, twitter and email detectors are removed from a default
    ``Scrubber``; SSN, passport, alien-id and driving-licence detectors are
    added. ``df[column]`` is overwritten with the cleaned values and the same
    ``df`` is returned.
    """
    # scrubadub.filth.date_of_birth.DateOfBirthFilth.min_age_years = 5
    scrubber = scrubadub.Scrubber()
    # scrubber.add_detector(scrubadub.detectors.date_of_birth.DateOfBirthDetector())
    for unwanted in ("url", "twitter", "email"):
        scrubber.remove_detector(unwanted)

    # Detectors are registered in exactly this order.
    extra_detectors = (
        SSNDetector,
        PassportDetector,
        AlienIdDetector,
        # Test breaking detectors
        FL_DLDetector,
        HI_NE_VA_DLDetector,
        IL_DLDetector,
        MN_FL_MD_MI_DLDetector,
        MO_OK_DLDetector,
        MD_DLDetector,
        # Working Detectors
        CA_DLDetector,
        CO_DLDetector,
        ID_DLDetector,
        NJ_DLDetector,
        NY_DLDetector,
        ND_DLDetector,
        OH_DLDetector,
        PA_DLDetector,
        VT_DLDetector,
        VA_DLDetector,
        WA_DLDetector,
        WV_DLDetector,
        WI_DLDetector,
        WY_DLDetector,
        NH_DLDetector,
        IO_DLDetector,
        IA_DLDetector,
        KS_DLDetector,
        KY_DLDetector,
        MI_DLDetector,
    )
    for detector_cls in extra_detectors:
        scrubber.add_detector(detector_cls)

    # The bound method is applied to every cell of the column.
    df[column] = df[column].apply(scrubber.clean)
    # analyzer = AnalyzerEngine()
    # anonymizer = AnonymizerEngine()
    # entities = ["PHONE_NUMBER","CREDIT_CARD","US_DRIVER_LICENSE","US_SSN","EMAIL_ADDRESS","IP_ADDRESS"]
    # scrub_2 = lambda x: anonymizer.anonymize(text=x,analyzer_results=analyzer.analyze(text=x,entities=entities,language='en')).text
    # df[column] = df[column].apply(scrub_2)
    return df
Esempio n. 29
0
    def test_add_post_processor_order(self):
        """Post-processors keep insertion order, and ``index`` inserts at a position."""
        scrubber = scrubadub.Scrubber()
        scrubber.add_post_processor(
            scrubadub.post_processors.FilthTypeReplacer(name='one'))
        scrubber.add_post_processor(
            scrubadub.post_processors.HashReplacer(name='two',
                                                   salt='example_salt',
                                                   include_filth_type=False))
        scrubber.add_post_processor(
            scrubadub.post_processors.PrefixSuffixReplacer(name='three',
                                                           prefix='<<',
                                                           suffix='>>'))

        # Comparing the ordered name list checks every position at once and
        # shows the whole order when it fails.
        self.assertEqual(
            [x.name for x in scrubber._post_processors],
            ['one', 'two', 'three'])

        # index=0 must put the new post-processor in front of the others.
        scrubber.add_post_processor(
            scrubadub.post_processors.FilthTypeReplacer(name='zero'), index=0)

        self.assertEqual(
            [x.name for x in scrubber._post_processors],
            ['zero', 'one', 'two', 'three'])
Esempio n. 30
0
 def test_customize_filth_identification(self):
     """
     BEFORE: contact me at [email protected]
     AFTER:  contact me at <b>EMAIL</b>
     """
     # NOTE(review): get_before_after() is called without arguments, so it
     # presumably reads BEFORE/AFTER from the docstring above. Keep that text
     # intact.
     before, after = self.get_before_after()
     import scrubadub
     filth_base = scrubadub.filth.base.Filth
     saved_prefix = filth_base.prefix
     saved_suffix = filth_base.suffix
     filth_base.prefix = u'<b>'
     filth_base.suffix = u'</b>'
     try:
         cleaned = scrubadub.Scrubber().clean(before)
         self.check_equal(after, cleaned)
     finally:
         # Put the class-level markers back whatever happened above.
         filth_base.prefix = saved_prefix
         filth_base.suffix = saved_suffix