def test_list_filth_documents_list(self):
    """Check iter_filth_documents when the documents are given as a list.

    The same filth is expected whether or not the post processors are run,
    and each filth carries the index of its document as ``document_name``.
    """
    scrubber = scrubadub.Scrubber(post_processor_list=[
        scrubadub.post_processors.FilthTypeReplacer(),
    ])
    # The address has to be a detectable e-mail of 19 characters so that it
    # occupies offsets 27-46 of the first document.
    docs = [
        "This is a test message for example@example.com",
        "Hello @Jane call me on +33 4 41 26 62 36.",
    ]
    expected = [
        scrubadub.filth.EmailFilth(
            text='example@example.com', document_name='0',
            detector_name='email', beg=27, end=46, locale='en_US'),
        scrubadub.filth.TwitterFilth(
            text='@Jane', document_name='1',
            detector_name='twitter', beg=6, end=11, locale='en_US'),
        scrubadub.filth.PhoneFilth(
            text='+33 4 41 26 62 36', document_name='1',
            detector_name='phone', beg=23, end=40, locale='en_US'),
    ]

    with_post_processors = list(
        scrubber.iter_filth_documents(docs, run_post_processors=True))
    without_post_processors = list(
        scrubber.iter_filth_documents(docs, run_post_processors=False))

    for filths in (with_post_processors, without_post_processors):
        self.assertEqual(filths, expected)
def test_add_non_detector(self):
    """Adding a class that is not a Detector must raise TypeError."""
    class NotDetector:
        pass

    scrubber = scrubadub.Scrubber()
    self.assertRaises(TypeError, scrubber.add_detector, NotDetector)
def __init__(self, uids_for_initials=True, initials_placeholder='someone'):
    """Build a scrubber that uses the custom name and initials detectors.

    Params:
        uids_for_initials: Whether or not unique identifiers should be used
            to replace initials. If `False`, `initials_placeholder` will be
            used instead.
        initials_placeholder: Placeholder written onto every detector's
            ``filth_cls`` when `uids_for_initials` is `False`.
    """
    super().__init__()
    self._uids_for_initials = uids_for_initials

    self.scrubber = scrubadub.Scrubber()
    # Swap the default name detector for one that doesn't delete keywords.
    self.scrubber.remove_detector("name")
    custom_detectors = (CustomNameDetector, InitialsDetector)

    if not uids_for_initials:
        def _keep_self(_self, other):
            # Merging doesn't respect the placeholder, so never merge.
            return _self

        # noinspection PyProtectedMember
        already_registered = self.scrubber._detectors.values()
        # NOTE(review): this rewrites attributes on the filth_cls objects
        # themselves, so the change is presumably visible outside this
        # instance -- confirm this is intended.
        for detector in (*custom_detectors, *already_registered):
            detector.filth_cls.placeholder = initials_placeholder
            # No '{{' / '}}' wrapping around the placeholder.
            detector.filth_cls.suffix = ''
            detector.filth_cls.prefix = ''
            detector.filth_cls.merge = _keep_self

    for custom in custom_detectors:
        self.scrubber.add_detector(custom)
def test_remove_post_processor(self):
    """Post processors can be removed by name, by class and by instance."""
    typeinator = scrubadub.post_processors.FilthTypeReplacer(
        name='typeinator')
    scrubber = scrubadub.Scrubber(post_processor_list=[typeinator])
    scrubber.add_post_processor(scrubadub.post_processors.HashReplacer)
    scrubber.add_post_processor('filth_type_replacer')

    def assert_remaining(expected_names):
        # Check both the count and the order of what is still registered.
        self.assertEqual(
            len(scrubber._post_processors), len(expected_names))
        self.assertEqual(
            [x.name for x in scrubber._post_processors], expected_names)

    assert_remaining(['typeinator', 'hash_replacer', 'filth_type_replacer'])

    # ... by name
    scrubber.remove_post_processor('filth_type_replacer')
    assert_remaining(['typeinator', 'hash_replacer'])

    # ... by class
    scrubber.remove_post_processor(scrubadub.post_processors.HashReplacer)
    assert_remaining(['typeinator'])

    # ... by instance
    scrubber.remove_post_processor(typeinator)
    assert_remaining([])
def test_add_duplicate_post_processor(self):
    """Adding a post processor that is already registered raises KeyError."""
    scrubber = scrubadub.Scrubber()
    hash_replacer = scrubadub.post_processors.HashReplacer
    scrubber.add_post_processor(hash_replacer)
    self.assertRaises(KeyError, scrubber.add_post_processor, hash_replacer)
def test_filth_merge(self):
    """Filth found in the same span of text should come back as one item."""
    # 'me at john.doe' looks like an email address, and the text also
    # mentions skype.
    text = "you can skype me at john.doe"
    found = list(scrubadub.Scrubber().iter_filth(text))
    self.assertEqual(len(found), 1)
def generate_and_scrub(locale, filth_list, detectors, n_docs: int = 50):
    """Generate fake documents containing filth and scrub them.

    Args:
        locale: Locale handed to the document generator, the
            KnownFilthDetector and the Scrubber.
        filth_list: Names of the filth types to plant in each document. Each
            document gets two paragraphs per filth type.
        detectors: Detector names to run. The list is not modified, so the
            same list can safely be passed to several calls.
        n_docs: Number of documents to generate.

    Returns:
        The list of filth yielded by ``Scrubber.iter_filth_documents`` over
        the generated documents.
    """
    documents = []
    known_pii = []

    click.echo("Generating {} docs with filth: {}".format(
        locale, ", ".join(filth_list)))
    start_time = time.time()
    for _ in range(n_docs):
        new_doc, new_known_pii = make_fake_document(
            paragraphs=2 * len(filth_list),
            locale=locale,
            seed=None,
            filth_types=filth_list,
        )
        documents.append(new_doc)
        known_pii.extend(new_known_pii)

    scrubber_time = time.time()
    click.echo("Scrubbing with detectors: {}".format(', '.join(detectors)))
    # Build a new list rather than appending to the caller's: appending would
    # leave a detector instance in `detectors`, and the join above would then
    # raise TypeError on the next call that reuses the same list.
    detector_list = [
        *detectors,
        scrubadub.detectors.KnownFilthDetector(
            locale=locale, known_filth_items=known_pii),
    ]
    scrubber = scrubadub.Scrubber(locale=locale, detector_list=detector_list)
    found_filth = list(scrubber.iter_filth_documents(documents))
    end_time = time.time()

    click.echo("Documents generated in {:.2f}s".format(
        scrubber_time - start_time))
    click.echo("Scrubbed documents in {:.2f}s".format(
        end_time - scrubber_time))
    return found_filth
def test_not_implemented_locale(self):
    """Adding PostalCodeDetector to an 'fr_FR' scrubber emits a UserWarning."""
    scrubber = scrubadub.Scrubber(locale='fr_FR')
    with warnings.catch_warnings():
        # Turn warnings into exceptions so assertRaises can observe them.
        warnings.simplefilter("error")
        self.assertRaises(
            UserWarning,
            scrubber.add_detector,
            scrubadub.detectors.PostalCodeDetector,
        )
def test_list_filth_documents_list(self):
    """Check iter_filth_documents when the documents are given as a list.

    The same filth is expected whether or not the post processors are run,
    and each filth carries the index of its document as ``document_name``.
    """
    scrubber = scrubadub.Scrubber(post_processor_list=[
        scrubadub.post_processors.FilthTypeReplacer(),
    ])
    # The address has to be a detectable e-mail of 19 characters so that it
    # occupies offsets 27-46 of the first document.
    docs = [
        "This is a test message for example@example.com",
        "Hello Jane, I am Tom.",
    ]
    expected = [
        scrubadub.filth.EmailFilth(
            text='example@example.com', document_name='0',
            detector_name='email', beg=27, end=46),
        scrubadub.filth.NameFilth(
            text='Jane', document_name='1',
            detector_name='name', beg=6, end=10),
        scrubadub.filth.NameFilth(
            text='Tom', document_name='1',
            detector_name='name', beg=17, end=20),
    ]

    with_post_processors = list(
        scrubber.iter_filth_documents(docs, run_post_processors=True))
    without_post_processors = list(
        scrubber.iter_filth_documents(docs, run_post_processors=False))

    for filths in (with_post_processors, without_post_processors):
        self.assertEqual(filths, expected)
def test_filth_merge_placeholder(self):
    """Each filth found is a MergedFilth whose placeholder names both types."""
    text = "you can skype me at john.doe"
    scrubber = scrubadub.Scrubber()
    for merged in scrubber.iter_filth(text):
        self.assertIsInstance(merged, MergedFilth)
        for filth_type in ('SKYPE', 'EMAIL'):
            # The placeholder doubles as the failure message.
            self.assertTrue(
                filth_type in merged.placeholder, merged.placeholder)
def clean_if(x):
    """Scrub ``x`` if it is a string; return any other value untouched.

    Strings are cleaned with a Scrubber that has the 'name' and 'url'
    detectors removed.

    Args:
        x: Any value, e.g. a cell of a mixed-type column.

    Returns:
        The scrubbed text when ``x`` is a ``str``, otherwise ``x`` itself.
    """
    # Return early so that no Scrubber is built for values it would never see.
    if not isinstance(x, str):
        return x

    scrubber = scrubadub.Scrubber()
    scrubber.remove_detector('name')
    scrubber.remove_detector('url')
    return scrubber.clean(x)
def test_scrubber_clean(self):
    """test older scrubber API (the module-level ``scrubadub.clean``)"""
    # No Scrubber instance is built here: the assertion goes through the
    # module-level helper and expects the e-mail to be replaced.
    text = "contact Joe Duffy at joe@example.com"
    self.assertEqual(
        scrubadub.clean(text),
        "contact {{NAME}} {{NAME}} at {{EMAIL}}",
    )
def main():
    """Benchmark the default scrubber against 50 generated documents.

    Generates the documents, scrubs them with the default detectors plus a
    KnownFilthDetector fed with the planted items, prints both timings and
    the filth classification report, then exits with status 0.
    """
    documents = []
    planted_pii = []
    # address_docs = []
    # uk_phone_docs = []
    # known_address_pii = []
    # known_uk_phone_pii = []

    generation_started = time.time()
    for seed in range(50):
        filth_types = [
            'email', 'name', 'phone', 'postalcode', 'ssn', 'twitter', 'url'
        ]
        # Two paragraphs per filth type; the loop index doubles as the seed.
        document, document_pii = make_fake_document(
            paragraphs=2 * len(filth_types),
            seed=seed,
            filth_types=filth_types,
        )
        documents.append(document)
        planted_pii.extend(document_pii)

        # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
        # address_docs.append(new_doc)
        # known_address_pii += new_known_pii

        # new_doc, new_known_pii = make_fake_document(
        #     faker=faker.Faker(locale='en_gb'), paragraphs=2, seed=i_doc, filth_types=['phone']
        # )
        # uk_phone_docs.append(new_doc)
        # known_uk_phone_pii += new_known_pii

    scrubbing_started = time.time()
    scrubber = scrubadub.Scrubber()
    # scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector())
    known_filth_detector = scrubadub.detectors.KnownFilthDetector(
        known_filth_items=planted_pii)
    scrubber.add_detector(known_filth_detector)
    filth_list = list(scrubber.iter_filth_documents(documents))

    # scrubber = scrubadub.Scrubber(detector_list=[
    #     scrubadub.detectors.address.GBAddressDetector(),
    #     scrubadub.detectors.address.USAddressDetector(),
    #     scrubadub.detectors.KnownFilthDetector(predefined_pii=known_address_pii),
    # ])
    # filth_list += list(scrubber.iter_filth_documents(address_docs))

    # scrubber = scrubadub.Scrubber(detector_list=[
    #     scrubadub.detectors.PhoneDetector(region='gb', name='gb_phone'),
    #     scrubadub.detectors.KnownFilthDetector(predefined_pii=known_uk_phone_pii),
    # ])
    # filth_list += list(scrubber.iter_filth_documents(uk_phone_docs))
    scrubbing_finished = time.time()

    generation_seconds = scrubbing_started - generation_started
    scrubbing_seconds = scrubbing_finished - scrubbing_started
    print("Documents generated in {:.2f}s".format(generation_seconds))
    print("Scrubbed documents in {:.2f}s".format(scrubbing_seconds))
    print(get_filth_classification_report(filth_list))

    sys.exit(0)
def test_disable_name(self):
    """
    BEFORE: My nino is AB 123456 C
    AFTER: My nino is {{NINO}}
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data -- confirm
    # before editing it.
    import scrubadub

    before, after = self.get_before_after()
    # The expected replacement must still happen without the 'name' detector.
    nameless_scrubber = scrubadub.Scrubber()
    nameless_scrubber.remove_detector('name')
    cleaned = nameless_scrubber.clean(before)
    self.check_equal(after, cleaned)
def test_filth_ordering(self):
    """make sure filth is returned in order"""
    scrubber = scrubadub.Scrubber()
    # The e-mail has to be a real address so that there is filth to order
    # between the name and the phone number.
    text = ("Alan can be reached by email alan@example.com or "
            "phone +1.312.456.6421")
    # Flatten to beg0, end0, beg1, end1, ...: the sequence is already sorted
    # only if the filth comes back in text order.
    order = [
        position
        for filth in scrubber.iter_filth(text)
        for position in (filth.beg, filth.end)
    ]
    self.assertEqual(sorted(order), order)
def test_disable_passport(self):
    """
    BEFORE: My credit card is 4111111111111111.
    AFTER: My credit card is {{CREDITCARD}}.
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data -- confirm
    # before editing it.
    import scrubadub

    before, after = self.get_before_after()
    # The expected replacement must still happen without the 'passport'
    # detector.
    without_passport = scrubadub.Scrubber()
    without_passport.remove_detector('passport')
    result = without_passport.clean(before)
    self.check_equal(after, result)
def test_vehicle_disable_name(self):
    """
    BEFORE: My number plate is AB12 ABC.
    AFTER: My number plate is {{VEHICLE}}.
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data -- confirm
    # before editing it.
    import scrubadub

    before, after = self.get_before_after()
    # The expected replacement must still happen without the 'name' detector.
    no_name_scrubber = scrubadub.Scrubber()
    no_name_scrubber.remove_detector('name')
    scrubbed = no_name_scrubber.clean(before)
    self.check_equal(after, scrubbed)
def test_list_document(self):
    """check we can clean a list of documents"""
    # Documents without filth must come back unchanged.
    text = [
        'the apple was eaten by a shark',
        'the apple was not eaten by the fish',
    ]
    scrubber = scrubadub.Scrubber()
    self.assertEqual(scrubber.clean_documents(text), text)

    # Only the document that contains an e-mail address changes; the address
    # has to be a real one for the detector to replace it.
    text_dirty = [
        'shark sent example@example.com a complaint',
        'the fish swam on by',
    ]
    text_clean = [
        'shark sent {{EMAIL}} a complaint',
        'the fish swam on by',
    ]
    scrubber = scrubadub.Scrubber()
    self.assertEqual(scrubber.clean_documents(text_dirty), text_clean)
def test_disable_email(self):
    """
    BEFORE: contact me at example@example.com
    AFTER: contact me at example@example.com
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data. It must hold a
    # real address, otherwise the test would pass even if the 'email'
    # detector were still active.
    before, after = self.get_before_after()
    import scrubadub
    scrubber = scrubadub.Scrubber()
    scrubber.remove_detector('email')
    self.check_equal(after, scrubber.clean(before))
def _get_scrubber():
    """Return a default Scrubber without email, name and phone, plus TIN.

    Names and emails are not looked at; those detectors (and the phone
    detector) are removed for now to prevent false positives.
    """
    scrubber = scrubadub.Scrubber()
    for unwanted in ('email', 'name', 'phone'):
        scrubber.remove_detector(unwanted)
    scrubber.add_detector(TINDetector)
    return scrubber
def test_disable_name(self):
    """
    BEFORE: My passport number is 5333800068GBR8812049F2509286.
    AFTER: My passport number is {{PASSPORT}}.
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data -- confirm
    # before editing it.
    import scrubadub

    before, after = self.get_before_after()
    # Despite the test's name, the detector removed here is 'nino'.
    without_nino = scrubadub.Scrubber()
    without_nino.remove_detector('nino')
    output = without_nino.clean(before)
    self.check_equal(after, output)
def test_add_non_post_processor(self):
    """Adding something that is not a post processor must be rejected.

    An unrelated class raises TypeError; an unknown name raises ValueError.
    """
    class NotPostProcessor:
        pass

    scrubber = scrubadub.Scrubber()
    rejected = (
        (TypeError, NotPostProcessor),
        (ValueError, 'not_really_the_name_of_a_detector'),
    )
    for expected_error, candidate in rejected:
        self.assertRaises(
            expected_error, scrubber.add_post_processor, candidate)
def test_add_post_processor_instance(self):
    """make sure adding some post processors work"""
    scrubber = scrubadub.Scrubber()
    scrubber.add_post_processor(
        scrubadub.post_processors.HashReplacer(salt='example_salt',
                                               include_filth_type=False))
    scrubber.add_post_processor(
        scrubadub.post_processors.PrefixSuffixReplacer(prefix='<<',
                                                       suffix='>>'))
    print(scrubber._post_processors)
    # NOTE(review): the address has to be a real e-mail for there to be
    # anything to hash. The expected digest is presumably the hash of
    # 'example@example.com' with salt 'example_salt' -- verify against a run.
    text = scrubber.clean("hello from example@example.com")
    self.assertEqual(text, "hello from <<5A337A5C25F9D260>>")
def test_iter_not_return_filth(self):
    """A detector that yields something other than filth raises TypeError."""
    class BadDetector(scrubadub.detectors.Detector):
        name = 'bad_detector'

        # TODO: investigate below
        def iter_filth(self, text, **kwargs):
            yield 'Non-filth'

    document = 'A fake document with no pii'
    scrubber = scrubadub.Scrubber(detector_list=[BadDetector()])
    with self.assertRaises(TypeError):
        # Wrap in list() so the iterator is actually consumed.
        list(scrubber.iter_filth(document))
def test_add_post_processor_instance_with_name(self):
    """make sure adding a duplicate post_processors with a different name works"""
    scrubber = scrubadub.Scrubber(post_processor_list=[
        scrubadub.post_processors.FilthTypeReplacer(name='typeinator'),
        scrubadub.post_processors.PrefixSuffixReplacer(name='prefixor'),
    ])
    # Same class as 'prefixor', registered under a different name.
    scrubber.add_post_processor(
        scrubadub.post_processors.PrefixSuffixReplacer(
            name='prefixor_two'))
    self.assertEqual(len(scrubber._post_processors), 3)

    # The address has to be a real e-mail so that exactly one filth is found.
    filth = list(scrubber.iter_filth('hello example@example.com'))
    self.assertEqual(len(filth), 1)
    # Two prefix/suffix replacers wrap the type name twice.
    self.assertEqual(filth[0].replacement_string, '{{{{EMAIL}}}}')
def test_add_detector_instance(self):
    """A detector can be added by class, removed by name and re-added by name."""
    scrubber = scrubadub.Scrubber(detector_list=[])

    def assert_only_email_registered():
        self.assertEqual(len(scrubber._detectors), 1)
        self.assertEqual(list(scrubber._detectors.keys()), ['email'])

    # Add by class.
    scrubber.add_detector(scrubadub.detectors.email.EmailDetector)
    assert_only_email_registered()

    scrubber.remove_detector('email')
    self.assertEqual(len(scrubber._detectors), 0)

    # Add by name.
    scrubber.add_detector('email')
    assert_only_email_registered()
def test_clean_customize_filth_identification(self):
    """
    BEFORE: contact Joe Duffy at joe@example.com
    AFTER: contact <b>NAME</b> <b>NAME</b> at <b>EMAIL</b>
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data and must hold
    # a real e-mail address for <b>EMAIL</b> to be produced.
    before, after = self.get_before_after()
    import scrubadub
    prefix = scrubadub.filth.base.Filth.prefix
    suffix = scrubadub.filth.base.Filth.suffix
    scrubadub.filth.base.Filth.prefix = u'<b>'
    scrubadub.filth.base.Filth.suffix = u'</b>'
    try:
        scrubber = scrubadub.Scrubber()
        self.check_equal(after, scrubber.clean(before))
    finally:
        # Restore the class-level markers even when the check fails, so the
        # override cannot leak into other tests.
        scrubadub.filth.base.Filth.prefix = prefix
        scrubadub.filth.base.Filth.suffix = suffix
def scrub(df, column):
    """Run one column of ``df`` through scrubadub with the custom detectors.

    The url, twitter and email detectors are removed from the default
    scrubber; the SSN, passport, alien-id and driving-licence detectors are
    added.

    Args:
        df: Object supporting ``df[column].apply(...)`` and item assignment
            (presumably a pandas DataFrame).
        column: Key of the column to scrub.

    Returns:
        The same ``df``, with ``column`` replaced by its scrubbed values.
    """
    # scrubadub.filth.date_of_birth.DateOfBirthFilth.min_age_years = 5
    scrubber = scrubadub.Scrubber()
    # scrubber.add_detector(scrubadub.detectors.date_of_birth.DateOfBirthDetector())
    for unwanted in ("url", "twitter", "email"):
        scrubber.remove_detector(unwanted)

    # Registration order is kept exactly as it was.
    extra_detectors = (
        SSNDetector,
        PassportDetector,
        AlienIdDetector,
        # Test breaking detectors
        FL_DLDetector,
        HI_NE_VA_DLDetector,
        IL_DLDetector,
        MN_FL_MD_MI_DLDetector,
        MO_OK_DLDetector,
        MD_DLDetector,
        # Working Detectors
        CA_DLDetector,
        CO_DLDetector,
        ID_DLDetector,
        NJ_DLDetector,
        NY_DLDetector,
        ND_DLDetector,
        OH_DLDetector,
        PA_DLDetector,
        VT_DLDetector,
        VA_DLDetector,
        WA_DLDetector,
        WV_DLDetector,
        WI_DLDetector,
        WY_DLDetector,
        NH_DLDetector,
        IO_DLDetector,
        IA_DLDetector,
        KS_DLDetector,
        KY_DLDetector,
        MI_DLDetector,
    )
    for detector in extra_detectors:
        scrubber.add_detector(detector)

    df[column] = df[column].apply(lambda value: scrubber.clean(value))

    # analyzer = AnalyzerEngine()
    # anonymizer = AnonymizerEngine()
    # entities = ["PHONE_NUMBER","CREDIT_CARD","US_DRIVER_LICENSE","US_SSN","EMAIL_ADDRESS","IP_ADDRESS"]
    # scrub_2 = lambda x: anonymizer.anonymize(text=x,analyzer_results=analyzer.analyze(text=x,entities=entities,language='en')).text
    # df[column] = df[column].apply(scrub_2)
    return df
def test_add_post_processor_order(self):
    """Post processors keep insertion order, and ``index`` inserts in place."""
    scrubber = scrubadub.Scrubber()
    scrubber.add_post_processor(
        scrubadub.post_processors.FilthTypeReplacer(name='one'))
    scrubber.add_post_processor(
        scrubadub.post_processors.HashReplacer(
            name='two', salt='example_salt', include_filth_type=False))
    scrubber.add_post_processor(
        scrubadub.post_processors.PrefixSuffixReplacer(
            name='three', prefix='<<', suffix='>>'))

    def position_of(name):
        # Index of the first post processor registered under `name`.
        return [
            i for i, x in enumerate(scrubber._post_processors)
            if x.name == name
        ][0]

    for expected_index, name in enumerate(('one', 'two', 'three')):
        self.assertEqual(position_of(name), expected_index)

    # Inserting at index 0 shifts every existing post processor down by one.
    scrubber.add_post_processor(
        scrubadub.post_processors.FilthTypeReplacer(name='zero'), index=0)
    print(scrubber._post_processors)

    for expected_index, name in enumerate(('zero', 'one', 'two', 'three')):
        self.assertEqual(position_of(name), expected_index)
def test_customize_filth_identification(self):
    """
    BEFORE: contact me at example@example.com
    AFTER: contact me at <b>EMAIL</b>
    """
    # NOTE(review): get_before_after() presumably reads the BEFORE/AFTER
    # lines of the docstring above, so that text is test data and must hold
    # a real e-mail address for <b>EMAIL</b> to be produced.
    before, after = self.get_before_after()
    import scrubadub
    prefix = scrubadub.filth.base.Filth.prefix
    suffix = scrubadub.filth.base.Filth.suffix
    scrubadub.filth.base.Filth.prefix = u'<b>'
    scrubadub.filth.base.Filth.suffix = u'</b>'
    try:
        scrubber = scrubadub.Scrubber()
        self.check_equal(after, scrubber.clean(before))
    finally:
        # Ensure that this is reset, no matter what happens above
        scrubadub.filth.base.Filth.prefix = prefix
        scrubadub.filth.base.Filth.suffix = suffix