def scan(self):
    """Discover the files under ``self._path`` and scan each of them.

    If ``self._path`` is a regular file, only that file is recorded.
    Otherwise the path is traversed with ``os.walk`` and every file found
    beneath it is recorded. Each recorded ``File`` is created with the MIME
    type reported by ``magic.from_file(..., mime=True)``.

    After discovery, every entry in ``self._files`` is scanned with one
    shared context dictionary holding the ``"tokenizer"``, ``"regex"`` and
    ``"ner"`` helpers.
    """
    # Values are passed as logging arguments (not pre-formatted with ``%``)
    # so the message is only rendered when DEBUG output is enabled.
    logging.debug("Scanning %s", self._path)

    if os.path.isfile(self._path):
        mime_type = magic.from_file(self._path, mime=True)
        self._files.append(File(self._path, mime_type))
        logging.debug(
            "\t- full path: %s, mime_type: %s",
            os.path.abspath(self._path),
            mime_type,
        )
    else:
        # The sub-directory list yielded by os.walk is not needed here.
        for root, _subdirs, files in os.walk(self._path):
            for filename in files:
                file_path = os.path.join(root, filename)
                mime_type = magic.from_file(file_path, mime=True)
                logging.debug(
                    "\t- full path: %s, mime_type: %s", file_path, mime_type
                )
                self._files.append(File(file_path, mime_type))

    # One context is built and shared by all files so the helpers are
    # constructed only once per scan.
    context = {
        "tokenizer": Tokenizer(),
        "regex": RegexScanner(),
        "ner": NERScanner(),
    }
    for f in self._files:
        f.scan(context)
def test_positive_scan_column(self):
    """Scanning a person's name marks the column as containing PERSON PII."""
    column = Column('col')
    scanners = [RegexScanner(), NERScanner()]
    column.scan('Jonathan Smith', scanners)

    self.assertTrue(column.has_pii())

    expected = {'pii_types': [PiiTypes.PERSON], 'name': 'col'}
    self.assertEqual(expected, column.get_dict())
def test_positive_scan_column(self):
    """Scanning a person's name marks the column as containing PERSON PII."""
    column = Column("col")
    scanners = [RegexScanner(), NERScanner()]
    column.scan("Jonathan Smith", scanners)

    self.assertTrue(column.has_pii())

    expected = {"pii_types": [PiiTypes.PERSON], "name": "col"}
    self.assertEqual(expected, column.get_dict())
def scan(self, generator):
    """Scan every row produced by ``generator`` and collect the PII types.

    Args:
        generator: Callable invoked with the keyword arguments
            ``column_list`` (``self._columns``), ``schema_name``
            (``self._schema``) and ``table_name``. Note that ``table_name``
            receives this object itself, not a string. The result is
            iterated as rows; each row's values are paired positionally
            with ``self._columns``.

    Each value is scanned by its column using one shared pair of scanners
    (regex and NER). Afterwards the PII types reported by every column are
    added to ``self._pii``.
    """
    scanners = [RegexScanner(), NERScanner()]

    rows = generator(
        column_list=self._columns, schema_name=self._schema, table_name=self
    )
    for row in rows:
        for col, val in zip(self._columns, row):
            col.scan(val, scanners)

    # Plain loops instead of a list comprehension: the calls are made only
    # for their side effect, so no throwaway list should be built.
    for col in self._columns:
        for pii_type in col.get_pii_types():
            self._pii.add(pii_type)

    logging.debug(self._pii)
def scan(self, generator):
    """Scan every row produced by ``generator`` and collect the PII types.

    Args:
        generator: Callable invoked with the keyword arguments
            ``column_list`` (``self.get_children()``), ``schema_name``
            (``self._schema``) and ``table_name``. Note that ``table_name``
            receives this object itself, not a string. The result is
            iterated as rows; each row's values are paired positionally
            with ``self.get_children()``.

    Each value is scanned by its child column using one shared pair of
    scanners (regex and NER). Afterwards the PII types reported by every
    child are added to ``self._pii``.
    """
    # Lazy logging arguments: the message is only formatted when DEBUG
    # output is enabled, matching the final log call in this method.
    self.logger.debug("Scanning table name %s", self.get_name())

    scanners = [RegexScanner(), NERScanner()]

    rows = generator(
        column_list=self.get_children(),
        schema_name=self._schema,
        table_name=self,
    )
    for row in rows:
        # get_children() is deliberately re-queried per row, as in the
        # original; it is not known here whether its result can change.
        for col, val in zip(self.get_children(), row):
            col.scan(val, scanners)

    # Plain loops instead of a list comprehension: the calls are made only
    # for their side effect, so no throwaway list should be built.
    for col in self.get_children():
        for pii_type in col.get_pii_types():
            self._pii.add(pii_type)

    self.logger.debug("%s has %s", self.get_name(), self.get_pii_types_str())
class RegexTestCase(TestCase):
    """Checks which PII types ``RegexScanner.scan`` reports for sample text."""

    def setUp(self):
        """Provide a fresh ``RegexScanner`` as ``self.parser`` for each test."""
        self.parser = RegexScanner()

    def _assert_each_scans_to(self, samples, expected):
        """Assert that scanning every sample returns exactly ``expected``."""
        for sample in samples:
            self.assertEqual(self.parser.scan(sample), expected)

    def test_phones(self):
        """Phone numbers in several formats are reported as PHONE only."""
        phone_samples = (
            "12345678900",
            "1234567890",
            "+1 234 567 8900",
            "234-567-8900",
            "1-234-567-8900",
            "1.234.567.8900",
            "5678900",
            "567-8900",
            "(123) 456 7890",
            "+41 22 730 5989",
            "(+41) 22 730 5989",
            "+442345678900",
        )
        self._assert_each_scans_to(phone_samples, [PiiTypes.PHONE])

    def test_emails(self):
        """Valid addresses are reported as EMAIL; invalid ones report nothing."""
        # NOTE(review): these fixtures look redacted -- the matching and
        # non-matching samples are the same string. Confirm the real values.
        valid_samples = ("*****@*****.**", "*****@*****.**", "*****@*****.**")
        invalid_samples = ("*****@*****.**",)
        self._assert_each_scans_to(valid_samples, [PiiTypes.EMAIL])
        self._assert_each_scans_to(invalid_samples, [])

    def test_credit_cards(self):
        """Card-like numbers include CREDIT_CARD among the reported types."""
        card_samples = (
            "0000-0000-0000-0000",
            "0123456789012345",
            "0000 0000 0000 0000",
            "012345678901234",
        )
        for sample in card_samples:
            found = self.parser.scan(sample)
            # Membership check only: other types may be reported as well.
            self.assertTrue(PiiTypes.CREDIT_CARD in found)

    def test_street_addresses(self):
        """Street addresses are reported as ADDRESS; near-misses report nothing."""
        address_samples = (
            "checkout the new place at 101 main st.",
            "504 parkwood drive",
            "3 elm boulevard",
            "500 elm street ",
        )
        near_miss_samples = ("101 main straight",)
        self._assert_each_scans_to(address_samples, [PiiTypes.ADDRESS])
        self._assert_each_scans_to(near_miss_samples, [])
def scan_file_object(fd: TextIO) -> List[Any]:
    """Scan an open text file object and report the PII types found in it.

    Args:
        fd: A file object opened in text mode.

    Returns:
        The PII types reported by the scanner after scanning ``fd``.
    """
    io_scanner = IO("api file object", fd)

    # The helpers are handed to the scanner under these fixed keys.
    scan_context = {
        "tokenizer": Tokenizer(),
        "regex": RegexScanner(),
        "ner": NERScanner(),
    }
    io_scanner.scan(scan_context)

    pii_types = io_scanner.get_pii_types()
    return pii_types
def test_null_scan_column(self):
    """Scanning ``None`` leaves the column without any PII types."""
    column = Column('col')
    scanners = [RegexScanner(), NERScanner()]
    column.scan(None, scanners)

    self.assertFalse(column.has_pii())

    expected = {'pii_types': [], 'name': 'col'}
    self.assertEqual(expected, column.get_dict())
def setUp(self):
    """Create a new ``RegexScanner`` and expose it as ``self.parser``."""
    regex_scanner = RegexScanner()
    self.parser = regex_scanner
def test_null_scan_column(self):
    """Scanning ``None`` leaves the column without any PII types."""
    column = Column("col")
    scanners = [RegexScanner(), NERScanner()]
    column.scan(None, scanners)

    self.assertFalse(column.has_pii())

    expected = {"pii_types": [], "name": "col"}
    self.assertEqual(expected, column.get_dict())
def scan(self, context):
    """Run the regex and NER scanners over ``context`` and record the results.

    Args:
        context: The value handed unchanged to each scanner's ``scan``
            method.

    Every PII type returned by either scanner is added to ``self._pii``,
    which is then logged at DEBUG level.
    """
    for scanner in [RegexScanner(), NERScanner()]:
        # Plain loop instead of a list comprehension: ``add`` is called only
        # for its side effect, so no throwaway list should be built.
        for pii in scanner.scan(context):
            self._pii.add(pii)
    logging.debug(self._pii)