Example #1
0
    def scan(self):
        """Register the file(s) at ``self._path`` and scan each one for PII.

        When ``self._path`` is a regular file, only that file is registered.
        Otherwise the path is walked with ``os.walk`` and every file found
        beneath it is registered.  Each file is wrapped in a ``File`` together
        with the MIME type reported by ``magic.from_file`` and appended to
        ``self._files``.  Every entry of ``self._files`` is then scanned with
        one shared context holding the tokenizer, regex and NER scanners.
        """
        # Values are passed as logging arguments so the message is only
        # formatted when DEBUG output is actually emitted.
        logging.debug("Scanning %s", self._path)
        if os.path.isfile(self._path):
            mime_type = magic.from_file(self._path, mime=True)
            self._files.append(File(self._path, mime_type))
            logging.debug(
                "\t- full path: %s, mime_type: %s",
                os.path.abspath(self._path),
                mime_type,
            )
        else:
            # The directory list yielded by os.walk is not needed here.
            for root, _subdirs, files in os.walk(self._path):
                for filename in files:
                    file_path = os.path.join(root, filename)
                    mime_type = magic.from_file(file_path, mime=True)

                    logging.debug(
                        "\t- full path: %s, mime_type: %s", file_path, mime_type
                    )
                    self._files.append(File(file_path, mime_type))

        # The scanners are built once and shared by every file.
        context = {
            "tokenizer": Tokenizer(),
            "regex": RegexScanner(),
            "ner": NERScanner(),
        }
        for f in self._files:
            f.scan(context)
Example #2
0
 def test_positive_scan_column(self):
     """Scanning 'Jonathan Smith' must flag the column with the PERSON type."""
     column = Column('col')
     scanners = [RegexScanner(), NERScanner()]
     column.scan('Jonathan Smith', scanners)
     self.assertTrue(column.has_pii())

     # Expected serialized form of a column that holds a person's name.
     expected = {'pii_types': [PiiTypes.PERSON], 'name': 'col'}
     self.assertEqual(expected, column.get_dict())
 def test_positive_scan_column(self):
     """Scanning "Jonathan Smith" must flag the column with the PERSON type."""
     column = Column("col")
     scanners = [RegexScanner(), NERScanner()]
     column.scan("Jonathan Smith", scanners)
     self.assertTrue(column.has_pii())

     # Expected serialized form of a column that holds a person's name.
     expected = {"pii_types": [PiiTypes.PERSON], "name": "col"}
     self.assertEqual(expected, column.get_dict())
Example #4
0
    def scan(self, generator):
        """Scan every row produced by ``generator`` and collect the PII found.

        Args:
            generator: Callable invoked with the keyword arguments
                ``column_list``, ``schema_name`` and ``table_name``; its
                result is iterated as rows.  The values of each row are
                paired positionally with ``self._columns``.
        """
        # One set of scanners is shared by every column and every row.
        scanners = [RegexScanner(), NERScanner()]
        for row in generator(column_list=self._columns,
                             schema_name=self._schema,
                             table_name=self):
            for col, val in zip(self._columns, row):
                col.scan(val, scanners)

        # Plain loops: adding to self._pii is a side effect, so there is no
        # reason to build a list of the return values.
        for col in self._columns:
            for pii_type in col.get_pii_types():
                self._pii.add(pii_type)

        logging.debug(self._pii)
    def scan(self, generator):
        """Scan every row produced by ``generator`` and collect the PII found.

        Args:
            generator: Callable invoked with the keyword arguments
                ``column_list``, ``schema_name`` and ``table_name``; its
                result is iterated as rows.  The values of each row are
                paired positionally with the columns returned by
                ``self.get_children()``.
        """
        # Lazy logging arguments, matching the closing debug call below.
        self.logger.debug("Scanning table name %s", self.get_name())
        # One set of scanners is shared by every column and every row.
        scanners = [RegexScanner(), NERScanner()]
        for row in generator(column_list=self.get_children(),
                             schema_name=self._schema,
                             table_name=self):
            for col, val in zip(self.get_children(), row):
                col.scan(val, scanners)

        # Plain loops: adding to self._pii is a side effect, so there is no
        # reason to build a list of the return values.
        for col in self.get_children():
            for pii_type in col.get_pii_types():
                self._pii.add(pii_type)

        self.logger.debug("%s has %s", self.get_name(),
                          self.get_pii_types_str())
Example #6
0
class RegexTestCase(TestCase):
    """Checks ``RegexScanner.scan`` against sample phone, email, card and address text.

    Each sample is asserted inside ``subTest`` so that a failure names the
    offending input and, on runners that support sub-tests, the remaining
    samples are still checked.
    """

    def setUp(self):
        """Create a fresh scanner for every test method."""
        self.parser = RegexScanner()

    def test_phones(self):
        """Each phone-number format must yield exactly ``[PiiTypes.PHONE]``."""
        matching = [
            "12345678900",
            "1234567890",
            "+1 234 567 8900",
            "234-567-8900",
            "1-234-567-8900",
            "1.234.567.8900",
            "5678900",
            "567-8900",
            "(123) 456 7890",
            "+41 22 730 5989",
            "(+41) 22 730 5989",
            "+442345678900",
        ]
        for text in matching:
            with self.subTest(text=text):
                self.assertEqual(self.parser.scan(text), [PiiTypes.PHONE])

    def test_emails(self):
        """Matching samples yield ``[PiiTypes.EMAIL]``; the others yield ``[]``."""
        # NOTE(review): the sample addresses look masked in this copy. The
        # same literal appears in both lists, so both groups of assertions
        # cannot hold at once -- restore the real fixtures before relying on
        # this test.
        matching = [
            "*****@*****.**", "*****@*****.**", "*****@*****.**"
        ]
        non_matching = ["*****@*****.**"]
        for text in matching:
            with self.subTest(text=text):
                self.assertEqual(self.parser.scan(text), [PiiTypes.EMAIL])
        for text in non_matching:
            with self.subTest(text=text):
                self.assertEqual(self.parser.scan(text), [])

    def test_credit_cards(self):
        """Each card-like number must include ``PiiTypes.CREDIT_CARD`` in the result."""
        matching = [
            "0000-0000-0000-0000",
            "0123456789012345",
            "0000 0000 0000 0000",
            "012345678901234",
        ]
        for text in matching:
            with self.subTest(text=text):
                # assertIn reports the scan result itself when the check fails.
                self.assertIn(PiiTypes.CREDIT_CARD, self.parser.scan(text))

    def test_street_addresses(self):
        """Street addresses yield ``[PiiTypes.ADDRESS]``; a near miss yields ``[]``."""
        matching = [
            "checkout the new place at 101 main st.",
            "504 parkwood drive",
            "3 elm boulevard",
            "500 elm street ",
        ]
        non_matching = ["101 main straight"]

        for text in matching:
            with self.subTest(text=text):
                self.assertEqual(self.parser.scan(text), [PiiTypes.ADDRESS])
        for text in non_matching:
            with self.subTest(text=text):
                self.assertEqual(self.parser.scan(text), [])
Example #7
0
def scan_file_object(fd: TextIO) -> List[Any]:
    """Scan an already-open text stream for PII.

    The stream is wrapped in an ``IO`` scanner labelled "api file object" and
    scanned with a tokenizer, a regex scanner and a NER scanner.

    Args:
        fd: A file object open in text mode.

    Returns:
        The value of the scanner's ``get_pii_types()``: a list of the PII
        types found in the file.
    """
    io_scanner = IO("api file object", fd)

    # Build the scanning context entry by entry, in the same order as before.
    scan_context = {}
    scan_context["tokenizer"] = Tokenizer()
    scan_context["regex"] = RegexScanner()
    scan_context["ner"] = NERScanner()

    io_scanner.scan(scan_context)
    found_types = io_scanner.get_pii_types()
    return found_types
Example #8
0
 def test_null_scan_column(self):
     """Scanning ``None`` must leave the column without any PII types."""
     column = Column('col')
     scanners = [RegexScanner(), NERScanner()]
     column.scan(None, scanners)
     self.assertFalse(column.has_pii())

     # Expected serialized form of a column with no PII detected.
     expected = {'pii_types': [], 'name': 'col'}
     self.assertEqual(expected, column.get_dict())
Example #9
0
 def setUp(self):
     """Create a fresh ``RegexScanner`` and store it as ``self.parser``."""
     regex_scanner = RegexScanner()
     self.parser = regex_scanner
 def test_null_scan_column(self):
     """Scanning ``None`` must leave the column without any PII types."""
     column = Column("col")
     scanners = [RegexScanner(), NERScanner()]
     column.scan(None, scanners)
     self.assertFalse(column.has_pii())

     # Expected serialized form of a column with no PII detected.
     expected = {"pii_types": [], "name": "col"}
     self.assertEqual(expected, column.get_dict())
Example #11
0
    def scan(self, context):
        """Run the regex and NER scanners on ``context`` and record the PII found.

        Args:
            context: The value handed unchanged to each scanner's ``scan``
                method; every item a scanner returns is added to
                ``self._pii``.
        """
        # Both scanners are created up front, then applied in order.
        for scanner in [RegexScanner(), NERScanner()]:
            # Plain loop: adding to self._pii is a side effect, so there is
            # no reason to build a list of the return values.
            for pii in scanner.scan(context):
                self._pii.add(pii)

        logging.debug(self._pii)