Beispiel #1
0
    def scan(self):
        """Collect the file(s) found at ``self._path`` and scan each of them.

        If ``self._path`` is a regular file, only that file is registered.
        Otherwise the path is walked recursively with ``os.walk`` and every
        file found is registered. Each registered file is wrapped in a
        ``File`` together with the MIME type reported by
        ``magic.from_file(..., mime=True)`` and appended to ``self._files``.

        Afterwards ``scan(context)`` is called on every entry of
        ``self._files`` (including entries that were already in the list
        before this call), passing one shared context dict holding a
        ``Tokenizer``, a ``RegexScanner`` and a ``NERScanner``.
        """
        # Pass arguments to logging instead of pre-formatting with ``%`` so
        # the message is only built when DEBUG logging is actually enabled.
        logging.debug("Scanning %s", self._path)
        if os.path.isfile(self._path):
            mime_type = magic.from_file(self._path, mime=True)
            self._files.append(File(self._path, mime_type))
            logging.debug(
                "\t- full path: %s, mime_type: %s",
                os.path.abspath(self._path),
                mime_type,
            )
        else:
            # NOTE: os.walk ignores listing errors by default, so a path that
            # does not exist simply contributes no files here.
            for root, _subdirs, files in os.walk(self._path):
                for filename in files:
                    file_path = os.path.join(root, filename)
                    mime_type = magic.from_file(file_path, mime=True)

                    logging.debug(
                        "\t- full path: %s, mime_type: %s", file_path, mime_type
                    )
                    self._files.append(File(file_path, mime_type))

        # The scanners are built once and shared by all files in this run.
        context = {
            "tokenizer": Tokenizer(),
            "regex": RegexScanner(),
            "ner": NERScanner(),
        }
        for f in self._files:
            f.scan(context)
Beispiel #2
0
 def test_tokenization(self):
     """Check that a four-word sentence is tokenized into four tokens."""
     tokenizer = Tokenizer()
     sentence_tokens = tokenizer.tokenize("Jonathan is in Bangalore")
     token_count = len(sentence_tokens)
     self.assertEqual(4, token_count)