def scan(self):
    """Collect the files under ``self._path`` and scan each of them.

    If ``self._path`` is a regular file, that single file is recorded.
    Otherwise the path is walked with ``os.walk`` and every file found is
    recorded. Each file is wrapped in a ``File`` together with the MIME
    type reported by ``magic.from_file`` and appended to ``self._files``.

    Once collection is finished, a context dict holding one ``Tokenizer``,
    one ``RegexScanner`` and one ``NERScanner`` is built and handed to the
    ``scan`` method of every entry in ``self._files``.
    """
    # Arguments are passed to logging instead of being pre-formatted with
    # ``%`` so the message string is only built when DEBUG is enabled.
    logging.debug("Scanning %s", self._path)

    if os.path.isfile(self._path):
        mime_type = magic.from_file(self._path, mime=True)
        self._files.append(File(self._path, mime_type))
        # The log line shows the absolute path; the File keeps the path
        # exactly as it was given.
        logging.debug(
            "\t- full path: %s, mime_type: %s",
            os.path.abspath(self._path),
            mime_type,
        )
    else:
        for root, _subdirs, files in os.walk(self._path):
            for filename in files:
                file_path = os.path.join(root, filename)
                mime_type = magic.from_file(file_path, mime=True)
                logging.debug(
                    "\t- full path: %s, mime_type: %s", file_path, mime_type
                )
                self._files.append(File(file_path, mime_type))

    # The scanners are created once and shared by every file through the
    # context dict.
    context = {
        "tokenizer": Tokenizer(),
        "regex": RegexScanner(),
        "ner": NERScanner(),
    }
    for f in self._files:
        f.scan(context)
def test_tokenization(self):
    """Check that a four-word sentence is split into four tokens."""
    sentence = "Jonathan is in Bangalore"
    result = Tokenizer().tokenize(sentence)
    self.assertEqual(4, len(result))