import logging
import multiprocessing
import time

# Tokenizer, SentenceSplitter, utils and arguments() are provided by the
# surrounding package (not shown in this excerpt).


def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    # XML mode is implied either by the explicit flag or by a list of
    # sentence-ending tags.
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info, args.language)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info,
                                         args.language)
    if is_xml:
        if args.parallel > 1:
            logging.warning("Parallel tokenization of XML files is currently not supported.")
        eos_tags = args.tag
        if eos_tags is None:
            # Default set of tags that delimit sentences in XML input.
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        if args.parallel > 1:
            # Tokenize paragraphs in parallel; chunks of 250 paragraphs per
            # worker keep the inter-process overhead low.
            pool = multiprocessing.Pool(min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        # Drop paragraphs that tokenized to nothing.
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split, tokenized_paragraphs)
            # Flatten the per-paragraph lists of sentences into a stream of
            # sentences.
            tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        if is_xml:
            # XML markup carries no token class; keep only the token text in
            # that case so the tab-join below does not fail on None.
            tokenized_paragraphs = ([(tok[0],) if tok[1] is None else tok for tok in tp]
                                    for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp] for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        # One token per line, with a blank line between paragraphs/sentences.
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %.2f seconds (%d tokens/s)",
                 n_tokens, t1 - t0, n_tokens / (t1 - t0))
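
# For reference, a minimal sketch of what the arguments() helper could look
# like, inferred from the attributes main() accesses (args.xml, args.tag,
# args.parallel, ...).  The flag names, defaults and help texts here are
# assumptions for illustration, not the package's actual CLI definition.
import argparse


def arguments_sketch():
    """Hypothetical argument parser mirroring the options used in main()."""
    parser = argparse.ArgumentParser(description="Tokenize a text or XML file.")
    parser.add_argument("-x", "--xml", action="store_true",
                        help="Treat the input as XML (assumed flag name).")
    parser.add_argument("--tag", nargs="+",
                        help="XML tags that delimit sentences (assumed flag name).")
    parser.add_argument("--split-camel-case", dest="split_camel_case",
                        action="store_true")
    parser.add_argument("--token-classes", dest="token_classes",
                        action="store_true")
    parser.add_argument("--extra-info", dest="extra_info", action="store_true")
    parser.add_argument("--language", default="de")
    parser.add_argument("--parallel", type=int, default=1)
    parser.add_argument("--paragraph-separator", dest="paragraph_separator",
                        choices=["empty_lines", "single_newlines"],
                        default="empty_lines")
    parser.add_argument("--split-sentences", dest="split_sentences",
                        action="store_true")
    parser.add_argument("FILE", type=argparse.FileType("r", encoding="utf-8"))
    return parser.parse_args()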
import unittest

# Tokenizer is imported from the package under test (not shown in this
# excerpt).


class TestTokenizer(unittest.TestCase):
    """Test cases for the Tokenizer class."""

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """Tokenize raw and compare with the expected whitespace-separated tokens."""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """Tokenize a raw XML string and compare with the expected tokens."""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False),
                         tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """Assert that the tokenizer does NOT yet produce the desired output.

        A failure of such a test means the tokenizer has improved.
        """
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
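
# A hypothetical example of how the helpers above would be used in concrete
# test methods; the inputs and expected tokenizations are illustrative, not
# taken from the real test suite.
class TestTokenizerExamples(TestTokenizer):
    def test_sentence_final_period(self):
        # The sentence-final period is split off as its own token.
        self._equal("Das ist ein Test.", "Das ist ein Test .")

    def test_xml_markup(self):
        # XML tags are kept as separate tokens around the tokenized text.
        self._equal_xml("<p>Hallo Welt!</p>", "<p> Hallo Welt ! </p>")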