def test_corpus_token_counts_split_with_max_lines(self):
  """Newline-split counting capped at 5 lines covers only the file's head.

  With corpus_max_lines=5 the counter should have seen the early token
  "slept" but stopped before reaching the later token "Mitch".
  """
  counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=True)
  self.assertIn(u"slept", counts)
  self.assertNotIn(u"Mitch", counts)
def test_corpus_token_counts_no_split_with_max_lines(self):
  """Counting without newline splitting, capped at 5 lines.

  Only the head of the corpus is read ("slept" present, "Mitch" absent),
  and because split_on_newlines=False the newline characters themselves
  survive inside tokens with the expected multiplicities.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=False)
  self.assertIn(u"slept", token_counts)
  self.assertNotIn(u"Mitch", token_counts)
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  expected_subset = {u".\n\n": 1, u"\n": 2, u".\n": 1}
  for token, count in expected_subset.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))
def test_corpus_token_counts_split_on_newlines(self):
  """Counting the full corpus with newline splitting.

  With corpus_max_lines=0 (no cap) every token in the corpus is counted,
  and splitting on newlines means no token may contain a newline char.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=True)
  expected = {
      u"'": 2,
      u".": 2,
      u". ": 1,
      u"... ": 1,
      u"Groucho": 1,
      u"Marx": 1,
      u"Mitch": 1,
      u"Hedberg": 1,
      u"I": 3,
      u"in": 2,
      u"my": 2,
      u"pajamas": 2,
  }
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  for token, count in expected.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))
  # Newlines were split on, so no newline-bearing tokens survive.
  self.assertNotIn(u".\n\n", token_counts)
  self.assertNotIn(u"\n", token_counts)
def main(unused_argv):
  """Build a SubwordTextEncoder vocabulary and write it to a file.

  Token counts come from exactly one source: either a text corpus
  (--corpus_filepattern) or an existing vocab file (--vocab_filepattern).
  Supplying both, or neither, is an error.
  """
  have_corpus = bool(FLAGS.corpus_filepattern)
  have_vocab = bool(FLAGS.vocab_filepattern)

  # Guard clauses: exactly one input source must be given.
  if have_corpus and have_vocab:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern'
    )
  if not have_corpus and not have_vocab:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  if have_corpus:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  else:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
def test_corpus_token_counts_no_split_on_newlines(self):
  """Counting the full corpus without newline splitting.

  With no line cap and split_on_newlines=False, newline characters stay
  inside tokens; check their full-corpus multiplicities.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=False)
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  expected_subset = {u".\n\n": 2, u"\n": 3}
  for token, count in expected_subset.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))