def test_corpus_token_counts_split_with_max_lines(self):
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=True)
  self.assertIn(u"slept", token_counts)
  self.assertNotIn(u"Mitch", token_counts)
def test_corpus_token_counts_no_split_with_max_lines(self):
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=False)
  self.assertIn(u"slept", token_counts)
  self.assertNotIn(u"Mitch", token_counts)
  self.assertDictContainsSubset({u".\n\n": 1, u"\n": 2, u".\n": 1},
                                token_counts)
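# The tests in this section all read from self.corpus_path, but the fixture
# itself is not shown. Below is a minimal setUp sketch consistent with the
# assertions; the exact quote text and line layout are assumptions inferred
# from the expected counts ("slept" inside the first five lines, "Mitch"
# after them, two ".\n\n" paragraph breaks in the full file). The import
# path for tokenizer is also assumed.
import io
import os
import tempfile

import tensorflow as tf

from tensor2tensor.data_generators import tokenizer


class TokenCountsTest(tf.test.TestCase):

  def setUp(self):
    super(TokenCountsTest, self).setUp()
    # Write a small corpus file whose token counts match the assertions
    # in the surrounding tests.
    self.corpus_path = os.path.join(tempfile.mkdtemp(), "corpus.txt")
    with io.open(self.corpus_path, "w", encoding="utf-8") as f:
      f.write(u"One morning I shot an elephant in my pajamas. How he got "
              u"in my pajamas, I don't\nknow.\n\nGroucho Marx\n"
              u"I haven't slept for 10 days... because that would be too "
              u"long.\n\nMitch Hedberg\n")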
def main(unused_argv):
  # Validate the required flag before doing any work.
  if not FLAGS.corpus_filepattern:
    raise ValueError('Must provide --corpus_filepattern')
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      FLAGS.corpus_filepattern,
      FLAGS.corpus_max_lines,
      split_on_newlines=FLAGS.split_on_newlines)
  gs.build_from_token_counts(token_counts, FLAGS.min_count,
                             FLAGS.num_iterations)
  gs.store_to_file(FLAGS.output_fn)
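# main() above references module-level FLAGS that this section does not
# define. A sketch of definitions matching the names it uses follows; the
# default values and help strings here are assumptions, not the originals.
import tensorflow as tf

tf.flags.DEFINE_string('output_fn', '/tmp/my.subword_text_encoder',
                       'Where to write the SubwordTextEncoder vocabulary.')
tf.flags.DEFINE_string('corpus_filepattern', '',
                       'Corpus of one or more text files.')
tf.flags.DEFINE_integer('corpus_max_lines', 10000,
                        'How many lines of the corpus to read.')
tf.flags.DEFINE_integer('num_iterations', 4,
                        'Number of refinement iterations.')
tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count.')
tf.flags.DEFINE_bool('split_on_newlines', True,
                     'Break the corpus into documents on newlines.')

FLAGS = tf.flags.FLAGS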
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  # Reuse an existing vocabulary file if one is already on disk.
  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  print('Vocab file written to: ' + vocab_filepath)
  return gs
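# Hypothetical call site for _get_or_generate_vocab: the directory, filename,
# and target size below are illustrative, not values from the original code.
encoder = _get_or_generate_vocab(
    tmp_dir='/tmp/t2t_datagen',
    vocab_filename='tokens.vocab.8192',
    vocab_size=8192)
print('Vocabulary size: %d' % encoder.vocab_size)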
def test_corpus_token_counts_split_on_newlines(self):
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=True)
  expected = {
      u"'": 2,
      u".": 2,
      u". ": 1,
      u"... ": 1,
      u"Groucho": 1,
      u"Marx": 1,
      u"Mitch": 1,
      u"Hedberg": 1,
      u"I": 3,
      u"in": 2,
      u"my": 2,
      u"pajamas": 2,
  }
  self.assertDictContainsSubset(expected, token_counts)
  self.assertNotIn(u".\n\n", token_counts)
  self.assertNotIn(u"\n", token_counts)
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')
  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
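# Relative to the earlier main(), this version also reads token counts from
# --vocab_filepattern and writes to --output_filename, so it presumably adds
# flag definitions along these lines (defaults and help strings assumed),
# plus the usual tf.app.run() entry point.
tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
                       'Where to write the SubwordTextEncoder vocabulary.')
tf.flags.DEFINE_string('vocab_filepattern', '',
                       'Vocabulary files from which to read token counts.')

if __name__ == '__main__':
  tf.app.run()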
def test_corpus_token_counts_no_split_on_newlines(self):
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=False)
  self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts)
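# These test methods belong in a tf.test.TestCase subclass such as the
# TokenCountsTest sketched earlier, with the standard test entry point:
if __name__ == '__main__':
  tf.test.main()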