def test_vocab_token_counts_with_max_lines(self):
    """Token counts honor the max_lines cap (5 here), so later vocab entries are excluded."""
    # vocab-1 has 2 lines, vocab-2 has 3
    observed = tokenizer.vocab_token_counts(self.vocab_path, 5)

    want = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
    }
    self.assertDictEqual(want, observed)
    def test_vocab_token_counts(self):
        """With max_lines=0, every vocab entry appears in the counts (0 presumably means no limit)."""
        observed = tokenizer.vocab_token_counts(self.vocab_path, 0)

        want = {
            u"jiggery-pokery": 14,
            u"lollipop": 8,
            u"reverberated": 12,
            u"kattywampus": 11,
            u"balderdash": 10,
        }
        self.assertDictEqual(want, observed)
  def test_vocab_token_counts_with_max_lines(self):
    """Counting with max_lines=5 must drop entries beyond the line cap."""
    # vocab-1 has 2 lines, vocab-2 has 3
    got = tokenizer.vocab_token_counts(self.vocab_path, 5)
    expected_counts = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
    }
    self.assertDictEqual(expected_counts, got)
  def test_vocab_token_counts(self):
    """All five vocab entries are counted when max_lines is 0."""
    got = tokenizer.vocab_token_counts(self.vocab_path, 0)
    expected_counts = {
        u"balderdash": 10,
        u"jiggery-pokery": 14,
        u"kattywampus": 11,
        u"lollipop": 8,
        u"reverberated": 12,
    }
    self.assertDictEqual(expected_counts, got)
Example #5
0
def main(unused_argv):
  """Build a SubwordTextEncoder vocabulary and write it to a file.

  Token counts come from exactly one source: either a corpus
  (--corpus_filepattern) or an existing vocab (--vocab_filepattern).
  Supplying both, or neither, is an error.

  Raises:
    ValueError: if both or neither of the two filepattern flags are set.
  """
  has_corpus = bool(FLAGS.corpus_filepattern)
  has_vocab = bool(FLAGS.vocab_filepattern)

  # Guard clauses: exactly one source flag must be set.
  if has_corpus and has_vocab:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  if not (has_corpus or has_vocab):
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  if has_corpus:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  else:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
def main(unused_argv):
  """Entry point: derive token counts from one input source, then build
  a SubwordTextEncoder and store it at FLAGS.output_filename.

  Raises:
    ValueError: unless exactly one of --corpus_filepattern or
      --vocab_filepattern is provided.
  """
  corpus = FLAGS.corpus_filepattern
  vocab = FLAGS.vocab_filepattern

  if corpus and vocab:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif corpus:
    token_counts = tokenizer.corpus_token_counts(
        corpus,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif vocab:
    token_counts = tokenizer.vocab_token_counts(vocab, FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  subword = text_encoder.SubwordTextEncoder()
  subword.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  subword.store_to_file(FLAGS.output_filename)