def test_corpus_token_counts_split_with_max_lines(self):
  """Newline-split counting capped at 5 lines covers only the file's head.

  With corpus_max_lines=5 the counter should have seen the early token
  "slept" but stopped before reaching the later token "Mitch".
  """
  counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=True)
  self.assertIn(u"slept", counts)
  self.assertNotIn(u"Mitch", counts)
def test_corpus_token_counts_no_split_with_max_lines(self):
  """Counting without newline splitting, capped at 5 lines.

  Only the head of the corpus is read ("slept" present, "Mitch" absent),
  and because split_on_newlines=False the newline characters themselves
  survive inside tokens with the expected multiplicities.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=5, split_on_newlines=False)
  self.assertIn(u"slept", token_counts)
  self.assertNotIn(u"Mitch", token_counts)
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  expected_subset = {u".\n\n": 1, u"\n": 2, u".\n": 1}
  for token, count in expected_subset.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))
def test_corpus_token_counts_split_on_newlines(self):
  """Counting the full corpus with newline splitting.

  With corpus_max_lines=0 (no cap) every token in the corpus is counted,
  and splitting on newlines means no token may contain a newline char.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=True)
  expected = {
      u"'": 2,
      u".": 2,
      u". ": 1,
      u"... ": 1,
      u"Groucho": 1,
      u"Marx": 1,
      u"Mitch": 1,
      u"Hedberg": 1,
      u"I": 3,
      u"in": 2,
      u"my": 2,
      u"pajamas": 2,
  }
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  for token, count in expected.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))
  # Newlines were split on, so no newline-bearing tokens survive.
  self.assertNotIn(u".\n\n", token_counts)
  self.assertNotIn(u"\n", token_counts)
def main(unused_argv):
  """Build a SubwordTextEncoder vocabulary and write it to a file.

  Token counts come from exactly one source: either a text corpus
  (--corpus_filepattern) or an existing vocab file (--vocab_filepattern).
  Supplying both, or neither, is an error.
  """
  have_corpus = bool(FLAGS.corpus_filepattern)
  have_vocab = bool(FLAGS.vocab_filepattern)

  # Guard clauses: exactly one input source must be given.
  if have_corpus and have_vocab:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern'
    )
  if not have_corpus and not have_vocab:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  if have_corpus:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  else:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
def test_corpus_token_counts_no_split_on_newlines(self):
  """Counting the full corpus without newline splitting.

  With no line cap and split_on_newlines=False, newline characters stay
  inside tokens; check their full-corpus multiplicities.
  """
  token_counts = tokenizer.corpus_token_counts(
      self.corpus_path, corpus_max_lines=0, split_on_newlines=False)
  # assertDictContainsSubset is deprecated (removed in Python 3.12);
  # assert each expected (token, count) pair explicitly instead.
  expected_subset = {u".\n\n": 2, u"\n": 3}
  for token, count in expected_subset.items():
    self.assertEqual(token_counts.get(token), count, msg=repr(token))