Example #1
0
  def test_count_and_gen_subtokens(self):
    """Check the subtoken counts generated for a single token.

    Passes one token ("abc" with count 5) to
    tokenizer._count_and_gen_subtokens and verifies that the result is a
    collections.defaultdict equal to the expected subtoken -> count mapping.
    """
    chars = "abc_"
    # Every expected subtoken carries the count of the only input token.
    # NOTE(review): the expected keys end in "_" and some are longer than the
    # max subtoken length of 2 -- presumably the token gets a "_" suffix and
    # the length limit does not cap generated substrings; confirm against the
    # implementation.
    expected = dict.fromkeys(
        ["a", "b", "c", "_", "ab", "bc", "c_", "abc", "bc_", "abc_"], 5)

    result = tokenizer._count_and_gen_subtokens(
        {"abc": 5},                                 # token_counts
        set(chars),                                 # alphabet
        {ch: idx for idx, ch in enumerate(chars)},  # subtoken_dict
        2)                                          # max_subtoken_length

    self.assertIsInstance(result, collections.defaultdict)
    self.assertDictEqual(expected, result)
Example #2
0
  def test_count_and_gen_subtokens(self):
    """Check the subtoken counts generated for a single token.

    Passes one token ("abc" with count 5) to
    tokenizer._count_and_gen_subtokens and verifies that the result is a
    collections.defaultdict equal to the expected subtoken -> count mapping.
    """
    chars = "abc_"
    # Every expected subtoken carries the count of the only input token.
    # NOTE(review): the expected keys end in "_" and some are longer than the
    # max subtoken length of 2 -- presumably the token gets a "_" suffix and
    # the length limit does not cap generated substrings; confirm against the
    # implementation.
    expected = dict.fromkeys(
        ["a", "b", "c", "_", "ab", "bc", "c_", "abc", "bc_", "abc_"], 5)

    result = tokenizer._count_and_gen_subtokens(
        {"abc": 5},                                 # token_counts
        set(chars),                                 # alphabet
        {ch: idx for idx, ch in enumerate(chars)},  # subtoken_dict
        2)                                          # max_subtoken_length

    self.assertIsInstance(result, collections.defaultdict)
    self.assertDictEqual(expected, result)