def test_generate_subtokens(self): token_counts = {"ab": 1, "bc": 3, "abc": 5} alphabet = set("abc_") min_count = 100 num_iterations = 1 reserved_tokens = ["reserved", "tokens"] vocab_list = tokenizer._generate_subtokens( token_counts, alphabet, min_count, num_iterations, reserved_tokens) # Check that reserved tokens are at the front of the list self.assertEqual(vocab_list[:2], reserved_tokens) # Check that each character in alphabet is in the vocab list for c in alphabet: self.assertIn(c, vocab_list)