コード例 #1
0
ファイル: test_dict.py プロジェクト: bonbert81/ParlAI
 def test_byte_level_bpe_tokenize(self):
     """
     Test the byte-level BPE tokenizer exposed through DictionaryAgent.

     The agent is built with ``bpe_add_prefix_space=False``. The test checks
     that tokenizing, ``vec2txt`` and ``txt2vec`` agree with
     ``BYTELEVEL_BPE_RESULT``, that the same holds after the dictionary is
     saved to and reloaded from a temporary directory (with an unchanged
     vocabulary size), and that the four special tokens map to ids 0-3.
     """
     parser = ParlaiParser()
     parser.set_params(
         dict_tokenizer='bytelevelbpe',
         bpe_vocab=DEFAULT_BYTELEVEL_BPE_VOCAB,
         bpe_merge=DEFAULT_BYTELEVEL_BPE_MERGE,
         bpe_add_prefix_space=False,
     )
     opt = parser.parse_args([], print_args=False)
     agent = DictionaryAgent(opt)

     # Sample sentence ending with the grinning face emoji (U+1F600).
     text = u'Hello, ParlAI! \U0001f600'

     def check_round_trip():
         # Look the ids up at call time so the check reflects the agent's
         # current state (it is run again after save/load).
         expected_ids = [agent.tok2ind[w] for w in BYTELEVEL_BPE_RESULT]
         self.assertEqual(
             agent.bytelevelbpe_tokenize(text), BYTELEVEL_BPE_RESULT
         )
         self.assertEqual(agent.vec2txt(expected_ids), text)
         self.assertEqual(agent.txt2vec(text), expected_ids)

     check_round_trip()

     # Test loading / saving: the vocabulary size and the tokenization
     # must both survive a save followed by a load from the same path.
     vocab_size = agent.byte_level_bpe.tokenizer.get_vocab_size()
     with testing_utils.tempdir() as tmpdir:
         path = os.path.join(tmpdir, 'dict-checkpoint')
         agent.save(filename=path)
         agent.load(filename=path)
     self.assertEqual(
         vocab_size, agent.byte_level_bpe.tokenizer.get_vocab_size()
     )
     check_round_trip()

     # Test special token ids are mapped correctly: the four special
     # tokens are expected at ids 0-3 of the ParlAI dict.
     # NOTE(review): the original comment suggests Hugging Face places them
     # at the end of its own vocabulary instead -- confirm against the
     # tokenizer implementation.
     # assertEqual is used rather than bare ``assert`` so the checks are
     # not stripped under ``python -O`` and report both values on failure.
     self.assertEqual(agent.txt2vec("__null__"), [0])
     self.assertEqual(agent.txt2vec("__start__"), [1])
     self.assertEqual(agent.txt2vec("__end__"), [2])
     self.assertEqual(agent.txt2vec("__unk__"), [3])
コード例 #2
0
ファイル: test_dict.py プロジェクト: Taekyung2/MichinAI
 def test_tokenize_prefix_space(self):
     """
     Test byte-level BPE tokenization when a leading 'Ġ' token is produced.

     Only the tokenizer type, vocab and merge files are configured here.
     The expected token sequence is ``BYTELEVEL_BPE_RESULT`` preceded by a
     single 'Ġ' token; decoding those ids must still give back the
     original sentence.
     """
     arg_parser = ParlaiParser()
     arg_parser.set_params(
         dict_tokenizer='bytelevelbpe',
         bpe_vocab=DEFAULT_BYTELEVEL_BPE_VOCAB,
         bpe_merge=DEFAULT_BYTELEVEL_BPE_MERGE,
     )
     dictionary = DictionaryAgent(arg_parser.parse_args([]))

     # Sample sentence ending with the grinning face emoji.
     sample = u'Hello, ParlAI! \U0001f600'
     prefixed_tokens = ['Ġ'] + BYTELEVEL_BPE_RESULT

     # Text -> tokens.
     self.assertEqual(
         dictionary.bytelevelbpe_tokenize(sample), prefixed_tokens
     )

     # Token ids -> text.
     decoded = dictionary.vec2txt(
         [dictionary.tok2ind[tok] for tok in prefixed_tokens]
     )
     self.assertEqual(decoded, sample)

     # Text -> token ids.
     encoded = dictionary.txt2vec(sample)
     self.assertEqual(
         encoded, [dictionary.tok2ind[tok] for tok in prefixed_tokens]
     )