def test_gpt2_bpe_tokenize(self):
    """
    GPT-2 byte-level BPE tokenization should round-trip text with an emoji.

    Tokenizes a probe string, checks the exact BPE pieces, then decodes the
    corresponding token indices back and checks the original text is recovered.
    """
    with testing_utils.capture_output():
        opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': './data'})
        agent = DictionaryAgent(opt)
        # Expected byte-level BPE pieces. NOTE(review): the \xc4\xa0 prefix
        # presumably is GPT-2's byte-encoding of a leading space, and the last
        # two pieces the UTF-8 bytes of the emoji — confirm against the
        # tokenizer's byte-encoder table. Extracted to one local so the
        # expected value is not duplicated in both assertions.
        expected_tokens = [
            'Hello',
            ',',
            r'\xc4\xa0Par',
            'l',
            'AI',
            '!',
            r'\xc4\xa0\xc3\xb0\xc5\x81\xc4\xba',
            r'\xc4\xa2',
        ]
        # grinning face emoji
        text = u'Hello, ParlAI! \U0001f600'
        self.assertEqual(agent.gpt2_tokenize(text), expected_tokens)
        # Round trip: vec2txt consumes token indices, so map each token
        # string through the dictionary before decoding.
        self.assertEqual(
            agent.vec2txt(agent.tok2ind[w] for w in expected_tokens),
            text,
        )
def test_gpt2_bpe_tokenize(self):
    """
    GPT-2 byte-level BPE tokenization should round-trip text with an emoji.

    Tokenizes a probe string, checks the exact BPE pieces, then decodes the
    corresponding token indices back and checks the original text is recovered.
    """
    opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': './data'})
    agent = DictionaryAgent(opt)
    # Expected byte-level BPE pieces. NOTE(review): the \xc4\xa0 prefix
    # presumably is GPT-2's byte-encoding of a leading space — confirm
    # against the tokenizer's byte-encoder table.
    expected_tokens = [
        'Hello',
        ',',
        r'\xc4\xa0Par',
        'l',
        'AI',
        '!',
        r'\xc4\xa0\xc3\xb0\xc5\x81\xc4\xba',
        r'\xc4\xa2',
    ]
    # grinning face emoji (same code point as the literal in the original)
    text = u'Hello, ParlAI! \U0001f600'
    self.assertEqual(agent.gpt2_tokenize(text), expected_tokens)
    # BUG FIX: the original passed the raw token *strings* to vec2txt, but
    # vec2txt consumes token *indices* — map through tok2ind first, matching
    # the sibling variants of this test.
    self.assertEqual(
        agent.vec2txt(agent.tok2ind[w] for w in expected_tokens),
        text,
    )
def test_gpt2_bpe_tokenize(self):
    """
    Check GPT-2 byte-level BPE tokenization round-trips a string with an emoji.

    The expected token pieces live in the shared ``GPT2_BPE_RESULT`` constant.
    """
    dict_opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': './data'})
    dictionary = DictionaryAgent(dict_opt)
    # grinning face emoji
    probe = u'Hello, ParlAI! \U0001f600'
    # Forward pass: text -> BPE token strings.
    self.assertEqual(dictionary.gpt2_tokenize(probe), GPT2_BPE_RESULT)
    # Reverse pass: token indices -> original text.
    token_ids = (dictionary.tok2ind[tok] for tok in GPT2_BPE_RESULT)
    self.assertEqual(dictionary.vec2txt(token_ids), probe)
def test_gpt2_bpe_tokenize(self):
    """
    Check GPT-2 byte-level BPE tokenization round-trips a string with an emoji.

    Resolves the configured datapath via ``ParlaiParser`` rather than
    hard-coding one; expected pieces live in ``GPT2_BPE_RESULT``.
    """
    datapath = ParlaiParser().parse_args([], print_args=False)['datapath']
    dict_opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': datapath})
    dictionary = DictionaryAgent(dict_opt)
    # grinning face emoji
    probe = u'Hello, ParlAI! \U0001f600'
    # Forward pass: text -> BPE token strings.
    self.assertEqual(dictionary.gpt2_tokenize(probe), GPT2_BPE_RESULT)
    # Reverse pass: token indices -> original text.
    token_ids = [dictionary.tok2ind[tok] for tok in GPT2_BPE_RESULT]
    self.assertEqual(dictionary.vec2txt(token_ids), probe)