def test_gpt2_word_piece_encoder(self):
    # Mainly checks that the encoder runs end to end.
    weight_path = 'test/data_for_tests/embedding/small_gpt2'
    ds = DataSet({'words': ["this is a test sentence".split()]})
    embed = GPT2WordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1)
    embed.index_datasets(ds, field_name='words')
    self.assertTrue(ds.has_field('word_pieces'))
    result = embed(torch.LongTensor([[1, 2, 3, 4]]))

    # Same check with the language-model head enabled.
    embed = GPT2WordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1,
                                 language_model=True)
    embed.index_datasets(ds, field_name='words')
    self.assertTrue(ds.has_field('word_pieces'))
    result = embed(torch.LongTensor([[1, 2, 3, 4]]))
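
def test_gpt2_word_piece_encoder_output_shape(self):
    # A minimal sketch, not part of the original suite: under the same toy
    # checkpoint, the encoder should return one vector per word piece,
    # shaped (batch_size, seq_len, embed_dim). Only the shape is asserted,
    # since the toy model's hidden size is not assumed here.
    weight_path = 'test/data_for_tests/embedding/small_gpt2'
    ds = DataSet({'words': ["this is a test sentence".split()]})
    embed = GPT2WordPieceEncoder(model_dir_or_name=weight_path)
    embed.index_datasets(ds, field_name='words')
    pieces = torch.LongTensor(ds['word_pieces'].get([0]))
    out = embed(pieces)
    self.assertEqual(out.dim(), 3)
    self.assertEqual(out.size()[:2], pieces.size())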
def test_gpt2_embed_eq_gpt2_piece_encoder(self):
    # Checks that GPT2Embedding and GPT2WordPieceEncoder produce consistent
    # results.
    weight_path = 'test/data_for_tests/embedding/small_gpt2'
    ds = DataSet({'words': ["this is a texta a sentence".split(),
                            'this is'.split()]})
    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path)
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, pool_method='first')
    embed.eval()
    words_res = embed(words)

    # Check that the word-piece alignment works: 'texta' is split into two
    # pieces, so word-piece positions after it are shifted by one relative
    # to the pooled per-word output.
    self.assertEqual((word_pieces_res[0, :4] - words_res[0, :4]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 5:] - words_res[0, 4:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :2] - words_res[1, :2]).sum(), 0)
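
def test_gpt2_word_piece_lengths(self):
    # A hedged sketch motivating the shifted slicing above (assumption:
    # 'texta' is not a single token in the toy vocabulary, so it splits into
    # more than one word piece; an end-of-text token appended by default
    # would also lengthen the sequence). The word-piece sequence should
    # therefore come out longer than the word sequence.
    weight_path = 'test/data_for_tests/embedding/small_gpt2'
    ds = DataSet({'words': ["this is a texta a sentence".split()]})
    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path)
    encoder.index_datasets(ds, field_name='words')
    self.assertGreater(len(ds[0]['word_pieces']), len(ds[0]['words']))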
def test_generate(self):
    # weight_path = 'test/data_for_tests/embedding/small_gpt2'
    weight_path = 'en'
    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path, language_model=True)

    # Check that the various generation settings work.
    print(encoder.generate_from_str('This', max_len=20, do_sample=False, num_beams=1,
                                    temperature=1, top_k=50, top_p=1.0,
                                    repetition_penalty=1.0, length_penalty=1.0))
    print(encoder.generate_from_str('This day', max_len=20, do_sample=False, num_beams=1,
                                    temperature=1, top_k=50, top_p=1.0,
                                    repetition_penalty=1.0, length_penalty=1.0))
    print(encoder.generate_from_str('This', max_len=20, do_sample=True, num_beams=3,
                                    temperature=1, top_k=50, top_p=1.0,
                                    repetition_penalty=1.0, length_penalty=1.0))
    print(encoder.generate_from_str('This', max_len=20, do_sample=True, num_beams=3,
                                    temperature=2, top_k=20, top_p=2.0,
                                    repetition_penalty=2.0, length_penalty=1.5))
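
def test_generate_greedy_is_deterministic(self):
    # A hedged sketch, assuming the same 'en' checkpoint as above: with
    # do_sample=False and num_beams=1 the decoding is greedy, so repeated
    # calls on the same prefix should agree once the model is in eval mode.
    weight_path = 'en'
    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path, language_model=True)
    encoder.eval()
    kwargs = dict(max_len=10, do_sample=False, num_beams=1, temperature=1,
                  top_k=50, top_p=1.0, repetition_penalty=1.0, length_penalty=1.0)
    self.assertEqual(encoder.generate_from_str('This', **kwargs),
                     encoder.generate_from_str('This', **kwargs))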
def test_eq_transformers(self):
    # Checks that the encoder reproduces the output of the transformers
    # library.
    weight_path = ''  # set to a GPT-2 checkpoint before running
    ds = DataSet({'words': ["this this this a is texta model vocab".split(),
                            'this is'.split()]})

    import transformers
    input1 = ' '.join(ds[0]['words'])
    input2 = ' '.join(ds[1]['words'])
    tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight_path)
    idx_list1 = tokenizer.encode(input1)
    idx_list2 = tokenizer.encode(input2)
    pad_value = tokenizer.encode('<|endoftext|>')[0]
    tensor = torch.nn.utils.rnn.pad_sequence(
        [torch.LongTensor(idx_list1), torch.LongTensor(idx_list2)],
        batch_first=True, padding_value=pad_value)
    gpt2 = transformers.GPT2Model.from_pretrained(weight_path,
                                                  output_hidden_states=True)
    gpt2.eval()
    output, _, trans_hidden_states = gpt2(tensor,
                                          attention_mask=tensor.ne(pad_value))

    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path,
                                   layers=list(range(13)))
    encoder.eval()
    encoder.index_datasets(ds, field_name='words', add_endoftext=False)
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    self.assertEqual(idx_list1, ds[0]['word_pieces'])
    self.assertEqual(idx_list2, ds[1]['word_pieces'])
    word_pieces_res = encoder(word_pieces)
    self.assertEqual(
        (torch.cat(trans_hidden_states, dim=-1) - word_pieces_res).sum(), 0)
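
def test_eq_transformers_last_layer(self):
    # A hedged sketch of the same comparison restricted to a single layer
    # (assumption: layers=[-1] selects the final hidden layer, matching the
    # first return value of the transformers model). The checkpoint path is
    # left unset, as in the test above.
    weight_path = ''  # set to a GPT-2 checkpoint before running
    import transformers
    sent = 'this is a sentence'
    tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight_path)
    tensor = torch.LongTensor([tokenizer.encode(sent)])
    gpt2 = transformers.GPT2Model.from_pretrained(weight_path)
    gpt2.eval()
    output = gpt2(tensor)[0]  # last hidden state, (1, seq_len, hidden)

    ds = DataSet({'words': [sent.split()]})
    encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path, layers=[-1])
    encoder.eval()
    encoder.index_datasets(ds, field_name='words', add_endoftext=False)
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0]))
    self.assertEqual((output - encoder(word_pieces)).sum(), 0)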