Example #1
0
    def test_gpt2_word_piece_encoder(self):
        # Smoke test: only verifies that indexing a dataset and running a
        # forward pass do not raise.
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        ds = DataSet({'words': ["this is a test sentence".split()]})

        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path,
                                       word_dropout=0.1)
        encoder.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        encoder(torch.LongTensor([[1, 2, 3, 4]]))

        # Same check with the language-model head enabled.
        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path,
                                       word_dropout=0.1,
                                       language_model=True)
        encoder.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        encoder(torch.LongTensor([[1, 2, 3, 4]]))
Example #2
0
    def test_gpt2_embed_eq_gpt2_piece_encoder(self):
        # Check that GPT2Embedding with pool_method='first' yields the same
        # vectors as GPT2WordPieceEncoder on the first word piece of each word.
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        ds = DataSet({
            'words': ["this is a texta a sentence".split(), 'this is'.split()]
        })
        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = GPT2Embedding(vocab,
                              model_dir_or_name=weight_path,
                              pool_method='first')
        embed.eval()
        words_res = embed(words)

        # Compare via the absolute difference: the previous signed .sum()
        # could be zero even when individual elements differ, because
        # positive and negative errors may cancel out.
        # The one-position offset in the second assertion presumably accounts
        # for a word splitting into two word pieces — confirm against the
        # tokenizer if this ever fails.
        self.assertEqual(
            (word_pieces_res[0, :4] - words_res[0, :4]).abs().sum(), 0)
        self.assertEqual(
            (word_pieces_res[0, 5:] - words_res[0, 4:]).abs().sum(), 0)
        self.assertEqual(
            (word_pieces_res[1, :2] - words_res[1, :2]).abs().sum(), 0)
Example #3
0
    def test_generate(self):
        # weight_path = 'test/data_for_tests/embedding/small_gpt2'
        weight_path = 'en'

        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path,
                                       language_model=True)

        # Smoke-test generation under several decoding configurations:
        # greedy (two prompts), beam sampling, and beam sampling with
        # non-default temperature/penalty settings.
        cases = [
            ('This', False, 1, 1, 50, 1.0, 1.0, 1.0),
            ('This day', False, 1, 1, 50, 1.0, 1.0, 1.0),
            ('This', True, 3, 1, 50, 1.0, 1.0, 1.0),
            ('This', True, 3, 2, 20, 2.0, 2.0, 1.5),
        ]
        for (prompt, do_sample, num_beams, temperature, top_k, top_p,
             repetition_penalty, length_penalty) in cases:
            print(
                encoder.generate_from_str(
                    prompt,
                    max_len=20,
                    do_sample=do_sample,
                    num_beams=num_beams,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty))
Example #4
0
    def test_eq_transformers(self):
        # Verify that GPT2WordPieceEncoder reproduces both the tokenizer ids
        # and the hidden states of the reference `transformers` GPT-2.
        # NOTE(review): weight_path is empty — from_pretrained('') cannot
        # resolve a model; presumably it should point at a checkpoint such as
        # 'test/data_for_tests/embedding/small_gpt2'. Confirm before running.
        weight_path = ''

        ds = DataSet({
            'words': [
                "this this this a is texta model vocab".split(),
                'this is'.split()
            ]
        })

        import transformers
        input1 = ' '.join(ds[0]['words'])
        input2 = ' '.join(ds[1]['words'])
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight_path)
        idx_list1 = tokenizer.encode(input1)
        idx_list2 = tokenizer.encode(input2)

        # Pad the shorter sequence with the <|endoftext|> token id so both
        # sequences fit into one batch tensor.
        pad_value = tokenizer.encode('<|endoftext|>')[0]
        tensor = torch.nn.utils.rnn.pad_sequence(
            [torch.LongTensor(idx_list1),
             torch.LongTensor(idx_list2)],
            batch_first=True,
            padding_value=pad_value)
        gpt2 = transformers.GPT2Model.from_pretrained(
            weight_path, output_hidden_states=True)
        gpt2.eval()
        # (removed a no-op `tensor = tensor` here — likely the leftover of a
        # stripped .cuda()/.to(device) call)
        output, _, trans_hidden_states = gpt2(
            tensor, attention_mask=tensor.ne(pad_value))

        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path,
                                       layers=list(range(13)))
        encoder.eval()
        encoder.index_datasets(ds, field_name='words', add_endoftext=False)
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))

        # The encoder's indexing must match the reference tokenizer exactly.
        self.assertEqual(idx_list1, ds[0]['word_pieces'])
        self.assertEqual(idx_list2, ds[1]['word_pieces'])

        word_pieces_res = encoder(word_pieces)

        # Compare via the absolute difference: the previous signed .sum()
        # could be zero even when elements differ with cancelling signs.
        self.assertEqual(
            (torch.cat(trans_hidden_states, dim=-1) - word_pieces_res)
            .abs().sum(), 0)