Example #1
    def test_gpt2_tokenizer(self):
        from fastNLP.modules.tokenizer import GPT2Tokenizer

        tokenizer = GPT2Tokenizer.from_pretrained(
            'test/data_for_tests/embedding/small_gpt2')
        print(tokenizer.encode("this is a texta a sentence"))
        print(tokenizer.encode('this is'))
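
The encode call above is, roughly, tokenize followed by convert_tokens_to_ids; Example #2 below uses the two calls separately. A minimal sketch of the split-out form (the method name is made up for illustration; the fixture path and both calls are taken from the examples on this page):

    def test_gpt2_tokenize_then_convert(self):
        from fastNLP.modules.tokenizer import GPT2Tokenizer

        tokenizer = GPT2Tokenizer.from_pretrained(
            'test/data_for_tests/embedding/small_gpt2')
        # split into byte-level BPE pieces; add_prefix_space marks the leading
        # word boundary the same way GPT-2 marks words mid-sentence
        tokens = tokenizer.tokenize("this is a demo", add_prefix_space=True)
        # map each piece to its id from vocab.json
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(tokens, ids)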
Example #2
    def test_generate_small_gpt2(self):
        # Since GPT2 uses GPT2's own tokenizer, the weights cannot be generated
        # directly; build them with the procedure below instead.
        import torch
        from fastNLP.modules.tokenizer import GPT2Tokenizer

        weight_path = ''
        tokenizer = GPT2Tokenizer.from_pretrained(weight_path)

        # record only the merge pairs and vocab entries the sample sentences use
        used_pairs = {}
        used_vocab = {}
        # Add or change the sentences here to cover more data
        sent1 = "This is a demo sentence"
        sent2 = "another demo"
        sent3 = 'this is a texta model vocab'
        all_tokens = []

        for sent in [sent1, sent2, sent3]:
            tokens = []
            for word in sent.split():
                # GPT-2 marks word boundaries with a leading space
                word = ' ' + word
                # map each UTF-8 byte to its printable byte-level BPE character
                token = "".join(tokenizer.byte_encoder[b]
                                for b in word.encode("utf-8"))
                _token, _used_pairs = tokenizer.get_used_merge_pair_vocab(
                    token)
                tokens.extend(_token.split())
                used_pairs.update(_used_pairs)
            all_tokens.extend(tokens)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            # record each piece's id in the full pretrained vocabulary;
            # these ids are re-indexed into a compact space below
            used_vocab.update({t: i for t, i in zip(tokens, token_ids)})

        print(used_pairs)
        import json
        with open('test/data_for_tests/embedding/small_gpt2/vocab.json',
                  'w') as f:
            new_used_vocab = {}
            # re-index the used pieces into a compact, contiguous id space
            for key in used_vocab.keys():
                new_used_vocab[key] = len(new_used_vocab)
            # GPT-2's end-of-text special token must be present
            new_used_vocab['<|endoftext|>'] = len(new_used_vocab)
            # keep single ASCII letters (A-Z, a-z) as fallback tokens
            for i in range(65, 91):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for i in range(97, 123):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)

            json.dump(new_used_vocab, f)

        with open('test/data_for_tests/embedding/small_gpt2/merges.txt',
                  'w') as f:
            f.write('#version: small\n')
            # write pairs in merge-priority (rank) order; line order defines
            # BPE merge priority, so a lexicographic re-sort would corrupt it
            for k, v in sorted(used_pairs.items(), key=lambda kv: kv[1]):
                f.write('{} {}\n'.format(k[0], k[1]))

        new_tokenizer = GPT2Tokenizer.from_pretrained(
            'test/data_for_tests/embedding/small_gpt2')
        new_all_tokens = []
        for sent in [sent1, sent2, sent3]:
            tokens = new_tokenizer.tokenize(sent, add_prefix_space=True)
            new_all_tokens.extend(tokens)
        print(all_tokens, new_all_tokens)

        self.assertSequenceEqual(all_tokens, new_all_tokens)
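        # The rebuilt tokenizer must reproduce the manually derived pieces.
        # Next, write a tiny model config matching the shrunken vocab.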
        config = {
            "architectures": ["GPT2LMHeadModel"],
            "initializer_range": 0.02,
            "layer_norm_epsilon": 1e-05,
            "n_ctx": 20,
            "n_embd": 16,
            "n_head": 4,
            "n_layer": 2,
            "n_positions": 20,
            "vocab_size": len(new_used_vocab)
        }
        with open('test/data_for_tests/embedding/small_gpt2/config.json',
                  'w') as f:
            json.dump(config, f)

        # The smaller merges.txt and vocab.json above were generated by recording
        # the values the tokenizer actually used; now build a matching tiny model.
        from fastNLP.modules.encoder.gpt2 import GPT2LMHeadModel, GPT2Config

        config = GPT2Config.from_pretrained(
            'test/data_for_tests/embedding/small_gpt2')

        model = GPT2LMHeadModel(config)
        torch.save(
            model.state_dict(),
            'test/data_for_tests/embedding/small_gpt2/small_pytorch_model.bin')
        print(model(torch.LongTensor([[0, 1, 2, 3]])))
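
To reuse the checkpoint written above, load the config and weights back in. A minimal sketch, assuming the files generated by Example #2; torch.load and load_state_dict are standard PyTorch, and the fastNLP classes are the ones imported above:

    import torch
    from fastNLP.modules.encoder.gpt2 import GPT2LMHeadModel, GPT2Config

    config = GPT2Config.from_pretrained(
        'test/data_for_tests/embedding/small_gpt2')
    model = GPT2LMHeadModel(config)
    # restore the parameters saved with torch.save above
    state_dict = torch.load(
        'test/data_for_tests/embedding/small_gpt2/small_pytorch_model.bin')
    model.load_state_dict(state_dict)
    model.eval()  # disable dropout for deterministic outputs
    print(model(torch.LongTensor([[0, 1, 2, 3]])))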