Example #1
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        self.sos_token_idx = 101  # BERT's [CLS] token id, used as BOS
        self.eos_token_idx = 102  # BERT's [SEP] token id, used as EOS
        self.pretrained_model_path = config['pretrained_model_path']

        self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

        self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
        )

        self.encoder = BertGenerationEncoder.from_pretrained(
            self.pretrained_model_path, bos_token_id=self.sos_token_idx, eos_token_id=self.eos_token_idx
        )
        self.decoder = BertGenerationDecoder.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            add_cross_attention=True,
            is_decoder=True
        )
        self.model = EncoderDecoderModel(encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config)

        self.padding_token_idx = self.tokenizer.pad_token_id
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
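
Since the loss above is created with reduction='none', it returns one value per
token and has to be averaged manually. A minimal sketch of such a masked mean
(illustrative only; the padding id and tensor shapes are assumptions, not taken
from the original class):

import torch
import torch.nn as nn

pad_idx = 0  # assumed padding token id
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='none')

logits = torch.randn(2, 7, 30522)           # (batch, seq_len, vocab_size)
targets = torch.randint(1, 30522, (2, 7))   # (batch, seq_len)
targets[0, 5:] = pad_idx                    # simulate padding

token_loss = loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))
mask = (targets.view(-1) != pad_idx).float()
loss = (token_loss * mask).sum() / mask.sum()  # mean over non-pad tokens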
Example #2
    def test_inference_no_head_absolute_embedding(self):
        model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
        input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
        with torch.no_grad():
            output = model(input_ids)[0]
        expected_shape = torch.Size([1, 8, 50358])
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor(
            [[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example #3
    def __init__(self, lr, **args):
        super(BERT2BERTTrainer, self).__init__()
        self.save_hyperparameters()
        encoder = BertGenerationEncoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            bos_token_id=101,
            eos_token_id=102,
            # force_download=True
        )
        decoder = BertGenerationDecoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
        if args['with_keywords_loss']:
            self.loss_fct2 = KeywordsLoss(alpha=args['keywords_loss_alpha'],
                                          loss_fct=args['keywords_loss_fct'])
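
A hypothetical instantiation of this trainer (KeywordsLoss and the keyword-loss
arguments are project-specific; the values below are placeholders):

trainer_module = BERT2BERTTrainer(lr=5e-5,
                                  with_keywords_loss=False,
                                  keywords_loss_alpha=0.5,
                                  keywords_loss_fct=None)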
Example #4
def create_model(model_checkpoint_name):
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID
    )  # add cross attention layers and use BERT’s cls token as BOS token and sep token as EOS token

    decoder = BertGenerationDecoder.from_pretrained(model_checkpoint_name,
                                                    add_cross_attention=True,
                                                    is_decoder=True,
                                                    bos_token_id=BOS_TOKEN_ID,
                                                    eos_token_id=EOS_TOKEN_ID)
    # Fine-tune only the decoder's transformer layers and LM head;
    # freeze the entire encoder and the decoder's embeddings.
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)

    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    return model
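
A quick sanity check of the resulting freeze pattern (hypothetical usage; the
checkpoint name is an assumption, and 101/102 are BERT's [CLS]/[SEP] ids):

BOS_TOKEN_ID, EOS_TOKEN_ID = 101, 102
model = create_model("bert-base-uncased")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable:,} of {total:,}")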
Example #5
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure,
            decoder_config=self.decoder_configure)

        self.encoder = BertGenerationEncoder.from_pretrained('bert-base-cased',
                                                             bos_token_id=101,
                                                             eos_token_id=102)

        self.decoder = BertGenerationDecoder.from_pretrained(
            'bert-base-cased',
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.encoder_decoder = EncoderDecoderModel(
            encoder=self.encoder,
            decoder=self.decoder,
            config=self.encoder_decoder_config)

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_source_length = config['source_max_seq_length']
        self.max_target_length = config['target_max_seq_length']

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
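
The two length limits are typically enforced at tokenization time. A sketch of
how source/target batches might be built with this tokenizer (standalone and
illustrative; the texts and lengths are placeholders, not from the original class):

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
source = tokenizer(['a long source document'], max_length=128,
                   padding='max_length', truncation=True, return_tensors='pt')
target = tokenizer(['a short summary'], max_length=32,
                   padding='max_length', truncation=True, return_tensors='pt')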
Example #6
# https://medium.com/huggingface/encoder-decoders-in-transformers-a-hybrid-pre-trained-architecture-for-seq2seq-af4d7bf14bb8

from transformers import BertTokenizer, BertTokenizerFast, EncoderDecoderModel, BertGenerationEncoder, BertGenerationDecoder

# alternative one-liner: build the seq2seq model directly from two pretrained checkpoints
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased",
                                                bos_token_id=101,
                                                eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=101,
                                                eos_token_id=102)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Inputs.
#input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
#labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
#loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
#loss.backward()
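
Even untrained, the assembled bert2bert model can be exercised end to end with
the standard generate() API. A minimal sketch (the ids 101/102/0 are BERT's
[CLS], [SEP], and [PAD]; the output will be gibberish before fine-tuning):

input_ids = tokenizer("This is a long article to summarize",
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
generated = bert2bert.generate(input_ids,
                               decoder_start_token_id=101,  # [CLS] as BOS
                               eos_token_id=102,            # [SEP] as EOS
                               pad_token_id=0,              # [PAD]
                               max_length=20)
print(tokenizer.decode(generated[0], skip_special_tokens=True))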
Example #7
from transformers import (EncoderDecoderModel, PreTrainedModel, BertTokenizer,
                          BertGenerationEncoder, BertGenerationDecoder)

encoder = BertGenerationEncoder.from_pretrained(
    model_type, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
)  # add cross attention layers and use BERT’s cls token as BOS token and sep token as EOS token

decoder = BertGenerationDecoder.from_pretrained(model_type,
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=BOS_TOKEN_ID,
                                                eos_token_id=EOS_TOKEN_ID)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
Example #8
from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
import torch

tokenizer = BertGenerationTokenizer.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder')
config = BertGenerationConfig.from_pretrained(
    "google/bert_for_seq_generation_L-24_bbc_encoder")
config.is_decoder = True
model = BertGenerationDecoder.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder',
    config=config,
    return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

prediction_logits = outputs.logits
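
prediction_logits has shape (batch_size, sequence_length, vocab_size). A
minimal greedy readout of the next token (illustrative only):

next_token_id = prediction_logits[0, -1].argmax(dim=-1).item()
print(tokenizer.decode([next_token_id]))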
Example #9
from transformers import BertGenerationEncoder, BertGenerationDecoder, \
    DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = DistilBertConfig.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'

# from_pretrained takes a checkpoint name or path, not a model object
model = DistilBertForMaskedLM.from_pretrained(model_name, config=config)
Example #10
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/docs/transformers/v4.15.0/en/internal/tokenization_utils#transformers.SpecialTokensMixin

from transformers import BertTokenizerFast, BertModel, BertGenerationDecoder

# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

print(tokenizer.all_special_tokens)  # --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
print(tokenizer.all_special_ids)  # --> [100, 102, 0, 101, 103]

model = BertGenerationDecoder.from_pretrained('bert-base-uncased')

print("Original tokenizer\n" + "*" * 50)
print("Vocabulary size: ", tokenizer.vocab_size)
#print("Number of special tokens: ", len(tokenizer.added_tokens_encoder))
print("Size of the full vocabulary with the added tokens: ", len(tokenizer))

# Add special tokens.
#num_added_special_toks = tokenizer.add_special_tokens({"[OBJ]":10001,"[YO]":10002})
num_added_special_toks = tokenizer.add_tokens(["[OBJ]", "[YO]"],
                                              special_tokens=True)
print('We have added', num_added_special_toks, 'special tokens')

# Add "regular" tokens.
num_added_toks = tokenizer.add_tokens(
    ['new_tok1', 'my_new-tok2', 'my_new-tok3', 'new_tok3'],
    special_tokens=False)
print('We have added', num_added_toks, 'tokens')
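
After extending the tokenizer, the model's embedding matrix usually has to be
resized to match; a short sketch using the standard resize_token_embeddings API:

model.resize_token_embeddings(len(tokenizer))
print("Embedding rows after resize: ",
      model.get_input_embeddings().weight.shape[0])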