Example #1
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        self.sos_token_idx = 101  # BERT's [CLS] token id, used as BOS
        self.eos_token_idx = 102  # BERT's [SEP] token id, used as EOS
        self.pretrained_model_path = config['pretrained_model_path']

        self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

        self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
        )

        self.encoder = BertGenerationEncoder.from_pretrained(
            self.pretrained_model_path, bos_token_id=self.sos_token_idx, eos_token_id=self.eos_token_idx
        )
        self.decoder = BertGenerationDecoder.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            add_cross_attention=True,
            is_decoder=True
        )
        self.model = EncoderDecoderModel(encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config)

        self.padding_token_idx = self.tokenizer.pad_token_id
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
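A minimal sketch of how a model wired up this way might be driven in a training step. Here `model`, `loss_fct`, and `padding_token_idx` stand in for the `self.model`, `self.loss`, and `self.padding_token_idx` built above; `source_ids`, `source_mask`, and `target_ids` are hypothetical batches of token ids.

# Hypothetical training step for the assembled encoder-decoder.
outputs = model(
    input_ids=source_ids,
    attention_mask=source_mask,
    decoder_input_ids=target_ids[:, :-1],
)
logits = outputs.logits  # (batch, target_len - 1, vocab_size)
# Per-token cross entropy against the shifted targets; padding
# positions contribute zero via the criterion's ignore_index.
token_loss = loss_fct(logits.reshape(-1, logits.size(-1)),
                      target_ids[:, 1:].reshape(-1))
non_pad = target_ids[:, 1:].ne(padding_token_idx).sum()
loss = token_loss.sum() / non_pad  # average over real tokens only

Example #2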
def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
    # Initialise PyTorch model
    bert_config = BertConfig.from_pretrained(
        "bert-large-cased",
        vocab_size=vocab_size,
        max_position_embeddings=512,
        is_decoder=True,
        add_cross_attention=True,
    )
    bert_config_dict = bert_config.to_dict()
    del bert_config_dict["type_vocab_size"]  # BertGenerationConfig has no token type embeddings
    config = BertGenerationConfig(**bert_config_dict)
    if is_encoder:
        model = BertGenerationEncoder(config)
    else:
        model = BertGenerationDecoder(config)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # Load weights from tf checkpoint
    load_tf_weights_in_bert_generation(
        model,
        tf_hub_path,
        model_class="bert",
        is_encoder_named_decoder=is_encoder_named_decoder,
        is_encoder=is_encoder,
    )

    # Save pytorch-model
    print("Save PyTorch model and config to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)
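A hedged example invocation of the converter above; both paths are placeholders, and the vocabulary size matches the 50358-entry vocab of the public bert_for_seq_generation checkpoints.

# Hypothetical invocation: convert a TF-Hub encoder checkpoint to a
# PyTorch BertGenerationEncoder and save it locally.
convert_tf_checkpoint_to_pytorch(
    tf_hub_path="/path/to/tf_hub_checkpoint",   # placeholder
    pytorch_dump_path="/path/to/pytorch_dump",  # placeholder
    is_encoder_named_decoder=False,
    vocab_size=50358,
    is_encoder=True,
)

Example #3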
    def create_and_check_decoder_model_past_large_inputs(
        self,
        config,
        input_ids,
        input_mask,
        token_labels,
        encoder_hidden_states,
        encoder_attention_mask,
        **kwargs,
    ):
        config.is_decoder = True
        config.add_cross_attention = True
        model = BertGenerationDecoder(config=config).to(torch_device).eval()

        # first forward pass
        outputs = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=True,
        )
        past_key_values = outputs.past_key_values

        # create hypothetical next tokens and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)

        # append to the existing input_ids and attention mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)

        output_from_no_past = model(
            next_input_ids,
            attention_mask=next_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_hidden_states=True,
        )["hidden_states"][0]
        output_from_past = model(
            next_tokens,
            attention_mask=next_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            output_hidden_states=True,
        )["hidden_states"][0]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
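Outside the test harness, the pattern being verified is ordinary incremental decoding with past_key_values. A minimal sketch, assuming `model` is a BertGenerationDecoder configured with is_decoder=True and `input_ids` is a placeholder prompt batch.

# Step 1: run the prompt once, keeping the key/value cache.
out = model(input_ids, use_cache=True)
past = out.past_key_values
next_token = out.logits[:, -1:].argmax(dim=-1)  # greedy next token
# Step 2: feed only the new token plus the cache; the attention mask
# must still cover the full sequence seen so far.
full_mask = torch.ones(input_ids.size(0), input_ids.size(1) + 1, dtype=torch.long)
out = model(next_token, attention_mask=full_mask, past_key_values=past, use_cache=True)

Example #4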
    def test_inference_no_head_absolute_embedding(self):
        model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
        input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
        with torch.no_grad():
            output = model(input_ids)[0]
        expected_shape = torch.Size([1, 8, 50358])
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor(
            [[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example #5
    def create_and_check_for_causal_lm(
        self, config, input_ids, input_mask, token_labels, *args,
    ):
        model = BertGenerationDecoder(config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
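Example #6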
def create_slt_transformer(input_vocab_size=1,
                           output_vocab_size=1,
                           **bert_params):

    if input_vocab_size == 1:
        print('WARNING: Input vocab size is 1')
    if output_vocab_size == 1:
        print('WARNING: Output vocab size is 1')

    params = {
        'vocab_size': input_vocab_size,
        'hidden_size': 512,
        'intermediate_size': 2048,
        'max_position_embeddings': 500,
        'num_attention_heads': 8,
        'num_hidden_layers': 3,
        'hidden_act': 'relu',
        'type_vocab_size': 1,
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1
    }
    params.update(bert_params)

    config = BertGenerationConfig(**params)
    encoder = BertGenerationEncoder(config=config)

    params['vocab_size'] = output_vocab_size
    decoder_config = BertGenerationConfig(is_decoder=True,
                                          add_cross_attention=True,
                                          **params)
    decoder = BertGenerationDecoder(config=decoder_config)

    transformer = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    def count_parameters(m):
        return sum(p.numel() for p in m.parameters() if p.requires_grad)

    print(
        f'The encoder has {count_parameters(encoder):,} trainable parameters')
    print(
        f'The decoder has {count_parameters(decoder):,} trainable parameters')
    print(
        f'The whole model has {count_parameters(transformer):,} trainable parameters'
    )

    return transformer
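A hedged usage example for the factory above; the vocabulary sizes are placeholders for the real gloss and text vocab sizes, and any other BertGenerationConfig field can be overridden through bert_params.

# Hypothetical vocab sizes; substitute the real ones.
transformer = create_slt_transformer(input_vocab_size=1200,
                                     output_vocab_size=3000,
                                     num_hidden_layers=3)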
Example #7
    def __init__(self, lr, **args):
        super(BERT2BERTTrainer, self).__init__()
        self.save_hyperparameters()
        encoder = BertGenerationEncoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            bos_token_id=101,
            eos_token_id=102,
            # force_download=True
        )
        decoder = BertGenerationDecoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
        if args['with_keywords_loss']:
            self.loss_fct2 = KeywordsLoss(alpha=args['keywords_loss_alpha'],
                                          loss_fct=args['keywords_loss_fct'])
Example #8
def create_model(model_checkpoint_name):
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID
    )  # use BERT's cls token as BOS token and sep token as EOS token

    decoder = BertGenerationDecoder.from_pretrained(model_checkpoint_name,
                                                    add_cross_attention=True,
                                                    is_decoder=True,
                                                    bos_token_id=BOS_TOKEN_ID,
                                                    eos_token_id=EOS_TOKEN_ID)
    # Freeze the encoder and the decoder's input embeddings; train only
    # the decoder's transformer layers and LM head.
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)

    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    return model
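A quick, hedged way to confirm the freezing above took effect; the checkpoint name is a placeholder and BOS_TOKEN_ID/EOS_TOKEN_ID are assumed to be defined as in the snippet.

model = create_model("bert-base-uncased")  # placeholder checkpoint
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"{trainable:,} of {total:,} parameters are trainable")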
Example #9
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure,
            decoder_config=self.decoder_configure)

        self.encoder = BertGenerationEncoder.from_pretrained('bert-base-cased',
                                                             bos_token_id=101,
                                                             eos_token_id=102)

        self.decoder = BertGenerationDecoder.from_pretrained(
            'bert-base-cased',
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.encoder_decoder = EncoderDecoderModel(
            encoder=self.encoder,
            decoder=self.decoder,
            config=self.encoder_decoder_config)

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_source_length = config['source_max_seq_length']
        self.max_target_length = config['target_max_seq_length']

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
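Example #10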
    def get_encoder_decoder_model(self, config, decoder_config):
        encoder_model = BertGenerationEncoder(config)
        decoder_model = BertGenerationDecoder(decoder_config)
        return encoder_model, decoder_model
Example #11
# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
# Initialize a brand new bert-based decoder.
decoder = BertGenerationDecoder(config=decoder_config)

# Setup enc-decoder mode.
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]

# Elementary Training.
optimizer = torch.optim.Adam(bert2bert.parameters(), lr=0.000001)
bert2bert.cuda()

for epoch in range(30):
    print("*"*50, "Epoch", epoch, "*"*50)
    if True:
        for batch in tqdm(sierra_dl):
            # tokenize commands and goals.
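Example #12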
# https://medium.com/huggingface/encoder-decoders-in-transformers-a-hybrid-pre-trained-architecture-for-seq2seq-af4d7bf14bb8

from transformers import BertTokenizer, BertTokenizerFast, EncoderDecoderModel, BertGenerationEncoder, BertGenerationDecoder

# add the EOS token as PAD token to avoid warnings
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased",
                                                bos_token_id=101,
                                                eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=101,
                                                eos_token_id=102)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Inputs.
#input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
#labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
#loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
#loss.backward()
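Example #13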
from transformers import (EncoderDecoderModel, PreTrainedModel, BertTokenizer,
                          BertGenerationEncoder, BertGenerationDecoder)

encoder = BertGenerationEncoder.from_pretrained(
    model_type, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
)  # use BERT's cls token as BOS token and sep token as EOS token

decoder = BertGenerationDecoder.from_pretrained(model_type,
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=BOS_TOKEN_ID,
                                                eos_token_id=EOS_TOKEN_ID)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
Example #14
from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
import torch

tokenizer = BertGenerationTokenizer.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder')
config = BertGenerationConfig.from_pretrained(
    "google/bert_for_seq_generation_L-24_bbc_encoder")
config.is_decoder = True
model = BertGenerationDecoder.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder',
    config=config,
    return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

prediction_logits = outputs.logits
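Continuing the snippet, the logits at the last position can be turned into a greedy next-token guess; a minimal sketch.

# Greedy pick of the next token from the final position's logits.
next_token_id = prediction_logits[:, -1, :].argmax(dim=-1)
print(tokenizer.decode(next_token_id))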
Example #15
from transformers import BertGenerationEncoder, BertGenerationDecoder, \
    DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = DistilBertConfig.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'  # custom attribute; not interpreted by stock DistilBERT

model = DistilBertForMaskedLM.from_pretrained(model_name, config=config)
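Example #16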
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/docs/transformers/v4.15.0/en/internal/tokenization_utils#transformers.SpecialTokensMixin

from transformers import BertTokenizerFast, BertModel, BertGenerationDecoder

# Let's see how to increase the vocabulary of the BERT model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

print(tokenizer.all_special_tokens)  # --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
print(tokenizer.all_special_ids)  # --> [100, 102, 0, 101, 103]

model = BertGenerationDecoder.from_pretrained('bert-base-uncased')

print("Original tokenizer\n" + "*" * 50)
print("Vocabulary size: ", tokenizer.vocab_size)
#print("Number of special tokens: ", len(tokenizer.added_tokens_encoder))
print("Size of the full vocabulary with the added tokens: ", len(tokenizer))

# Add special tokens.
#num_added_special_toks = tokenizer.add_special_tokens({"[OBJ]":10001,"[YO]":10002})
num_added_special_toks = tokenizer.add_tokens(["[OBJ]", "[YO]"],
                                              special_tokens=True)
print('We have added', num_added_special_toks, 'special tokens')

# Add "regular" tokens.
num_added_toks = tokenizer.add_tokens(
    ['new_tok1', 'my_new-tok2', 'my_new-tok3', 'new_tok3'],
    special_tokens=False)
print('We have added', num_added_toks, 'tokens')
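After adding tokens the decoder's embedding matrix no longer matches the tokenizer, so resizing it is the usual follow-up step; a sketch using the `model` and `tokenizer` from this snippet.

# Grow the input (and tied output) embeddings to cover the added tokens.
model.resize_token_embeddings(len(tokenizer))
print("New embedding rows:", model.get_input_embeddings().weight.shape[0])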