from transformers import (
    BertConfig,
    BertGenerationConfig,
    BertGenerationDecoder,
    BertGenerationEncoder,
    load_tf_weights_in_bert_generation,
)


def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
    # Initialise PyTorch model
    bert_config = BertConfig.from_pretrained(
        "bert-large-cased",
        vocab_size=vocab_size,
        max_position_embeddings=512,
        is_decoder=True,
        add_cross_attention=True,
    )
    bert_config_dict = bert_config.to_dict()
    del bert_config_dict["type_vocab_size"]
    config = BertGenerationConfig(**bert_config_dict)
    if is_encoder:
        model = BertGenerationEncoder(config)
    else:
        model = BertGenerationDecoder(config)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    # Load weights from tf checkpoint
    load_tf_weights_in_bert_generation(
        model,
        tf_hub_path,
        model_class="bert",
        is_encoder_named_decoder=is_encoder_named_decoder,
        is_encoder=is_encoder,
    )

    # Save the PyTorch model
    print(f"Saving PyTorch model and config to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
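# A minimal usage sketch (not part of the original snippet; the paths are
# hypothetical, and the vocab size is assumed to match the
# google/bert_for_seq_generation checkpoints):
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        tf_hub_path="/path/to/tf_hub_checkpoint",   # hypothetical path
        pytorch_dump_path="/path/to/pytorch_dump",  # hypothetical path
        is_encoder_named_decoder=False,
        vocab_size=50358,  # assumed vocab size of the BertGeneration checkpoints
        is_encoder=True,
    )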
Example #2
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        # 101/102 are BERT's [CLS]/[SEP] ids, reused here as BOS/EOS
        self.sos_token_idx = 101
        self.eos_token_idx = 102
        self.pretrained_model_path = config['pretrained_model_path']

        self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

        self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure, decoder_config=self.decoder_configure
        )

        self.encoder = BertGenerationEncoder.from_pretrained(
            self.pretrained_model_path, bos_token_id=self.sos_token_idx, eos_token_id=self.eos_token_idx
        )
        self.decoder = BertGenerationDecoder.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            add_cross_attention=True,
            is_decoder=True
        )
        self.model = EncoderDecoderModel(encoder=self.encoder, decoder=self.decoder, config=self.encoder_decoder_config)

        self.padding_token_idx = self.tokenizer.pad_token_id
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')
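# A sketch of why reduction='none' is used above (assumption: this mirrors the
# surrounding library's training loop, which masks padding and averages the
# loss per sequence; the helper below is not part of the original class):
def masked_sequence_loss(loss_fct, logits, labels, pad_idx):
    # per-token losses; ignore_index already zeroes entries at padding positions
    token_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)).reshape_as(labels)
    # average over the real (non-padding) tokens of each sequence
    n_real = (labels != pad_idx).sum(dim=1).clamp(min=1)
    return token_loss.sum(dim=1) / n_real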
    def test_inference_no_head_absolute_embedding(self):
        model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
        input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
        with torch.no_grad():
            output = model(input_ids)[0]
        expected_shape = torch.Size([1, 8, 1024])
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor(
            [[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
    def test_torch_encode_plus_sent_to_model(self):
        import torch

        from transformers import BertGenerationConfig, BertGenerationEncoder

        # Build sequence
        first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
        sequence = " ".join(first_ten_tokens)
        encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
        batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
            [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
        )

        config = BertGenerationConfig()
        model = BertGenerationEncoder(config)

        assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size

        with torch.no_grad():
            model(**encoded_sequence)
            model(**batch_encoded_sequence)
    def create_and_check_model_as_decoder(
        self,
        config,
        input_ids,
        input_mask,
        token_labels,
        encoder_hidden_states,
        encoder_attention_mask,
        **kwargs,
    ):
        config.add_cross_attention = True
        model = BertGenerationEncoder(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
        )
        result = model(
            input_ids,
            attention_mask=input_mask,
            encoder_hidden_states=encoder_hidden_states,
        )
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.seq_length, self.hidden_size),
        )
def get_model(args):
    if args.model_path:
        model = EncoderDecoderModel.from_pretrained(args.model_path)
        src_tokenizer = BertTokenizer.from_pretrained(
            os.path.join(args.model_path, "src_tokenizer")
        )
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(
            os.path.join(args.model_path, "tgt_tokenizer")
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        if local_rank == 0 or local_rank == -1:
            print("Loaded model and tokenizers from checkpoint")
    else:
        src_tokenizer = BertTokenizer.from_pretrained(args.src_pretrain_dataset_name)
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(args.tgt_pretrain_dataset_name)
        tgt_tokenizer.add_special_tokens(
            {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        encoder = BertGenerationEncoder.from_pretrained(args.src_pretrain_dataset_name)
        decoder = GPT2LMHeadModel.from_pretrained(
            args.tgt_pretrain_dataset_name, add_cross_attention=True, is_decoder=True
        )
        decoder.resize_token_embeddings(len(tgt_tokenizer))
        decoder.config.bos_token_id = tgt_tokenizer.bos_token_id
        decoder.config.eos_token_id = tgt_tokenizer.eos_token_id
        decoder.config.vocab_size = len(tgt_tokenizer)
        decoder.config.add_cross_attention = True
        decoder.config.is_decoder = True
        model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder.config, decoder.config
        )
        model = EncoderDecoderModel(
            encoder=encoder, decoder=decoder, config=model_config
        )
    if local_rank != -1:
        model = model.to(device)
    if args.ngpu > 1:
        print("{}/{} GPU start".format(local_rank, torch.cuda.device_count()))
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank
        )
    optimizer, scheduler = get_optimizer_and_schedule(args, model)
    return model, src_tokenizer, tgt_tokenizer, optimizer, scheduler
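# A minimal sketch of the build_inputs_with_special_tokens helper patched onto
# the GPT-2 tokenizer above (assumption: it wraps sequences in the [BOS]/[EOS]
# tokens registered via add_special_tokens, which GPT-2's default method omits):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    if token_ids_1 is not None:
        output += token_ids_1 + [self.eos_token_id]
    return output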
def create_slt_transformer(input_vocab_size=1,
                           output_vocab_size=1,
                           **bert_params):

    if input_vocab_size == 1:
        print('WARNING: Input vocab size is 1')
    if output_vocab_size == 1:
        print('WARNING: Output vocab size is 1')

    params = {
        'vocab_size': input_vocab_size,
        'hidden_size': 512,
        'intermediate_size': 2048,
        'max_position_embeddings': 500,
        'num_attention_heads': 8,
        'num_hidden_layers': 3,
        'hidden_act': 'relu',
        'type_vocab_size': 1,  # stored on the config but unused: BertGeneration has no token-type embeddings
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1
    }
    params.update(bert_params)

    config = BertGenerationConfig(**params)
    encoder = BertGenerationEncoder(config=config)

    params['vocab_size'] = output_vocab_size
    decoder_config = BertGenerationConfig(is_decoder=True,
                                          add_cross_attention=True,
                                          **params)
    decoder = BertGenerationDecoder(config=decoder_config)

    transformer = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    def count_parameters(m):
        return sum(p.numel() for p in m.parameters() if p.requires_grad)

    print(
        f'The encoder has {count_parameters(encoder):,} trainable parameters')
    print(
        f'The decoder has {count_parameters(decoder):,} trainable parameters')
    print(
        f'The whole model has {count_parameters(transformer):,} trainable parameters'
    )

    return transformer
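# Hedged usage sketch (the vocabulary sizes below are placeholders, not values
# from the original code):
slt_model = create_slt_transformer(input_vocab_size=1000, output_vocab_size=2000)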
Example #8
    def __init__(self, lr, **args):
        super(BERT2BERTTrainer, self).__init__()
        self.save_hyperparameters()
        encoder = BertGenerationEncoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            bos_token_id=101,
            eos_token_id=102,
            # force_download=True
        )
        decoder = BertGenerationDecoder.from_pretrained(
            "ckiplab/bert-base-chinese",
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
        if args['with_keywords_loss']:
            self.loss_fct2 = KeywordsLoss(alpha=args['keywords_loss_alpha'],
                                          loss_fct=args['keywords_loss_fct'])
Example #9
def create_model(model_checkpoint_name):
    encoder = BertGenerationEncoder.from_pretrained(
        model_checkpoint_name,
        bos_token_id=BOS_TOKEN_ID,
        eos_token_id=EOS_TOKEN_ID
    )  # add cross attention layers and use BERT’s cls token as BOS token and sep token as EOS token

    decoder = BertGenerationDecoder.from_pretrained(model_checkpoint_name,
                                                    add_cross_attention=True,
                                                    is_decoder=True,
                                                    bos_token_id=BOS_TOKEN_ID,
                                                    eos_token_id=EOS_TOKEN_ID)
    # Freeze the encoder and the decoder's embeddings; fine-tune only the
    # decoder's transformer layers and its LM head.
    decoder.bert.encoder.requires_grad_(True)
    decoder.lm_head.requires_grad_(True)
    encoder.requires_grad_(False)
    decoder.bert.embeddings.requires_grad_(False)

    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

    return model
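# Hedged usage sketch: with a standard BERT vocabulary, the module-level
# constants used above would plausibly be BERT's [CLS]/[SEP] ids (assumption,
# not from the original snippet):
BOS_TOKEN_ID = 101  # [CLS]
EOS_TOKEN_ID = 102  # [SEP]
model = create_model("bert-base-uncased")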
Example #10
    def __init__(self, config, dataset):
        super(BERT2BERT, self).__init__(config, dataset)

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.encoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.decoder_configure = BertConfig.from_pretrained('bert-base-cased')

        self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=self.encoder_configure,
            decoder_config=self.decoder_configure)

        self.encoder = BertGenerationEncoder.from_pretrained('bert-base-cased',
                                                             bos_token_id=101,
                                                             eos_token_id=102)

        self.decoder = BertGenerationDecoder.from_pretrained(
            'bert-base-cased',
            add_cross_attention=True,
            is_decoder=True,
            bos_token_id=101,
            eos_token_id=102)

        self.encoder_decoder = EncoderDecoderModel(
            encoder=self.encoder,
            decoder=self.decoder,
            config=self.encoder_decoder_config)

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_source_length = config['source_max_seq_length']
        self.max_target_length = config['target_max_seq_length']

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #11
# SPDX-License-Identifier: Apache-2.0
# based on: https://huggingface.co/blog/how-to-generate

from transformers import BertTokenizer, EncoderDecoderModel
from transformers import BertGenerationEncoder, GPT2LMHeadModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Version 1: load encoder-decoder together.
#model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")

# Version 2: load pretrained modules separately and join them.
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased",
                                                bos_token_id=101,
                                                eos_token_id=102)
# add cross attention layers and use the same BOS and EOS tokens.
decoder = GPT2LMHeadModel.from_pretrained("gpt2",
                                          add_cross_attention=True,
                                          is_decoder=True,
                                          bos_token_id=101,
                                          eos_token_id=102)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# encode context the generation is conditioned on
input_ids = tokenizer.encode('I enjoy walking with my cute dog',
                             return_tensors='pt')

# Activate beam search and early_stopping.
# A simple remedy is to introduce n-gram (a.k.a. word sequences of n words) penalties
# as introduced by Paulus et al. (2017) and Klein et al. (2017).
# The most common n-gram penalty makes sure that no n-gram appears twice by
# manually setting the probability of next words that could create an already
# seen n-gram to 0.
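# A hedged sketch of the generate call these comments build up to; the sampling
# parameters follow the how-to-generate blog and are not part of the original
# snippet. EncoderDecoderModel generation also needs a decoder start token
# (assumed to be [CLS]=101 here, matching the ids above).
model.config.decoder_start_token_id = 101
model.config.pad_token_id = tokenizer.pad_token_id
beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))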
Example #12
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]'})
decoder_tokenizer.add_special_tokens({'eos_token': '[EOS]'})
#print(f"\Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
#for k, v in decoder_tokenizer.get_vocab().items():
#    print(k, ": ", v)
# decoder_tokenizer.model_max_length=512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path)
sierra_dl = DataLoader(sierra_ds, batch_size=64, shuffle=True, num_workers=2)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased",
    # Set required tokens.
    #bos_token_id=encoder_tokenizer.vocab["[CLS]"],
    #eos_token_id=encoder_tokenizer.vocab["[SEP]"],
    )

# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    bos_token_id=decoder_tokenizer.vocab["[BOS]"],
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
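# A minimal sketch of the likely continuation (assumption: the fresh config is
# used to build a decoder that is then paired with the pretrained encoder above;
# BertLMHeadModel is the causal-LM counterpart of a plain BertConfig):
from transformers import BertLMHeadModel, EncoderDecoderModel

decoder = BertLMHeadModel(decoder_config)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)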
    def test_model_from_pretrained(self):
        model = BertGenerationEncoder.from_pretrained(
            "google/bert_for_seq_generation_L-24_bbc_encoder")
        self.assertIsNotNone(model)
from transformers import (EncoderDecoderModel, PreTrainedModel, BertTokenizer,
                          BertGenerationEncoder, BertGenerationDecoder)

encoder = BertGenerationEncoder.from_pretrained(
    model_type, bos_token_id=BOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID
)  # add cross attention layers and use BERT’s cls token as BOS token and sep token as EOS token

decoder = BertGenerationDecoder.from_pretrained(model_type,
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=BOS_TOKEN_ID,
                                                eos_token_id=EOS_TOKEN_ID)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)
Example #15
import torch
from transformers import (
    BertGenerationConfig, BertGenerationEncoder, BertGenerationDecoder, BertTokenizer, BertGenerationTokenizer,
    DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel,
)
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = BertGenerationConfig.from_pretrained(model_name)
tokenizer = BertGenerationTokenizer.from_pretrained(tokenizer_name)  # note: BertGenerationTokenizer expects a sentencepiece model, which DistilBERT checkpoints do not ship

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")  # overwrites the tokenizer loaded above; loads the BERT vocab through the DistilBert tokenizer class
input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'  # custom attribute, not a standard transformers option; presumably consumed by downstream performer-attention code
    def get_encoder_decoder_model(self, config, decoder_config):
        encoder_model = BertGenerationEncoder(config)
        decoder_model = BertGenerationDecoder(decoder_config)
        return encoder_model, decoder_model
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
decoder_tokenizer_path = os.path.join(
    data_path, "leonardo_sierra.decoder_tokenizer.json")

# Let's see how to increase the vocabulary of the BERT model and tokenizer
encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#decoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
decoder_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=decoder_tokenizer_path)
print(len(decoder_tokenizer))

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained(
    "bert-base-uncased",
    bos_token_id=encoder_tokenizer.vocab["[CLS]"],
    eos_token_id=encoder_tokenizer.vocab["[SEP]"],
)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
#decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased",
#    add_cross_attention=True, is_decoder=True,
#    bos_token_id=decoder_tokenizer.vocab["[CLS]"],
#    eos_token_id=decoder_tokenizer.vocab["[SEP]"],
#    )
#decoder.resize_token_embeddings(len(decoder_tokenizer))

# Fresh decoder config.
decoder_config = BertConfig(
    is_decoder=True,
    add_cross_attention=True,
    vocab_size=len(decoder_tokenizer),
)
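# Hedged continuation sketch (the original snippet is truncated here): pair the
# fresh decoder with the pretrained encoder and set the generation defaults an
# EncoderDecoderModel needs; the token names assume [BOS]/[PAD] entries exist
# in this custom tokenizer's vocabulary.
from transformers import BertLMHeadModel, EncoderDecoderModel

decoder = BertLMHeadModel(decoder_config)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
vocab = decoder_tokenizer.get_vocab()
model.config.decoder_start_token_id = vocab["[BOS]"]
model.config.pad_token_id = vocab["[PAD]"]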