def test_full_tokenizer(self):
    tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)

    tokens = tokenizer.tokenize("This is a test")
    self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(tokens),
        [285, 46, 10, 170, 382],
    )

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(
        tokens,
        [
            SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n",
            SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",",
            SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f", "al", "s", "é", ".",
        ],
    )

    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(
        ids,
        [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
    )

    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(
        back_tokens,
        [
            SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n",
            SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",",
            SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f", "al", "s", "<unk>", ".",
        ],
    )
def setUp(self):
    super().setUp()

    tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
    tokenizer.save_pretrained(self.tmpdirname)
from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
import torch

tokenizer = BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
config.is_decoder = True
model = BertGenerationDecoder.from_pretrained(
    "google/bert_for_seq_generation_L-24_bbc_encoder", config=config, return_dict=True
)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
prediction_logits = outputs.logits
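A short, purely illustrative follow-up to the snippet above (not part of the original example): `prediction_logits` has shape `(batch, sequence_length, vocab_size)`, so the most likely token at the last position can be read off with an argmax and mapped back to text with the same tokenizer.

# Illustrative continuation (assumption: `tokenizer` and `prediction_logits` come from the block above).
next_token_id = prediction_logits[:, -1, :].argmax(dim=-1)        # shape: (batch,)
print(tokenizer.convert_ids_to_tokens(next_token_id.tolist()))    # most likely next token per example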
def big_tokenizer(self):
    return BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
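A hedged usage sketch for the fixture above (illustrative only; it downloads the released checkpoint, so it belongs in slow/integration tests, and the exact ids printed depend on the released SentencePiece model):

# Illustrative only: encode and decode a simple string with the pretrained tokenizer.
tok = BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
ids = tok.encode("Hello World!")
print(ids)              # ids from the released SentencePiece vocab
print(tok.decode(ids))  # decoded back to text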
import torch
from datasets import load_dataset
from transformers import (
    BertGenerationConfig,
    BertGenerationEncoder,
    BertGenerationDecoder,
    BertGenerationTokenizer,
    BertTokenizer,
    DistilBertModel,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    DistilBertConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    EncoderDecoderModel,
)

# BertGenerationConfig / BertGenerationTokenizer need a BertGeneration checkpoint
# (the tokenizer is SentencePiece-based), so they are loaded from the released one.
model_name = "google/bert_for_seq_generation_L-24_bbc_encoder"
tokenizer_name = "google/bert_for_seq_generation_L-24_bbc_encoder"
config = BertGenerationConfig.from_pretrained(model_name)
tokenizer = BertGenerationTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token (id 101) as BOS token and sep token (id 102) as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
# bert-large-uncased ships a BERT WordPiece vocab, so load it with BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

input_ids = tokenizer(
    "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
).input_ids
labels = tokenizer("This is a short summary", return_tensors="pt").input_ids

# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

# note: `attention_type` is not a standard BertGenerationConfig field; this only attaches
# an extra attribute to the config object and is not read by the BertGeneration models
config.attention_type = "performer"
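A hedged generation sketch to follow the warm-start above (assumptions: in practice the model would be fine-tuned on a summarization dataset first, and BERT's [CLS]/[SEP]/[PAD] ids are used as decoder start, end, and padding tokens; without fine-tuning the call still runs but the output is meaningless):

# Illustrative only: generate a summary with the warm-started bert2bert model.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

generated_ids = bert2bert.generate(input_ids, max_length=32)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))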