Example #1
    def test_full_tokenizer(self):
        tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [285, 46, 10, 170, 382],
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [
                8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46,
                72, 80, 6, 0, 4
            ],
        )

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )
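The test above uses SAMPLE_VOCAB and SPIECE_UNDERLINE without showing where they come from; in the transformers test suite they are typically defined roughly as follows (a sketch, assuming the standard SentencePiece test fixture shipped with the repository):

from transformers import BertGenerationTokenizer
from transformers.testing_utils import get_tests_dir

SPIECE_UNDERLINE = "▁"  # SentencePiece word-boundary marker
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")  # small test vocabulary used by the tokenizer tests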
Example #2
    def setUp(self):
        super().setUp()

        tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
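A possible follow-up inside setUp (not part of the original snippet): reload the tokenizer from the temporary directory to confirm that saving round-trips.

        # Hypothetical check: the reloaded tokenizer should tokenize identically
        reloaded = BertGenerationTokenizer.from_pretrained(self.tmpdirname)
        self.assertEqual(reloaded.tokenize("This is a test"), tokenizer.tokenize("This is a test"))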
Example #3
from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
import torch

tokenizer = BertGenerationTokenizer.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder')
config = BertGenerationConfig.from_pretrained(
    "google/bert_for_seq_generation_L-24_bbc_encoder")
config.is_decoder = True
model = BertGenerationDecoder.from_pretrained(
    'google/bert_for_seq_generation_L-24_bbc_encoder',
    config=config,
    return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

prediction_logits = outputs.logits
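A minimal follow-up sketch (not in the original example): take the highest-scoring token at the last position from the prediction logits and decode it.

next_token_id = prediction_logits[0, -1].argmax(-1).item()  # greedy pick for the next token
print(tokenizer.decode([next_token_id]))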
Example #4
    def big_tokenizer(self):
        return BertGenerationTokenizer.from_pretrained(
            "google/bert_for_seq_generation_L-24_bbc_encoder")
Example #5
import torch
from transformers import (
    BertGenerationConfig, BertGenerationEncoder, BertGenerationDecoder,
    BertTokenizer, BertGenerationTokenizer,
    DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, EncoderDecoderModel,
)
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = BertGenerationConfig.from_pretrained(model_name)
# Note: BertGenerationTokenizer is SentencePiece-based, while the DistilBERT checkpoint
# above ships a WordPiece vocab, so loading it this way is unlikely to work as intended.
tokenizer = BertGenerationTokenizer.from_pretrained(tokenizer_name)

# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'
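A minimal sketch of the commented-out training step above, run eagerly rather than through Trainer; it assumes the bert2bert model, input_ids, and labels defined earlier in this snippet.

# One forward/backward pass: labels are fed as decoder inputs and as targets
loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
loss.backward()
print(float(loss))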