def test_lm_generate_xlm_mlm_en_2048(self):
    model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
    model.to(torch_device)
    input_ids = torch.tensor([[14, 447]], dtype=torch.long, device=torch_device)  # the president
    expected_output_ids = [
        14, 447, 14, 447, 14, 447, 14, 447, 14, 447,
        14, 447, 14, 447, 14, 447, 14, 447, 14, 447,
    ]  # "the president" repeated ten times
    # TODO(PVP): this and other input_ids I tried for generation give pretty bad results.
    # Not sure why. Model might just not be made for auto-regressive inference
    output_ids = model.generate(input_ids, do_sample=False)
    self.assertListEqual(output_ids[0].cpu().numpy().tolist(), expected_output_ids)
def load_model_tokenizer(self, pretrained):
    """
    Load transformer model and tokenizer for a given pre-trained name

    :param pretrained: pre-trained name
    :return: model, tokenizer
    """
    model = None
    tokenizer = None
    if self.method == "T5":
        if pretrained in T5_PRETRAINED_MODELS:
            model = T5ForConditionalGeneration.from_pretrained(pretrained)
            tokenizer = T5Tokenizer.from_pretrained(pretrained)
    elif self.method == "BART":
        if pretrained in BART_PRETRAINED_MODELS:
            model = BartForConditionalGeneration.from_pretrained(pretrained)
            tokenizer = BartTokenizer.from_pretrained(pretrained)
    elif self.method == "GPT-2":
        if pretrained in GPT2_PRETRAINED_MODELS:
            model = GPT2LMHeadModel.from_pretrained(pretrained)
            model.config.max_length = self.max_length
            tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
    elif self.method == "XLM":
        if pretrained in XLM_PRETRAINED_MODELS:
            model = XLMWithLMHeadModel.from_pretrained(pretrained)
            model.config.max_length = self.max_length
            tokenizer = XLMTokenizer.from_pretrained(pretrained)
    else:
        pass
    return model, tokenizer
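# Usage sketch (not from the original file): how the returned pair might be used.
# The wrapper class name `Summarizer`, its constructor arguments, and the
# "t5-small" checkpoint are assumptions; only `load_model_tokenizer`, `method`,
# and `max_length` come from the code above.
generator = Summarizer(method="T5", max_length=60)  # hypothetical wrapper class
model, tokenizer = generator.load_model_tokenizer("t5-small")
if model is not None:
    input_ids = tokenizer.encode(
        "summarize: The quick brown fox jumps over the lazy dog.",
        return_tensors="pt",
    )
    output_ids = model.generate(input_ids, max_length=20)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))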
def test_lm_generate_xlm_mlm_en_2048(self):
    model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
    input_ids = torch.tensor([[1, 14, 2232, 26, 1]], dtype=torch.long)  # The dog is cute
    expected_output_ids = [
        1, 14, 2232, 26, 1, 567, 26, 32, 149, 149,
        149, 149, 149, 149, 149, 149, 149, 149, 149, 149,
    ]  # The dog is nothing is it!!!!!!!!!!!!
    # TODO(PVP): this sentence (and others I tried) does not make much sense,
    # there seems to be a problem with xlm language generation.
    torch.manual_seed(0)
    output_ids = model.generate(input_ids)
    self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
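# A hedged companion sketch (not part of the test suite): greedy search degenerates
# into repetition for this checkpoint, so sampling at least varies the continuation.
# The decoding parameters below are illustrative assumptions, not taken from the tests.
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")

input_ids = torch.tensor([[1, 14, 2232, 26, 1]], dtype=torch.long)  # The dog is cute
torch.manual_seed(0)
output_ids = model.generate(input_ids, do_sample=True, top_k=50, top_p=0.95, max_length=20)
print(tokenizer.decode(output_ids[0]))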
filename = "SUBTLEXus74286wordstextversion.txt" vocab = get_vocab(filename, 3000) rw_vocab = get_vocab(filename, 10000) filename2 = "SUBTLEX-US frequency list with PoS information text version.txt" pos_dict = get_pos_dict(filename2) GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True), GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GTP2") Roberta = ModelInfo( RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True), RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta") XLM = ModelInfo( XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', return_dict=True), XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM") T5 = ModelInfo( T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True), T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5") Albert = ModelInfo( AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True), AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert") TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'), TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_", vocab, "TXL") if __name__ == "__main__":
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters; roberta-base uses the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French Wikipedia
else:
    print('need to define LM from Bert,RoBerta,XLM')

print(model)

def freeze_layer_fun(freeze_layer):
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False
        else:
            pass
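# Illustrative call (not from the original script): freeze every parameter whose
# name contains the given substring, e.g. the shared embeddings, before fine-tuning.
# The substring "embeddings" is an assumption about the intended usage.
freeze_layer_fun("embeddings")
trainable = [name for name, param in model.named_parameters() if param.requires_grad]
print(f"{len(trainable)} parameter tensors remain trainable")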
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024', do_lower_case=False)
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', config=config)
# 12-layer, 1024-hidden, 8-heads
# XLM model pre-trained with MLM on the 15 XNLI languages.
'''
config = XLMConfig(vocab_size=64139,
                   emb_dim=1024,
                   max_position_embeddings=512,
                   n_heads=8,
                   n_layers=6,
                   )
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-enfr-1024', config=config)
# 6-layer, 1024-hidden, 8-heads
# XLM English-French model trained on the concatenation of English and French wikipedia
else:
    print('need to define LM from Bert,RoBerta,XLM')
'''

from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, AdamW
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
'''
train_inputs = torch.Tensor()
train_masks = torch.Tensor()
for sent in sentences_train:
    encoded_sent = tokenizer.encode_plus(
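        # Hedged continuation: the original call is truncated in this snippet; the
        # arguments below are typical assumptions, not the script's real settings.
        sent,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Accumulate along dim 0 (one row per sentence), replacing the empty tensors
    # initialised before the loop on the first iteration.
    train_inputs = torch.cat((train_inputs, encoded_sent['input_ids']), dim=0) if train_inputs.numel() else encoded_sent['input_ids']
    train_masks = torch.cat((train_masks, encoded_sent['attention_mask']), dim=0) if train_masks.numel() else encoded_sent['attention_mask']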
train, val, test = Multi30k.splits(root='../../data', exts=('.en', '.de'), fields=(en_text, de_text))

en_text.build_vocab(train, max_size=30000, min_freq=3)
de_text.build_vocab(train, max_size=30000, min_freq=3)
vocab_en = en_text.vocab
vocab_de = de_text.vocab
pad_idx = vocab_de.stoi['<pad>']

train_ldr, val_ldr, test_ldr = BucketIterator.splits((train, val, test), batch_size=5)

# load model
xlm = XLMWithLMHeadModel.from_pretrained('xlm-mlm-ende-1024')
xlm.transformer.embeddings = nn.Embedding(len(vocab_en), xlm.config.emb_dim, padding_idx=pad_idx)
xlm.pred_layer.proj = nn.Linear(xlm.config.emb_dim, len(vocab_de), bias=True)
xlm.cuda()

xent = nn.CrossEntropyLoss()

batch = next(iter(train_ldr))
src, trg = batch.src.to(0), batch.trg.to(0)

def mt_loss(out, target):
    # only compute loss for non-padding indices
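    # Hedged continuation (the original body is not shown): flatten the logits and
    # labels, drop positions equal to pad_idx, then apply the cross-entropy above.
    # This assumes the leading dimensions of `out` match the layout of `target`
    # (depending on batch_first settings a transpose may be needed first).
    logits = out.reshape(-1, out.size(-1))
    labels = target.reshape(-1)
    keep = labels != pad_idx
    return xent(logits[keep], labels[keep])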
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel
import logging

logging.basicConfig(level=logging.INFO)

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-17-1280')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-17-1280')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
logits = outputs[0]  # prediction scores of the LM head, shape (batch, seq_len, vocab_size)
print(logits)
print(logits.shape)

# Greedy next-token prediction at the last position: take the argmax over the
# vocabulary dimension (an argmax over the whole tensor would mix positions and vocab ids).
predicted_index = torch.argmax(logits[0, -1, :]).item()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)
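# A hedged sketch (not part of the original script): the xlm-mlm-* checkpoints are
# masked-language models, so predicting a masked position is the more natural use.
# The example sentence is arbitrary.
masked_text = "Hello, my dog is " + tokenizer.mask_token
mask_input_ids = torch.tensor(tokenizer.encode(masked_text)).unsqueeze(0)
mask_logits = model(mask_input_ids)[0]  # (1, seq_len, vocab_size)
mask_positions = (mask_input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
for pos in mask_positions:
    token_id = torch.argmax(mask_logits[0, pos]).item()
    print(tokenizer.convert_ids_to_tokens([token_id]))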