def create_and_check_flaubert_lm_head(
    self,
    config,
    input_ids,
    token_type_ids,
    input_lengths,
    sequence_labels,
    token_labels,
    is_impossible_labels,
    input_mask,
):
    model = FlaubertWithLMHeadModel(config)
    model.to(torch_device)
    model.eval()
    loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(list(result["loss"].size()), [])
    self.parent.assertListEqual(
        list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
    )
def create_and_check_flaubert_lm_head(
    self,
    config,
    input_ids,
    token_type_ids,
    input_lengths,
    sequence_labels,
    token_labels,
    is_impossible_labels,
    choice_labels,
    input_mask,
):
    model = FlaubertWithLMHeadModel(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.loss.shape, ())
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
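# The two tests above reflect an API change in transformers: older releases
# returned plain tuples from the forward pass, while recent releases return a
# ModelOutput with named fields. A minimal, self-contained sketch of the
# difference, assuming a recent transformers release (the tiny config values
# below are arbitrary and chosen only so the example runs quickly):
import torch
from transformers import FlaubertConfig, FlaubertWithLMHeadModel

config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=2)
model = FlaubertWithLMHeadModel(config)
model.eval()

input_ids = torch.randint(0, 100, (2, 8))  # (batch_size, seq_length)
labels = torch.randint(0, 100, (2, 8))

outputs = model(input_ids, labels=labels)       # ModelOutput on recent versions
loss, logits = outputs.loss, outputs.logits     # named-field access, as in the second test
# Legacy-style tuple output can still be requested explicitly:
# loss, logits = model(input_ids, labels=labels, return_dict=False)[:2]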
from transformers import FlaubertWithLMHeadModel, FlaubertTokenizer
import torch
from torch.nn import functional as F
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_uncased")

# Read the masked sentences; each input line has the form "word**sentence"
with open("mask682.txt", "r") as handle:
    lines = handle.readlines()

fichier = open("score de prediction682.txt", "w")

for line in lines:
    line = line.strip()
    coupe = line.split("**")
    mot = coupe[0]     # the expected word
    phrase = coupe[1]  # the sentence, containing an f-string placeholder for the mask
    # Interpolate placeholders such as {tokenizer.mask_token} stored in the text file
    sequence = eval(f"f'''{phrase}'''")
    token_ids = tokenizer.encode(sequence, return_tensors="pt")
    mask_token_index = torch.where(token_ids == tokenizer.mask_token_id)[1]
    token_logits = model(token_ids).logits
    softmax = F.softmax(token_logits, dim=-1)
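# The snippet above ends after the softmax, before anything is written to
# `fichier`. A plausible completion of the loop body (an assumption, not part
# of the original; it supposes one mask per sentence and that `mot` maps to a
# single token id in the vocabulary):
    mot_id = tokenizer.convert_tokens_to_ids(mot)   # hypothetical single-token lookup
    score = softmax[0, mask_token_index, mot_id]    # probability of `mot` at the masked position
    fichier.write(f"{mot}**{score.item()}\n")
fichier.close()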
import pandas as pd
from transformers import FlaubertWithLMHeadModel, FlaubertTokenizer

args = {  # training hyperparameters (the source snippet begins mid-dict; earlier keys are not shown)
    'num_train_epochs': 4.0,
    'local_rank': -1,
    'n_gpu': 2,
    'gradient_accumulation_steps': 1,
    'weight_decay': 0.0,
    'learning_rate': 5e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'seed': 0,
    'mlm': 0.15,  # masked-LM masking probability
    'max_grad_norm': 1.0,
    'logging_steps': 500,
    'save_steps': 500,
    'evaluate_during_training': True,
    'output_dir': 'flaubert_fine_tuned_alldata',
    'save_total_limit': None,
    'fp16': True,
    'fp16_opt_level': "O1",
}
args = pd.Series(args)

# Load model with LM head
model = FlaubertWithLMHeadModel.from_pretrained("flaubert-base-cased")
model.cuda()

# Load data
df = load_data('path to csv dataset')
tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")
lines = df["preprocesed_text"].astype(str).values.tolist()
train_dataset = TextDataset(lines, tokenizer)

train(args, train_dataset, model, tokenizer)
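# `load_data`, `TextDataset`, and `train` above are user-defined and not shown.
# A minimal sketch of the dataset piece, assuming it only needs to tokenize
# each line to at most a fixed length (the class shape and `block_size` are
# assumptions, not the original implementation):
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, lines, tokenizer, block_size=128):
        # Tokenize each line and truncate to block_size token ids
        self.examples = [
            tokenizer.encode(line, max_length=block_size, truncation=True)
            for line in lines
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

# Note: examples have variable length, so the training loop's DataLoader would
# need a collate function that pads batches to a common length.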