Example #1
    def create_and_check_flaubert_lm_head(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    ):
        model = FlaubertWithLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        # Older tuple-style API: when labels are passed, the forward call returns (loss, logits)
        loss, logits = model(input_ids,
                             token_type_ids=token_type_ids,
                             labels=token_labels)

        result = {
            "loss": loss,
            "logits": logits,
        }

        self.parent.assertListEqual(list(result["loss"].size()), [])
        self.parent.assertListEqual(
            list(result["logits"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])
Example #2
    def create_and_check_flaubert_lm_head(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        choice_labels,
        input_mask,
    ):
        model = FlaubertWithLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        # Newer ModelOutput-style API: the result exposes .loss (a scalar) and .logits
        result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
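Both test helpers above only check output shapes: the LM head should return a scalar loss and logits of shape (batch_size, seq_length, vocab_size). Below is a minimal standalone sketch of the same check with a pretrained checkpoint; the flaubert/flaubert_base_cased model name, the sample sentence and the reuse of the input ids as labels are assumptions for illustration, not part of the tests above.

    import torch
    from transformers import FlaubertTokenizer, FlaubertWithLMHeadModel

    tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
    model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")
    model.eval()

    inputs = tokenizer("Le chat dort sur le canapé.", return_tensors="pt")
    with torch.no_grad():
        # Reusing the input ids as labels is only a convenient way to obtain a loss for the shape check
        outputs = model(**inputs, labels=inputs["input_ids"])

    assert outputs.loss.shape == ()                               # scalar loss
    assert outputs.logits.shape[:2] == inputs["input_ids"].shape  # (batch_size, seq_length, vocab_size) logits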
Example #3
from transformers import FlaubertWithLMHeadModel, FlaubertTokenizer
import torch
from torch.nn import functional as F
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
model = FlaubertWithLMHeadModel.from_pretrained(
    "flaubert/flaubert_base_uncased")

# Read the masked sentences (one "word**sentence" pair per line) and open the output file for the scores
with open("mask682.txt", "r") as handle:
    lines = handle.readlines()

fichier = open("score de prediction682.txt", "w")
for line in lines:
    line = line.strip()

    # Split "expected_word**sentence" into the target word and the sentence template
    coupe = line.split("**")

    mot = coupe[0]
    phrase = coupe[1]

    # Evaluate the sentence as an f-string so that {…} placeholders (e.g. the mask token) are substituted;
    # str.format or a plain replace would be a safer alternative to eval
    sequence = eval(f"f'''{phrase}'''")

    token_ids = tokenizer.encode(sequence, return_tensors='pt')
    mask_token_index = torch.where(token_ids == tokenizer.mask_token_id)[1]

    # Logits over the whole vocabulary for every position, turned into probabilities with a softmax
    token_logits = model(token_ids).logits
    softmax = F.softmax(token_logits, dim=-1)
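The snippet stops right after the softmax. A sketch of how the loop might continue, reading off the probability of the expected word mot at the masked position and writing it to fichier; the original continuation is not shown, so the output format below is an assumption.

    # Probability distribution over the vocabulary at the masked position(s)
    mask_probs = softmax[0, mask_token_index, :]

    # Id of the expected word; words split into several sub-tokens would need extra handling
    mot_ids = tokenizer.encode(mot, add_special_tokens=False)
    score = mask_probs[0, mot_ids[0]].item()

    fichier.write(f"{mot}\t{score}\n")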
Example #4
        'num_train_epochs': 4.0,
        'local_rank': -1,
        'n_gpu': 2,
        'gradient_accumulation_steps': 1,
        'weight_decay': 0.0,
        'learning_rate': 5e-5,
        'adam_epsilon': 1e-8,
        'warmup_steps': 0,
        'seed': 0,
        'mlm': 0.15,
        'max_grad_norm': 1.0,
        'logging_steps': 500,
        'save_steps': 500,
        'evaluate_during_training': True,
        'output_dir': 'flaubert_fine_tuned_alldata',
        'save_total_limit': None,
        'fp16': True,
        'fp16_opt_level': "O1"
    }
    args = pd.Series(args)  # wrap the dict so the hyper-parameters are reachable as attributes (args.learning_rate, ...)
    # Load model with LM head
    model = FlaubertWithLMHeadModel.from_pretrained("flaubert-base-cased")
    model.cuda()

    # Load data
    df = load_data('path to csv dataset')
    tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")
    lines = df["preprocesed_text"].astype(str).values.tolist()
    train_dataset = TextDataset(lines, tokenizer)
    train(args, train_dataset, model, tokenizer)
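load_data, TextDataset and train are project-specific helpers that are not shown here. The 'mlm': 0.15 entry suggests that 15% of the tokens are masked for masked language modelling during training; the sketch below shows that masking step with the library's DataCollatorForLanguageModeling, as an assumption about what the helpers do internally rather than the author's actual code (the flaubert/flaubert_base_cased checkpoint name and the sample sentences are also assumptions).

    from transformers import FlaubertTokenizer, DataCollatorForLanguageModeling

    tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    # Tokenize a few lines and let the collator pad them, mask 15% of the tokens and build the labels
    encodings = [tokenizer(text, truncation=True, max_length=128)
                 for text in ["Première phrase du corpus.", "Une autre phrase un peu plus longue."]]
    batch = collator(encodings)

    # batch["input_ids"] holds the tokenizer's mask token at the sampled positions,
    # batch["labels"] keeps the original ids there and -100 everywhere else
    print(batch["input_ids"].shape, batch["labels"].shape)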