# Beispiel #1 (score: 0)
 def __init__(self,
              config,
              num_labels,
              freeze_encoder=False,
              lstm_hidden=300,
              dropout=0.1):
     """Flaubert encoder + BiLSTM + linear head for court-decision classification.

     Args:
         config: model configuration (kept for interface compatibility;
             not used directly — the encoder is loaded from the
             module-level PRE_TRAINED_MODEL_NAME).
         num_labels: number of target classes for the classifier head.
         freeze_encoder: when True, encoder weights are frozen so only
             the LSTM and classifier head are trained.
         lstm_hidden: hidden size of each LSTM direction.
         dropout: dropout probability applied to the classifier input
             (previously hard-coded to 0.1; default preserves behavior).
     """
     super().__init__()
     self.num_labels = num_labels
     # Pretrained Flaubert model used as the sentence encoder.
     self.encoder = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
     # Optionally freeze the encoder parameters.
     if freeze_encoder:
         for param in self.encoder.parameters():
             param.requires_grad = False
     # Classifier head: in_features is lstm_hidden * 2 because the LSTM
     # below is bidirectional (forward + backward states concatenated).
     self.classifier = torch.nn.Linear(in_features=lstm_hidden * 2,
                                       out_features=self.num_labels,
                                       bias=True)
     # Dropout for the classifier's input.
     self.dropout = torch.nn.Dropout(p=dropout)
     self.loss = torch.nn.CrossEntropyLoss()
     # BiLSTM over the encoder's token embeddings:
     # arguments are (input_size, hidden_size, num_layers).
     self.lstm = torch.nn.LSTM(
         self.encoder.config.hidden_size,
         lstm_hidden,
         1,
         batch_first=True,
         bidirectional=True)
    def init_model(self, model='flaubert', device=None, log=False):
        """Load a pretrained French language model and move it to a device.

        Args:
            model: 'flaubert' or 'camembert' — which pretrained model to load.
            device: torch.device to run on; auto-detects CUDA when None.
            log: when True, report the loaded model and device via init_log.
        """
        # Pick the GPU when available unless the caller pinned a device.
        if device is None:
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        try:
            if model == 'flaubert':
                # Flaubert large (cased) plus its matching tokenizer.
                model_name = 'flaubert/flaubert_large_cased'
                self.model = FlaubertModel.from_pretrained(model_name)
                self.tokenizer = FlaubertTokenizer.from_pretrained(
                    model_name, do_lowercase=False)
                self.model_name = model_name
            elif model == 'camembert':
                # Camembert via the fairseq torch hub entry point.
                model_name = 'camembert'
                self.model = torch.hub.load('pytorch/fairseq', 'camembert')
                self.model_name = model_name
            else:
                # Unknown identifier: fail early instead of leaving the
                # instance half-initialised (the original fell through and
                # crashed later with AttributeError on self.model).
                print(f'Error while loading the {model} model.')
                return
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are not swallowed during a long download.
            print(f'Error while loading the {model} model.')
            return

        # Inference mode on the chosen device.
        self.model.to(self.device)
        self.model.eval()

        # Log Info
        if log:
            self.init_log(self.model_name, self.device)
    def test_inference_no_head_absolute_embedding(self):
        """Smoke-check the base Flaubert forward pass: shape and first values."""
        flaubert = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased")
        token_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        hidden_states = flaubert(token_ids)[0]
        # One sentence, 11 tokens, hidden size 768.
        self.assertEqual(hidden_states.shape, torch.Size((1, 11, 768)))
        reference = torch.tensor(
            [[[-2.6251, -1.4298, -0.0227], [-2.8510, -1.6387, 0.2258], [-2.8114, -1.1832, -0.3066]]]
        )

        # Compare the top-left 3x3 corner against the recorded reference.
        self.assertTrue(torch.allclose(hidden_states[:, :3, :3], reference, atol=1e-4))
# Beispiel #4 (score: 0)
 def __init__(self, auto_model: str, auto_path: str):
     """Load a frozen pretrained embedder and tokenizer picked by family tag.

     Dispatch is a substring match on *auto_model*, so the branch order is
     load-bearing: "flaubert2" must precede "flaubert", and
     "camembert"/"roberta" must precede the generic "bert".

     Args:
         auto_model: family tag, e.g. "camembert", "flaubert2", "flaubert",
             "xlm", "roberta", "bert"; anything else falls back to AutoModel.
         auto_path: path or model id passed to from_pretrained.
     """
     super().__init__()
     if auto_model is None:
         auto_model = ""
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert2" in auto_model:
         # Flaubert checkpoints that use the dedicated transformers classes.
         from transformers import FlaubertModel, FlaubertTokenizer
         self.auto_embeddings = FlaubertModel.from_pretrained(auto_path)
         self.auto_tokenizer = FlaubertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         # Plain "flaubert" checkpoints are loaded with the XLM classes;
         # Flaubert is cased, so lowercasing/accent stripping is disabled.
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "roberta" in auto_model:
         from transformers import RobertaModel, RobertaTokenizer
         self.auto_embeddings = RobertaModel.from_pretrained(auto_path)
         self.auto_tokenizer = RobertaTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         # Unknown tag: let the Auto classes infer the right architecture.
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     # Freeze the embedder: it serves as a fixed feature extractor.
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     # Downstream layers size themselves from the encoder's hidden size.
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
# Beispiel #5 (score: 0)
 def test_model_from_pretrained(self):
     """Each sampled archive checkpoint should load into a model object."""
     for checkpoint in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         loaded = FlaubertModel.from_pretrained(checkpoint)
         self.assertIsNotNone(loaded)
# Beispiel #6 (score: 0)
import torch
from transformers import FlaubertModel, FlaubertTokenizer
from preprocess import preprocess
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

# NOTE(review): 'flaubert-base-uncased' is a legacy shortcut name; recent
# transformers releases expect 'flaubert/flaubert_base_uncased'. Confirm the
# pinned transformers version still resolves this id.
modelname = 'flaubert-base-uncased'

# Load pretrained model and tokenizer.
# output_loading_info=True makes from_pretrained also return a dict of
# loading diagnostics (missing/unexpected keys), captured here as `log`.
flaubert, log = FlaubertModel.from_pretrained(modelname,
                                              output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname,
                                                       do_lowercase=True)


def get_flo_vec(q):
    """Mean-pool Flaubert's last hidden layer over the tokens of query *q*."""
    cleaned = preprocess(q, lower=True)
    ids = flaubert_tokenizer.encode(cleaned)
    batch = torch.tensor([ids])
    hidden = flaubert(batch)[0]
    # Drop the first and last positions (presumably the special tokens).
    inner = hidden[:, 1:-1, :]
    return inner.detach().numpy().mean(axis=1)


def build_flo_mat(titles_processed):
    """Return a list of pooled Flaubert vectors, one per processed title."""
    vectors = []
    for title in tqdm(titles_processed):
        vectors.append(get_flo_vec(title).squeeze())
    return vectors


class Predictor():
    """Precomputes and stores the Flaubert vector matrix for *titles*."""
    def __init__(self, titles):
        # NOTE(review): _build_flo_mat is not defined anywhere in this
        # excerpt; only the module-level build_flo_mat is visible. Either
        # the method exists beyond this view or this line should call
        # build_flo_mat(titles) — confirm before relying on this class.
        self.mat = self._build_flo_mat(titles)
# Beispiel #7 (score: 0)
def main():
    """Extract contextual Flaubert vectors for base/collocate pairs.

    Reads a TSV of sentences annotated with a base word and a collocate,
    runs Flaubert over each sentence, locates the two target tokens, and
    writes their contextual vectors plus metadata to the output TSV.
    """
    usage = """<documentation>"""
    parser = argparse.ArgumentParser(
        description=usage, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", type=str, help="Path to input tsv file")
    parser.add_argument("output", type=str, help="Path to output")
    args = parser.parse_args()
    fic_input = args.input  # input file
    fic_output = args.output  # output file

    # NOTE(review): neither handle is wrapped in `with`; `tableau` is never
    # closed at all — consider context managers.
    tableau = open(fic_input, 'r', encoding='utf8')
    next(tableau)  # skip the table's header line
    output = open(fic_output, 'w', encoding='utf8')
    output.write("Ligne" + "\t" + "Compositionnel" + "\t"
                 "Base" + "\t" + "Genre base" + "\t" + "Vecteur base" + "\t" +
                 "Collocatif" + "\t" + "Vecteur collocatif" + "\t" + "Phrase" +
                 "\n")  # column headers of the output file

    # Load Flaubert.
    # NOTE(review): "flaubert-base-cased" is a legacy shortcut id; newer
    # transformers expect "flaubert/flaubert_base_cased" — confirm.
    model_id = "flaubert-base-cased"
    tokenizer = FlaubertTokenizer.from_pretrained(model_id,
                                                  do_lower_case=False)
    flaubert = FlaubertModel.from_pretrained(model_id)
    flaubert.eval()

    # Walk through the file.
    cpt = 2  # line number of the current sentence; the first sentence is on line 2 of the input file.
    for ligne in tableau:
        numero_ligne_phrase = cpt  # line number this sentence sits on
        decoupage = ligne.split('\t')  # split the columns on the tab character
        base = decoupage[0]
        genre_base = decoupage[1]
        nb_base = decoupage[2]
        collocatif = decoupage[3]
        genre_colloc = decoupage[4]
        nb_colloc = decoupage[5]
        lemme_colloc = decoupage[6]
        trait_compositionnel = decoupage[7]
        phrase = decoupage[8]

        # Tokenisation with Flaubert.
        id_base = tokenizer.encode(base)[
            1]  # id of the base (middle id, surrounded by "1" and "1")
        id_collocatif = tokenizer.encode(collocatif)[
            1]  # id of the collocate (middle id, surrounded by "1" and "1")
        id_phrase = tokenizer.encode(
            phrase
        )  # ids of the sentence's tokens in Flaubert's vocabulary

        tableau_indice = {
        }  # per-sentence token-index table. key = token position in the sentence, value = id in Flaubert's vocabulary
        nb_occurrences = {
        }  # per-sentence occurrence counts. key = id in Flaubert's vocabulary, value = number of occurrences

        # Run pytorch + Flaubert on each sentence.
        token_ids = torch.tensor(
            [id_phrase])  # build a one-sentence batch matrix
        contextual_vectors = flaubert(token_ids)[
            0]  # compute the contextual vectors
        contextual_vectors = contextual_vectors.squeeze(
            0)  # drop the leading batch dimension
        recovered_tokens = tokenizer.convert_ids_to_tokens(
            id_phrase
        )  # recovered tokens (sometimes sub-word pieces, sometimes whole words)

        # Walk the sentence token by token to count occurrences.
        # NOTE(review): the range stops at len - 1, so the last id is
        # skipped — presumably to ignore the trailing special token; confirm.
        for i in range(0, len(id_phrase) - 1):
            id_token = id_phrase[i]
            tableau_indice[i] = id_token
            if id_token in nb_occurrences:
                nb_occurrences[id_token] += 1
            else:
                nb_occurrences[id_token] = 1

        # Case: a single occurrence of both the base and the collocate.
        if nb_occurrences[id_base] == 1 and nb_occurrences[id_collocatif] == 1:
            resultat_colloc = id_collocatif
            resultat_base = id_base
            for tok in tableau_indice.keys():
                if tableau_indice[tok] == id_base:
                    place_tok_un = tok
                elif tableau_indice[tok] == id_collocatif:
                    place_tok_deux = tok

        # Case: the base appears several times in the sentence.
        elif nb_occurrences[
                id_base] > 1:  # the base appears more than once in this sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_base, id_collocatif, True
            )  # resultat_base will hold id_base, and resultat_colloc will hold id_collocatif
        # Case: the collocate appears several times.
        elif nb_occurrences[
                id_collocatif] > 1:  # the collocate appears more than once in this sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_collocatif, id_base, False
            )  # resultat_base will hold id_collocatif, and resultat_colloc will hold id_base
        # NOTE(review): place_tok_un/place_tok_deux (and the vectors below)
        # keep stale values from a previous iteration if no branch matched.
        for i in range(0, len(recovered_tokens) - 1):
            if i == place_tok_un:  # the token being read is the sentence's base/collocate
                # ~ tok_un = recovered_tokens[i]  # token 1 as segmented by Flaubert
                vecteur_tok_un = contextual_vectors[
                    i]  # grab the contextual vector of the token being read
                tok_lu_un = base
            if i == place_tok_deux:  # the token being read is the sentence's base/collocate
                # ~ tok_deux = recovered_tokens[i]  # token 2 as segmented by Flaubert
                vecteur_tok_deux = contextual_vectors[
                    i]  # grab the contextual vector of the token being read
                tok_lu_deux = collocatif
        # Write the line number, token1, token1's vector, token2, token2's
        # vector, and the full sentence.
        output.write(
            str(numero_ligne_phrase) + "\t" + trait_compositionnel + "\t" +
            tok_lu_un + "\t" + genre_base + "\t" +
            " ".join(map(str, vecteur_tok_un.numpy())) + "\t" + tok_lu_deux +
            "\t" + " ".join(map(str, vecteur_tok_deux.numpy())) + "\t" +
            phrase + "\n")
        cpt += 1

    output.close()
# Beispiel #8 (score: 0)
import pandas as pd
from Embedder import getContextualEmbedding, concatEmbeddingEn
from transformers import BertModel
from transformers import FlaubertModel
from transformers import BertTokenizer
from transformers import FlaubertTokenizer
import os

# Choose the language-specific pretrained encoder + tokenizer.
LANG = "EN"
if LANG == "FR":
    tokenizer = FlaubertTokenizer.from_pretrained(
        'flaubert/flaubert_base_cased', do_lowercase=False)
    # output_loading_info=True also returns a loading-diagnostics dict.
    model, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased',
                                               output_loading_info=True)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lowercase=False)
    model = BertModel.from_pretrained('bert-base-cased')

# Read the whole dump; a context manager guarantees the handle is closed
# even if read() raises (the original open/read/close leaked on error).
with open("wiki.dump", "r") as my_file:
    content = my_file.read()
dfWiki = pd.DataFrame()
number = len(content.split())
i = 0

print("Start !", flush=True)
p = content.split('\n')
print("{} articles to processed".format(len(p)), flush=True)
for sentence in p:
    for sent in range(len(sentence.split()) - 500):
# Beispiel #9 (score: 0)
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import os
import random
import wandb
from transformers import FlaubertModel, FlaubertTokenizer
from pytorchtools import EarlyStopping
# Pin CUDA device enumeration to PCI bus order and expose only GPU 0.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"



# Flaubert base (uncased), with hidden states of every layer exposed.
# NOTE(review): the checkpoint is *uncased* but the tokenizer is built with
# do_lower_case=False — confirm this mismatch is intentional.
model_id = "flaubert/flaubert_base_uncased"
tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lower_case=False)
flaubert = FlaubertModel.from_pretrained(model_id, output_hidden_states=True)


# Weights & Biases experiment tracking.
wandb.init(project="FNN")
wandb.watch_called = False
config = wandb.config

# Hyper-parameters.
# dim_input = 3072  # concatenation of 4 layers (presumably; confirm)
dim_input = 768
dim_hidden = 100
config.epochs = 100
patience = 20
config.seed = 42

# Beispiel #10 (score: 0)
    tokenizer_out = tokenizer(documents,
                              add_special_tokens=True,
                              max_length=MAX_LEN,
                              return_token_type_ids=False,
                              padding='max_length',
                              return_attention_mask=True,
                              return_tensors='pt',
                              truncation=True)
    label = torch.tensor(label, dtype=torch.long)
    # tokenizer_out est un dictionnaire qui contient 2 clés: input_ids et attention_mask
    return tokenizer_out, label  # on renvoie un tuple à 2 éléments


# build the classification model
# Flaubert base (cased) checkpoint used as the encoder below.
PRE_TRAINED_MODEL_NAME = 'flaubert/flaubert_base_cased'
flaubert = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# NOTE(review): the consumers of this flag are outside this excerpt;
# presumably it toggles freeze_encoder on the classifier.
FREEZE_PRETRAINED_MODEL = True


class FlaubertForCourtDecisionClassification(torch.nn.Module):
    def __init__(self,
                 config,
                 num_labels,
                 freeze_encoder=False,
                 lstm_hidden=300):
        # instantiate Flaubert model
        #super().__init__(config)
        super(FlaubertForCourtDecisionClassification, self).__init__()
        # instantiate num. of classes
        self.num_labels = num_labels
        # instantiate and load a pretrained Flaubert model as encoder
# Beispiel #11 (score: 0)
 def test_model_from_pretrained(self):
     """The first archive-map checkpoint should load into a model object."""
     sampled_names = list(FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]
     for checkpoint in sampled_names:
         loaded = FlaubertModel.from_pretrained(checkpoint,
                                                cache_dir=CACHE_DIR)
         self.assertIsNotNone(loaded)
    def create(cls,
               model_type='camem',
               model_name="camembert-base",
               embedding_size=768,
               hidden_dim=512,
               rnn_layers=1,
               lstm_dropout=0.5,
               device="cuda",
               mode="weighted",
               key_dim=64,
               val_dim=64,
               num_heads=3,
               attn_dropout=0.3,
               self_attention=False,
               is_require_grad=False):
        """Build a model: pretrained transformer encoder + BiLSTM + attention.

        Args:
            model_type: substring-matched family tag — 'camem', 'flaubert',
                'XLMRoberta' or 'M-Bert'.
            model_name: checkpoint id passed to from_pretrained.
            embedding_size, hidden_dim, rnn_layers, lstm_dropout: BiLSTM
                configuration.
            device: device the encoder is moved to.
            mode, self_attention: stored in the configuration dict.
            key_dim, val_dim, num_heads, attn_dropout: multi-head attention
                configuration.
            is_require_grad: recorded as 'is_freeze' in the configuration.

        Returns:
            A fully wired instance of *cls*.

        Raises:
            ValueError: if *model_type* matches no supported family (the
                original code crashed later with NameError on `model`).
        """
        configuration = {
            'model_type': model_type,
            "model_name": model_name,
            "device": device,
            "mode": mode,
            "self_attention": self_attention,
            "is_freeze": is_require_grad
        }

        # Every branch requests output_hidden_states=True so all layers'
        # hidden states are available downstream.
        if 'camem' in model_type:
            config_bert = CamembertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name,
                                                   config=config_bert)
        elif 'flaubert' in model_type:
            config_bert = FlaubertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = FlaubertModel.from_pretrained(model_name,
                                                  config=config_bert)
        elif 'XLMRoberta' in model_type:
            config_bert = XLMRobertaConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = XLMRobertaModel.from_pretrained(model_name,
                                                    config=config_bert)
        elif 'M-Bert' in model_type:
            config_bert = BertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config_bert)
        else:
            # Fail fast on an unrecognised tag instead of hitting a
            # NameError on `model` further down.
            raise ValueError(f"Unsupported model_type: {model_type!r}")
        # Moving to the device is common to every branch, so it is hoisted.
        model.to(device)

        lstm = BiLSTM.create(embedding_size=embedding_size,
                             hidden_dim=hidden_dim,
                             rnn_layers=rnn_layers,
                             dropout=lstm_dropout)

        attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                                  attn_dropout)
        model.train()
        self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
        # Freezing policy is delegated to the instance.
        self.freeze()

        return self