def __init__(self, config, num_labels, freeze_encoder=False, lstm_hidden=300):
    # instantiate Flaubert model
    #super().__init__(config)
    super(FlaubertForCourtDecisionClassification, self).__init__()
    # number of target classes
    self.num_labels = num_labels
    # load a pretrained Flaubert model as the encoder
    self.encoder = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # freeze the encoder parameters if required
    if freeze_encoder:
        for param in self.encoder.parameters():
            param.requires_grad = False
    # the classifier: a feed-forward layer attached to the encoder's head;
    # in_features is lstm_hidden * 2 because the LSTM below is bidirectional
    self.classifier = torch.nn.Linear(in_features=lstm_hidden * 2,
                                      out_features=self.num_labels,
                                      bias=True)
    # dropout applied to the classifier's input
    self.dropout = torch.nn.Dropout(p=0.1)
    self.loss = torch.nn.CrossEntropyLoss()
    # a bidirectional LSTM over the encoder's hidden states
    # (input_size, hidden_size, num_layers)
    self.lstm = torch.nn.LSTM(self.encoder.config.hidden_size,
                              lstm_hidden,
                              1,
                              batch_first=True,
                              bidirectional=True)
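Only the constructor appears above. As a minimal sketch (not the author's code), a forward pass consistent with these layers could mean-pool the bidirectional LSTM outputs before the classifier; the signature and the pooling choice are assumptions:

def forward(self, input_ids, attention_mask=None, labels=None):
    # hypothetical forward pass; the original snippet defines only __init__
    hidden_states = self.encoder(input_ids, attention_mask=attention_mask)[0]
    lstm_out, _ = self.lstm(hidden_states)          # (batch, seq, 2 * lstm_hidden)
    pooled = lstm_out.mean(dim=1)                   # mean-pool over the sequence
    logits = self.classifier(self.dropout(pooled))  # (batch, num_labels)
    if labels is not None:
        return self.loss(logits, labels), logits
    return logits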
def init_model(self, model='flaubert', device=None, log=False):
    # choose the device for the language model
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = device
    try:
        # Flaubert model
        if model == 'flaubert':
            model_name = 'flaubert/flaubert_large_cased'
            flaubert = FlaubertModel.from_pretrained(model_name)
            tokenizer = FlaubertTokenizer.from_pretrained(model_name,
                                                          do_lowercase=False)
            self.model = flaubert
            self.tokenizer = tokenizer
            self.model_name = model_name
        # Camembert model
        elif model == 'camembert':
            model_name = 'camembert'
            self.model = torch.hub.load('pytorch/fairseq', 'camembert')
            self.model_name = model_name
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        print(f'Error while loading the {model} model.')
        return
    # model inference
    self.model.to(self.device)
    self.model.eval()
    # log info
    if log:
        self.init_log(self.model_name, self.device)
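Assuming this method sits on some wrapper class (called Embedder here purely for illustration), usage could look like:

# hypothetical usage; the enclosing class is not shown in the snippet above
embedder = Embedder()
embedder.init_model(model='flaubert', log=True)  # loads flaubert/flaubert_large_cased
print(embedder.model_name, embedder.device)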
def __init__(self, config, pos_weight=None):
    super(FlaubertForMultiLabelSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.pos_weight = pos_weight
    self.transformer = FlaubertModel(config)
    self.sequence_summary = SequenceSummary(config)
    self.init_weights()
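The forward pass is not part of the excerpt. A minimal sketch for the multi-label case, assuming multi-hot float labels and that pos_weight is intended for BCEWithLogitsLoss, might be:

def forward(self, input_ids, attention_mask=None, labels=None):
    # hypothetical forward; the original snippet shows only __init__
    hidden_states = self.transformer(input_ids, attention_mask=attention_mask)[0]
    logits = self.sequence_summary(hidden_states)  # (batch, num_labels) via the summary head
    if labels is not None:
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)
        return loss_fct(logits, labels.float()), logits
    return logits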
def test_inference_no_head_absolute_embedding(self):
    model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased")
    input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11, 768))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[[-2.6251, -1.4298, -0.0227], [-2.8510, -1.6387, 0.2258], [-2.8114, -1.1832, -0.3066]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def create_and_check_flaubert_model(
    self,
    config,
    input_ids,
    token_type_ids,
    input_lengths,
    sequence_labels,
    token_labels,
    is_impossible_labels,
    choice_labels,
    input_mask,
):
    model = FlaubertModel(config=config)
    model.to(torch_device)
    model.eval()
    outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
    outputs = model(input_ids, langs=token_type_ids)
    outputs = model(input_ids)
    sequence_output = outputs[0]
    result = {
        "sequence_output": sequence_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if auto_model is None:
        auto_model = ""
    # dispatch on substrings of auto_model; note that "flaubert2" must be
    # tested before "flaubert", since the latter is a substring of the former
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert2" in auto_model:
        from transformers import FlaubertModel, FlaubertTokenizer
        self.auto_embeddings = FlaubertModel.from_pretrained(auto_path)
        self.auto_tokenizer = FlaubertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        # legacy path: load FlauBERT through the XLM-compatible classes
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "roberta" in auto_model:
        from transformers import RobertaModel, RobertaTokenizer
        self.auto_embeddings = RobertaModel.from_pretrained(auto_path)
        self.auto_tokenizer = RobertaTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False
    # freeze the pretrained embeddings
    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
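Since dispatch is by substring, construction is driven entirely by the two strings; "flaubert2" routes to the native Flaubert classes, while plain "flaubert" keeps the older XLM classes. A hypothetical call, with the enclosing class name assumed:

# hypothetical usage; AutoEmbedder is a placeholder for the class this __init__ belongs to
emb = AutoEmbedder(auto_model="flaubert2", auto_path="flaubert/flaubert_base_cased")
print(emb._output_dim)  # hidden size of the frozen encoder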
def create_and_check_flaubert_model(
    self,
    config,
    input_ids,
    token_type_ids,
    input_lengths,
    sequence_labels,
    token_labels,
    is_impossible_labels,
    choice_labels,
    input_mask,
):
    model = FlaubertModel(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
    result = model(input_ids, langs=token_type_ids)
    result = model(input_ids)
    self.parent.assertEqual(result.last_hidden_state.shape,
                            (self.batch_size, self.seq_length, self.hidden_size))
def test_model_from_pretrained(self):
    for model_name in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = FlaubertModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
import torch
from transformers import FlaubertModel, FlaubertTokenizer
from preprocess import preprocess
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# canonical hub id; the snippet originally used the legacy name 'flaubert-base-uncased'
modelname = 'flaubert/flaubert_base_uncased'

# Load pretrained model and tokenizer
flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=True)


def get_flo_vec(q):
    query = preprocess(q, lower=True)
    token_ids = torch.tensor([flaubert_tokenizer.encode(query)])
    # drop the special tokens at both ends and average the last layer
    last_layer = flaubert(token_ids)[0][:, 1:-1, :]
    return last_layer.detach().numpy().mean(axis=1)


def build_flo_mat(titles_processed):
    f_mat = [get_flo_vec(t).squeeze() for t in tqdm(titles_processed)]
    return f_mat


class Predictor():
    def __init__(self, titles):
        # fixed: the original called an undefined self._build_flo_mat
        self.mat = build_flo_mat(titles)
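cosine_similarity is imported above but unused in the excerpt; a minimal retrieval method for Predictor (the method name and its attachment are assumptions) could be:

# hypothetical completion of the Predictor class above
def predict(self, query, top_k=5):
    # rank stored title vectors by cosine similarity to the query vector
    sims = cosine_similarity(get_flo_vec(query), self.mat)[0]
    return sims.argsort()[::-1][:top_k]  # indices of the top_k closest titles

Predictor.predict = predict  # attach the sketch to the class defined above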
def main():
    usage = """<documentation>"""
    parser = argparse.ArgumentParser(
        description=usage, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", type=str, help="Path to input tsv file")
    parser.add_argument("output", type=str, help="Path to output")
    args = parser.parse_args()
    fic_input = args.input    # input file
    fic_output = args.output  # output file
    tableau = open(fic_input, 'r', encoding='utf8')
    next(tableau)  # skip the table's header line
    output = open(fic_output, 'w', encoding='utf8')
    # column headers of the output file
    output.write("Ligne" + "\t" + "Compositionnel" + "\t" + "Base" + "\t" +
                 "Genre base" + "\t" + "Vecteur base" + "\t" + "Collocatif" +
                 "\t" + "Vecteur collocatif" + "\t" + "Phrase" + "\n")

    # load Flaubert (canonical hub id; the snippet originally used the legacy "flaubert-base-cased")
    model_id = "flaubert/flaubert_base_cased"
    # note: FlaubertTokenizer's flag is spelled do_lowercase
    tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lowercase=False)
    flaubert = FlaubertModel.from_pretrained(model_id)
    flaubert.eval()

    # walk through the file
    cpt = 2  # counter tracking which sentence we are on; the first sentence is line 2 of the input file
    for ligne in tableau:
        numero_ligne_phrase = cpt  # line number the sentence sits on
        decoupage = ligne.split('\t')  # split the columns on tabs
        base = decoupage[0]
        genre_base = decoupage[1]
        nb_base = decoupage[2]
        collocatif = decoupage[3]
        genre_colloc = decoupage[4]
        nb_colloc = decoupage[5]
        lemme_colloc = decoupage[6]
        trait_compositionnel = decoupage[7]
        phrase = decoupage[8]

        # tokenization with Flaubert
        id_base = tokenizer.encode(base)[1]  # id of the base (middle id, surrounded by special tokens "1" and "1")
        id_collocatif = tokenizer.encode(collocatif)[1]  # id of the collocate (middle id, same reason)
        id_phrase = tokenizer.encode(phrase)  # ids of the sentence tokens in Flaubert's vocabulary
        tableau_indice = {}  # per-sentence token positions: key = token index in the sentence, value = id in Flaubert's vocabulary
        nb_occurrences = {}  # per-sentence counts: key = id in Flaubert's vocabulary, value = number of occurrences

        # run PyTorch and Flaubert on each sentence
        token_ids = torch.tensor([id_phrase])  # build a matrix for each sentence
        with torch.no_grad():  # fixed: without this, .numpy() below fails on a tensor that requires grad
            contextual_vectors = flaubert(token_ids)[0]  # compute the contextual vectors
        contextual_vectors = contextual_vectors.squeeze(0)  # drop the batch dimension
        recovered_tokens = tokenizer.convert_ids_to_tokens(id_phrase)  # reconstructed tokens (sometimes subword pieces, sometimes whole tokens)

        # walk the sentence token by token to count occurrences
        for i in range(0, len(id_phrase) - 1):
            id_token = id_phrase[i]
            tableau_indice[i] = id_token
            if id_token in nb_occurrences:
                nb_occurrences[id_token] += 1
            else:
                nb_occurrences[id_token] = 1

        # case where the base and the collocate each occur exactly once
        if nb_occurrences[id_base] == 1 and nb_occurrences[id_collocatif] == 1:
            resultat_colloc = id_collocatif
            resultat_base = id_base
            for tok in tableau_indice.keys():
                if tableau_indice[tok] == id_base:
                    place_tok_un = tok
                elif tableau_indice[tok] == id_collocatif:
                    place_tok_deux = tok
        # case where a base appears several times in a sentence
        elif nb_occurrences[id_base] > 1:  # the base appears more than once in the sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_base, id_collocatif, True)
            # resultat_base will hold id_base, and resultat_colloc will hold id_collocatif
        # case where a collocate appears several times
        elif nb_occurrences[id_collocatif] > 1:  # the collocate appears more than once in the sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_collocatif, id_base, False)
            # resultat_base will hold id_collocatif, and resultat_colloc will hold id_base

        for i in range(0, len(recovered_tokens) - 1):
            if i == place_tok_un:  # the token read is the sentence's base/collocate
                # ~ tok_un = recovered_tokens[i]  # token 1 as segmented by Flaubert
                vecteur_tok_un = contextual_vectors[i]  # grab the vector of the token read
                tok_lu_un = base
            if i == place_tok_deux:  # the token read is the sentence's base/collocate
                # ~ tok_deux = recovered_tokens[i]  # token 2 as segmented by Flaubert
                vecteur_tok_deux = contextual_vectors[i]  # grab the vector of the token read
                tok_lu_deux = collocatif

        # write the line number, token1, token1 vector, token2, token2 vector, and the full sentence
        output.write(
            str(numero_ligne_phrase) + "\t" + trait_compositionnel + "\t" +
            tok_lu_un + "\t" + genre_base + "\t" +
            " ".join(map(str, vecteur_tok_un.numpy())) + "\t" + tok_lu_deux +
            "\t" + " ".join(map(str, vecteur_tok_deux.numpy())) + "\t" +
            phrase + "\n")
        cpt += 1
    output.close()
import pandas as pd
from Embedder import getContextualEmbedding, concatEmbeddingEn
from transformers import BertModel
from transformers import FlaubertModel
from transformers import BertTokenizer
from transformers import FlaubertTokenizer
import os

LANG = "EN"
if LANG == "FR":
    tokenizer = FlaubertTokenizer.from_pretrained(
        'flaubert/flaubert_base_cased', do_lowercase=False)
    model, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased',
                                               output_loading_info=True)
else:
    # fixed: BertTokenizer's flag is spelled do_lower_case; the original
    # passed do_lowercase, which is silently ignored
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    model = BertModel.from_pretrained('bert-base-cased')

my_file = open("wiki.dump", "r")
content = my_file.read()
my_file.close()

dfWiki = pd.DataFrame()
number = len(content.split())
i = 0
print("Start !", flush=True)
p = content.split('\n')
print("{} articles to process".format(len(p)), flush=True)
for sentence in p:
    for sent in range(len(sentence.split()) - 500):
        ...  # (loop body truncated in the source)
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import os
import random
import wandb
from transformers import FlaubertModel, FlaubertTokenizer
from pytorchtools import EarlyStopping

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_id = "flaubert/flaubert_base_uncased"
# note: FlaubertTokenizer's flag is spelled do_lowercase; do_lower_case is ignored
tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lower_case=False)
flaubert = FlaubertModel.from_pretrained(model_id, output_hidden_states=True)

wandb.init(project="FNN")
wandb.watch_called = False
config = wandb.config

# the hyperparameters
# dim_input = 3072  # concatenation of 4 layers
dim_input = 768
dim_hidden = 100
config.epochs = 100
patience = 20
config.seed = 42
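The network these hyperparameters feed is not shown. A minimal sketch consistent with dim_input and dim_hidden (the depth, activation, and num_labels parameter are assumptions):

# hypothetical feed-forward head; the snippet stops before the model definition
class FNN(nn.Module):
    def __init__(self, dim_input, dim_hidden, num_labels):
        super().__init__()
        self.fc1 = nn.Linear(dim_input, dim_hidden)
        self.fc2 = nn.Linear(dim_hidden, num_labels)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))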
tokenizer_out = tokenizer(documents,
                          add_special_tokens=True,
                          max_length=MAX_LEN,
                          return_token_type_ids=False,
                          padding='max_length',
                          return_attention_mask=True,
                          return_tensors='pt',
                          truncation=True)
label = torch.tensor(label, dtype=torch.long)
# tokenizer_out is a dictionary with 2 keys: input_ids and attention_mask
return tokenizer_out, label  # return a 2-element tuple


# build the classification model
PRE_TRAINED_MODEL_NAME = 'flaubert/flaubert_base_cased'
flaubert = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
FREEZE_PRETRAINED_MODEL = True


class FlaubertForCourtDecisionClassification(torch.nn.Module):
    def __init__(self, config, num_labels, freeze_encoder=False, lstm_hidden=300):
        # instantiate Flaubert model
        #super().__init__(config)
        super(FlaubertForCourtDecisionClassification, self).__init__()
        # number of target classes
        self.num_labels = num_labels
        # load a pretrained Flaubert model as the encoder
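The tokenizer_out/label return at the top of this snippet reads like the tail of a Dataset's __getitem__; a hypothetical wrapper consistent with it (the class and attribute names are assumptions):

# hypothetical Dataset around the __getitem__ tail shown above
class CourtDecisionDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx],
                             add_special_tokens=True,
                             max_length=self.max_len,
                             return_token_type_ids=False,
                             padding='max_length',
                             return_attention_mask=True,
                             return_tensors='pt',
                             truncation=True)
        # drop the batch dimension that return_tensors='pt' adds per item
        enc = {k: v.squeeze(0) for k, v in enc.items()}
        return enc, torch.tensor(self.labels[idx], dtype=torch.long)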
def test_model_from_pretrained(self):
    for model_name in list(FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = FlaubertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.assertIsNotNone(model)
def create(cls,
           model_type='camem',
           model_name="camembert-base",
           embedding_size=768,
           hidden_dim=512,
           rnn_layers=1,
           lstm_dropout=0.5,
           device="cuda",
           mode="weighted",
           key_dim=64,
           val_dim=64,
           num_heads=3,
           attn_dropout=0.3,
           self_attention=False,
           is_require_grad=False):
    configuration = {
        'model_type': model_type,
        "model_name": model_name,
        "device": device,
        "mode": mode,
        "self_attention": self_attention,
        "is_freeze": is_require_grad
    }
    # note: an unrecognized model_type leaves `model` undefined below
    if 'camem' in model_type:
        config_bert = CamembertConfig.from_pretrained(model_name,
                                                      output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'flaubert' in model_type:
        config_bert = FlaubertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
        model = FlaubertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'XLMRoberta' in model_type:
        config_bert = XLMRobertaConfig.from_pretrained(model_name,
                                                       output_hidden_states=True)
        model = XLMRobertaModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'M-Bert' in model_type:
        config_bert = BertConfig.from_pretrained(model_name,
                                                 output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    lstm = BiLSTM.create(embedding_size=embedding_size,
                         hidden_dim=hidden_dim,
                         rnn_layers=rnn_layers,
                         dropout=lstm_dropout)
    attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                              attn_dropout)
    model.train()
    self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
    # if is_freeze: self.freeze()
    return self
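A hypothetical call for the FlauBERT branch (ModelWithAttn is a placeholder for the enclosing class; BiLSTM and MultiHeadAttention come from the surrounding project):

# hypothetical usage of the factory above
ner_model = ModelWithAttn.create(model_type='flaubert',
                                 model_name='flaubert/flaubert_base_cased',
                                 embedding_size=768,
                                 device='cuda')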