def load_tokenizer(self):
    # Load the tokenizer.
    if self.verbose:
        print('Loading {} tokenizer...'.format(self.model_name))

    if self.model_name == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(self.model_type,
                                                       do_lower_case=True)
    elif self.model_name == 'distilbert':
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_type,
                                                             do_lower_case=True)
    elif self.model_name == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type,
                                                         do_lower_case=True)
    elif self.model_name == 'bart':
        self.tokenizer = BartTokenizer.from_pretrained(self.model_type,
                                                       do_lower_case=True)
    elif self.model_name == 'xlnet':
        self.tokenizer = XLNetTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
    elif self.model_name == 'roberta':
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_type,
                                                          do_lower_case=True)
    elif self.model_name == 'camenbert':
        self.tokenizer = CamembertTokenizer.from_pretrained(self.model_type,
                                                            do_lower_case=True)
    elif self.model_name == 'flaubert':
        self.tokenizer = FlaubertTokenizer.from_pretrained(self.model_type,
                                                           do_lower_case=True)
    elif self.model_name == 'gpt2':
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
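# A minimal usage sketch, not part of the original code: load_tokenizer above is a
# method of a wrapper class that is not shown here, so types.SimpleNamespace stands in
# for an instance purely for illustration. It also assumes the tokenizer classes
# (FlaubertTokenizer, etc.) are imported from transformers at module level.
from types import SimpleNamespace

wrapper = SimpleNamespace(model_name='flaubert',
                          model_type='flaubert/flaubert_base_uncased',
                          verbose=True)
load_tokenizer(wrapper)          # call the plain function with the stand-in `self`
print(type(wrapper.tokenizer))   # expected: FlaubertTokenizer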
def init_model(self, model='flaubert', device=None, log=False):
    # Choosing device for language model
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = device

    try:
        # Flaubert model
        if model == 'flaubert':
            model_name = 'flaubert/flaubert_large_cased'
            flaubert = FlaubertModel.from_pretrained(model_name)
            tokenizer = FlaubertTokenizer.from_pretrained(model_name,
                                                          do_lowercase=False)
            self.model = flaubert
            self.tokenizer = tokenizer
            self.model_name = model_name
        # Camembert model
        elif model == 'camembert':
            model_name = 'camembert'
            self.model = torch.hub.load('pytorch/fairseq', 'camembert')
            self.model_name = model_name
    except:
        print(f'Error while loading the {model} model.')
        return

    # Model inference
    self.model.to(self.device)
    self.model.eval()

    # Log info
    if log:
        self.init_log(self.model_name, self.device)
def create(cls,
           data_file,
           image_dir,
           transform,
           labels_path,
           pad_idx=0,
           tokenizer=None,
           model_type=None,
           min_char_len=1,
           max_seq_length=510,
           model_name="camembert-base",
           clear_cache=False,
           is_cls=True):
    if tokenizer is None:
        if 'camem' in model_type:
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
        elif 'flaubert' in model_type:
            tokenizer = FlaubertTokenizer.from_pretrained(model_name)
        elif 'XLMRoberta' in model_type:
            tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        elif 'M-Bert' in model_type:
            tokenizer = BertTokenizer.from_pretrained(model_name)

    with open(data_file, 'rb') as f:
        data = pickle.load(f)
    # data = data_file

    idx2labels, labels2idx = cls.create_labels(labels_path)

    config = {
        "min_char_len": min_char_len,
        "model_name": model_name,
        "max_sequence_length": max_seq_length,
        "clear_cache": clear_cache,
        "pad_idx": pad_idx,
        "is_cls": is_cls,
        "idx2labels": idx2labels,
        "labels2idx": labels2idx
    }

    self = cls(data, image_dir, transform, tokenizer, config)
    return self
def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if auto_model is None:
        auto_model = ""

    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert2" in auto_model:
        from transformers import FlaubertModel, FlaubertTokenizer
        self.auto_embeddings = FlaubertModel.from_pretrained(auto_path)
        self.auto_tokenizer = FlaubertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "roberta" in auto_model:
        from transformers import RobertaModel, RobertaTokenizer
        self.auto_embeddings = RobertaModel.from_pretrained(auto_path)
        self.auto_tokenizer = RobertaTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False

    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
import torch
from transformers import FlaubertModel, FlaubertTokenizer
from preprocess import preprocess
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

modelname = 'flaubert-base-uncased'

# Load pretrained model and tokenizer
flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=True)


def get_flo_vec(q):
    query = preprocess(q, lower=True)
    token_ids = torch.tensor([flaubert_tokenizer.encode(query)])
    last_layer = flaubert(token_ids)[0][:, 1:-1, :]
    return last_layer.detach().numpy().mean(axis=1)


def build_flo_mat(titles_processed):
    f_mat = [get_flo_vec(t).squeeze() for t in tqdm(titles_processed)]
    return f_mat


class Predictor():
    def __init__(self, titles):
        self.mat = self._build_flo_mat(titles)
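# A minimal follow-up sketch, not from the original script: it compares two FlauBERT
# sentence vectors produced by get_flo_vec with the cosine_similarity import already
# present above. The example strings are made up, and the local `preprocess` module
# used by get_flo_vec is assumed to be available.
v1 = get_flo_vec("réservation d'un billet de train")
v2 = get_flo_vec("acheter un billet de train")
print(cosine_similarity(v1, v2))  # 1x1 matrix holding the similarity score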
from transformers import FlaubertWithLMHeadModel, FlaubertTokenizer
import torch
from torch.nn import functional as F
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_uncased")

handle = open("mask682.txt", "r")
handle = handle.readlines()
fichier = open("score de prediction682.txt", "w")

for line in handle:
    line = line.strip()
    coupe = line.split("**")
    mot = coupe[0]
    phrase = coupe[1]
    sequence = eval(f"f'''{phrase}'''")
    token_ids = tokenizer.encode(sequence, return_tensors='pt')
    mask_token_index = torch.where(token_ids == tokenizer.mask_token_id)[1]
    token_logits = model(token_ids).logits
    softmax = F.softmax(token_logits, dim=-1)
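# A hypothetical continuation, not in the original snippet: it ranks the five most
# probable fillers for the masked position using the `softmax` scores and
# `mask_token_index` computed in the last loop iteration above.
mask_probs = softmax[0, mask_token_index, :]   # probabilities at the mask position(s)
top_k = torch.topk(mask_probs, 5, dim=1)
for prob, token_id in zip(top_k.values[0], top_k.indices[0]):
    print(tokenizer.convert_ids_to_tokens(int(token_id)), float(prob))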
import argparse

import torch
from transformers import FlaubertModel, FlaubertTokenizer
# base_collocatif_plus_proche is a helper defined elsewhere in this script.


def main():
    usage = """<documentation>"""
    parser = argparse.ArgumentParser(
        description=usage, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", type=str, help="Path to input tsv file")
    parser.add_argument("output", type=str, help="Path to output")
    args = parser.parse_args()
    fic_input = args.input    # input file
    fic_output = args.output  # output file

    tableau = open(fic_input, 'r', encoding='utf8')
    next(tableau)  # skip the first line (header) of the table
    output = open(fic_output, 'w', encoding='utf8')
    output.write("Ligne" + "\t" + "Compositionnel" + "\t" "Base" + "\t" +
                 "Genre base" + "\t" + "Vecteur base" + "\t" + "Collocatif" +
                 "\t" + "Vecteur collocatif" + "\t" + "Phrase" +
                 "\n")  # column headers of the output file

    # load FlauBERT
    model_id = "flaubert-base-cased"
    tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lower_case=False)
    flaubert = FlaubertModel.from_pretrained(model_id)
    flaubert.eval()

    # walk through the file
    cpt = 2  # counter telling which sentence of the file we are on; the first sentence is line 2 of the input file
    for ligne in tableau:
        numero_ligne_phrase = cpt  # line number of the current sentence
        decoupage = ligne.split('\t')  # split the columns on tabulations
        base = decoupage[0]
        genre_base = decoupage[1]
        nb_base = decoupage[2]
        collocatif = decoupage[3]
        genre_colloc = decoupage[4]
        nb_colloc = decoupage[5]
        lemme_colloc = decoupage[6]
        trait_compositionnel = decoupage[7]
        phrase = decoupage[8]

        # tokenization with FlauBERT
        id_base = tokenizer.encode(base)[1]         # id of the base (middle id, surrounded by "1" and "1")
        id_collocatif = tokenizer.encode(collocatif)[1]  # id of the collocate (middle id, surrounded by "1" and "1")
        id_phrase = tokenizer.encode(phrase)        # FlauBERT vocabulary ids of the sentence tokens
        tableau_indice = {}   # per-sentence token indices: key = token position in the sentence, value = id in the FlauBERT vocabulary
        nb_occurrences = {}   # per-sentence occurrences: key = id in the FlauBERT vocabulary, value = occurrence count

        # run PyTorch and FlauBERT on each sentence
        token_ids = torch.tensor([id_phrase])  # build a tensor for the sentence
        with torch.no_grad():  # no gradients are needed, and this lets .numpy() be called on the vectors below
            contextual_vectors = flaubert(token_ids)[0]  # compute the contextual vectors
        contextual_vectors = contextual_vectors.squeeze(0)  # drop the first (batch) dimension
        recovered_tokens = tokenizer.convert_ids_to_tokens(
            id_phrase)  # reconstructed tokens (sometimes subword pieces, sometimes whole tokens)

        # walk token by token through the sentence to count occurrences
        for i in range(0, len(id_phrase) - 1):
            id_token = id_phrase[i]
            tableau_indice[i] = id_token
            if id_token in nb_occurrences:
                nb_occurrences[id_token] += 1
            else:
                nb_occurrences[id_token] = 1

        # case where the base and the collocate each occur only once
        if nb_occurrences[id_base] == 1 and nb_occurrences[id_collocatif] == 1:
            resultat_colloc = id_collocatif
            resultat_base = id_base
            for tok in tableau_indice.keys():
                if tableau_indice[tok] == id_base:
                    place_tok_un = tok
                elif tableau_indice[tok] == id_collocatif:
                    place_tok_deux = tok
        # case where the base appears several times in the sentence
        elif nb_occurrences[id_base] > 1:  # the base appears more than once in the sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_base, id_collocatif,
                True)  # resultat_base will hold id_base, resultat_colloc will hold id_collocatif
        # case where the collocate appears several times
        elif nb_occurrences[id_collocatif] > 1:  # the collocate appears more than once in the sentence
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_collocatif, id_base,
                False)  # resultat_base will hold id_collocatif, resultat_colloc will hold id_base

        for i in range(0, len(recovered_tokens) - 1):
            if i == place_tok_un:  # the token being read is the base/collocate of the sentence
                # ~ tok_un = recovered_tokens[i]  # token 1 as segmented by FlauBERT
                vecteur_tok_un = contextual_vectors[i]  # keep the vector of the token being read
                tok_lu_un = base
            if i == place_tok_deux:  # the token being read is the base/collocate of the sentence
                # ~ tok_deux = recovered_tokens[i]  # token 2 as segmented by FlauBERT
                vecteur_tok_deux = contextual_vectors[i]  # keep the vector of the token being read
                tok_lu_deux = collocatif

        # write the line number, token 1, token 1 vector, token 2, token 2 vector and the full sentence
        output.write(
            str(numero_ligne_phrase) + "\t" + trait_compositionnel + "\t" +
            tok_lu_un + "\t" + genre_base + "\t" +
            " ".join(map(str, vecteur_tok_un.numpy())) + "\t" + tok_lu_deux +
            "\t" + " ".join(map(str, vecteur_tok_deux.numpy())) + "\t" +
            phrase + "\n")
        cpt += 1
    output.close()
import bson
import numpy as np
import pymongo
from tqdm import tqdm
from transformers import FlaubertTokenizer

from utils import post_token_classification
from flaubert_token_classification import TFFlaubertForTokenClassification

model = TFFlaubertForTokenClassification.from_pretrained("../models/ner")
tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")

SEQUENCE_LENGTH = 64
ner_labels = ["LOC", "MISC", "ORG", "PER", "O"]

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["bert_clustering"]
articles = db["articles"].find()

for article in tqdm(articles):
    sentence = tokenizer.tokenize(article["raw"])
    input_tokens = tokenizer.encode_plus(
        article["raw"],
        max_length=SEQUENCE_LENGTH,
        pad_to_max_length=SEQUENCE_LENGTH,
        add_special_tokens=True,
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
    )
import pandas as pd
from Embedder import getContextualEmbedding, concatEmbeddingEn
from transformers import BertModel
from transformers import FlaubertModel
from transformers import BertTokenizer
from transformers import FlaubertTokenizer
import os

LANG = "EN"

if LANG == "FR":
    tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased',
                                                  do_lowercase=False)
    model, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased',
                                               output_loading_info=True)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lowercase=False)
    model = BertModel.from_pretrained('bert-base-cased')

my_file = open("wiki.dump", "r")
content = my_file.read()
my_file.close()

dfWiki = pd.DataFrame()
number = len(content.split())
i = 0
print("Start !", flush=True)

p = content.split('\n')
print("{} articles to process".format(len(p)), flush=True)

for sentence in p:
    for sent in range(len(sentence.split()) - 500):
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import os
import random
import wandb
from transformers import FlaubertModel, FlaubertTokenizer
from pytorchtools import EarlyStopping

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_id = "flaubert/flaubert_base_uncased"
tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lower_case=False)
flaubert = FlaubertModel.from_pretrained(model_id, output_hidden_states=True)

wandb.init(project="FNN")
wandb.watch_called = False
config = wandb.config

# hyperparameters
# dim_input = 3072  # concatenation of 4 layers
dim_input = 768
dim_hidden = 100
config.epochs = 100
patience = 20
config.seed = 42
for i in range(len(description)):
    document = " ".join(description[i])
    inputs.append(document)

# sentence segmentation for each document
nlp = spacy.load("fr_core_news_sm")
doc_sent = []
for i in range(len(inputs)):
    doc = nlp(inputs[i])
    sentences = [sent.text for sent in doc.sents]
    doc_sent.append(sentences)

# define the sentence tokenizer
MAX_LEN = 512
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased',
                                              padding=True,
                                              truncation=True)


def get_tokenized(documents, label):
    tokenizer_out = tokenizer(documents,
                              add_special_tokens=True,
                              max_length=MAX_LEN,
                              return_token_type_ids=False,
                              padding='max_length',
                              return_attention_mask=True,
                              return_tensors='pt',
                              truncation=True)
    label = torch.tensor(label, dtype=torch.long)
    # tokenizer_out is a dictionary with two keys: input_ids and attention_mask
    return tokenizer_out, label  # return a 2-element tuple
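# A minimal usage sketch, not in the original script: it encodes the sentences of the
# first document with get_tokenized. `dummy_labels` is introduced here purely for
# illustration; real labels would come from the dataset.
dummy_labels = [0] * len(doc_sent[0])
encodings, label_tensor = get_tokenized(doc_sent[0], dummy_labels)
print(encodings["input_ids"].shape, label_tensor.shape)  # (n_sentences, 512) and (n_sentences,)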
import json

from tqdm import tqdm
from transformers import FlaubertTokenizer


def parse_dawt(label_map, max_seq_length=64, pad_token_label_id=0):
    sample = []
    labels = []
    label_counts = {key: 0 for key, _ in label_map.items()}
    annotations = open(
        "../../opendata/wiki_annotation/wiki_annotations_json_sample_fr")
    tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")

    for annotation in tqdm(annotations):
        a = json.loads(annotation)
        if "entities" not in a or "tokens" not in a:
            continue
        entities = a["entities"]
        tokens = a["tokens"]
        del a

        # Add entities to tokens
        for entity in entities:
            i = entity["start_position"]
            token = tokens[i]
            if "type" not in entity:
                continue
            entity.pop("id_str", None)
            if "entity" not in token and (entity["end_position"] -
                                          entity["start_position"] == 0):
                token["entity"] = entity
            i += 1

        word_tokens = []
        label_ids = []
        for idx, token in enumerate(tokens):
            word = token["raw_form"]
            label = token["entity"]["type"] if "entity" in token else 'O'
            # restore the apostrophe on elided French articles/pronouns
            if idx != len(tokens) and word.lower() in [
                    "l", "d", "s", "t", "n", "j", "m", "n"
            ]:
                word += "\'"
                tokens[idx]["raw_form"] = word
            word_token = tokenizer.tokenize(word)
            word_tokens.extend(word_token)
            label_ids.extend([label_map[label]] + [0] * (len(word_token) - 1))
            label_counts[label] += 1 * len(word_token)

            if "section_break" in token:
                # flush the accumulated tokens as one padded training sample
                word_tokens = [tokenizer.cls_token] + word_tokens + [tokenizer.sep_token]
                padding_length = max_seq_length - len(word_tokens)
                label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]
                input_ids = tokenizer.convert_tokens_to_ids(
                    word_tokens + [tokenizer.pad_token] * padding_length)
                label_ids += [pad_token_label_id] * padding_length
                sample.append(input_ids[:max_seq_length])
                labels.append(label_ids[:max_seq_length])
                word_tokens = []
                label_ids = []

    return sample, labels, label_counts


# if __name__ == "__main__":
#     s, l = parse_dawt()
#     print(s[0])
#     print(l[0])
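# A hypothetical usage sketch, not part of the original file: parse_dawt expects a
# label-to-index map covering the NER tags found in the annotations. The mapping below
# and the availability of the annotation file opened above are assumptions.
label_map = {"O": 0, "PER": 1, "LOC": 2, "ORG": 3, "MISC": 4}
samples, labels, label_counts = parse_dawt(label_map, max_seq_length=64)
print(len(samples), label_counts)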
    else:
        model_name = "bert-base-uncased"
        tokenizer = BertTokenizer.from_pretrained(model_name,
                                                  do_lower_case=do_lower_case)
elif language == "el":
    if args.cased:
        print("model not available")
    else:
        model_name = "nlpaueb/bert-base-greek-uncased-v1"
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  do_lower_case=do_lower_case)
elif language == "fr":
    if args.cased:
        print("model not available")
    else:
        model_name = "flaubert/flaubert_base_uncased"
        tokenizer = FlaubertTokenizer.from_pretrained(model_name,
                                                      do_lower_case=do_lower_case)
elif language == "es":
    if args.cased:
        print("model not available")
    else:
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  do_lower_case=do_lower_case)

torch.set_default_tensor_type(torch.cuda.FloatTensor)
torch.manual_seed(0)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

batch_size = 1
toplayer = 13