def train_sbert_model(model_name,
                      mancon_corpus=False,
                      med_nli=False,
                      multi_nli=False,
                      multi_nli_train_x: np.ndarray = None,
                      multi_nli_train_y: np.ndarray = None,
                      multi_nli_test_x: np.ndarray = None,
                      multi_nli_test_y: np.ndarray = None,
                      med_nli_train_x: np.ndarray = None,
                      med_nli_train_y: np.ndarray = None,
                      med_nli_test_x: np.ndarray = None,
                      med_nli_test_y: np.ndarray = None,
                      man_con_train_y: np.ndarray = None,
                      man_con_train_x: np.ndarray = None,
                      man_con_test_x: np.ndarray = None,
                      man_con_test_y: np.ndarray = None,
                      batch_size: int = 2,
                      num_epochs: int = 1):
    """Train SBERT on any NLI dataset.

    :param model_name: model to be used; currently supported:
        deepset/covid_bert_base or biobert
    :param mancon_corpus: whether to fine-tune on the ManCon corpus, defaults to False
    :type mancon_corpus: bool, optional
    :param med_nli: whether to fine-tune on MedNLI, defaults to False
    :type med_nli: bool, optional
    :param multi_nli: whether to fine-tune on MultiNLI, defaults to False
    :type multi_nli: bool, optional
    :param multi_nli_train_x: MultiNLI training sentence pairs, defaults to None
    :type multi_nli_train_x: np.ndarray, optional
    :param multi_nli_train_y: MultiNLI training labels, defaults to None
    :type multi_nli_train_y: np.ndarray, optional
    :param multi_nli_test_x: MultiNLI validation sentence pairs, defaults to None
    :type multi_nli_test_x: np.ndarray, optional
    :param multi_nli_test_y: MultiNLI validation labels, defaults to None
    :type multi_nli_test_y: np.ndarray, optional
    :param batch_size: training batch size, defaults to 2
    :type batch_size: int, optional
    :param num_epochs: number of training epochs, defaults to 1
    :type num_epochs: int, optional
    :return: the fine-tuned SBERT model
    :rtype: SBERTPredictor
    """
    if model_name == "deepset/covid_bert_base":
        covid_bert_path = "covid_bert_path"
        model_save_path = covid_bert_path
        os.makedirs(model_save_path, exist_ok=True)
        # download the vocab file
        wget.download("https://cdn.huggingface.co/deepset/covid_bert_base/vocab.txt",
                      out=f"{model_save_path}/")
    else:
        model_name = "allenai/biomed_roberta_base"
        model_save_path = "biobert_path"
        os.makedirs(model_save_path, exist_ok=True)
        # download the merges and vocab files
        wget.download("https://cdn.huggingface.co/allenai/biomed_roberta_base/merges.txt",
                      out=f"{model_save_path}/")
        wget.download("https://cdn.huggingface.co/allenai/biomed_roberta_base/vocab.json",
                      out=f"{model_save_path}/")
    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.save_pretrained(model_save_path)
    covid_bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
    del bert_model
    word_embedding_model = models.Transformer(model_save_path)
    shutil.rmtree(model_save_path)
    # generate sentence embeddings by concatenating mean, CLS, and max pooling
    # of the token embedding vectors
    pooling_model = models.Pooling(768,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=True,
                                   pooling_mode_max_tokens=True)
    sbert_model = SBERTPredictor(word_embedding_model, pooling_model)
    if multi_nli:
        if multi_nli_train_x is not None:
            df_multi_train = remove_tokens_get_sentence_sbert(multi_nli_train_x, multi_nli_train_y)
            df_multi_val = remove_tokens_get_sentence_sbert(multi_nli_test_x, multi_nli_test_y)
            multi_train_dataset = ClassifierDataset(df_multi_train, tokenizer=covid_bert_tokenizer)
            multi_val_dataset = ClassifierDataset(df_multi_val, tokenizer=covid_bert_tokenizer)
            class_weights = multi_train_dataset.class_weights()
            train_loader = DataLoader(dataset=multi_train_dataset, batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=multi_val_dataset, batch_size=1, collate_fn=collate_fn)
            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)
    if med_nli:
        if med_nli_train_x is not None:
            df_mednli_train = remove_tokens_get_sentence_sbert(med_nli_train_x, med_nli_train_y)
            df_mednli_val = remove_tokens_get_sentence_sbert(med_nli_test_x, med_nli_test_y)
            mednli_train_dataset = ClassifierDataset(df_mednli_train, tokenizer=covid_bert_tokenizer)
            mednli_val_dataset = ClassifierDataset(df_mednli_val, tokenizer=covid_bert_tokenizer)
            class_weights = mednli_train_dataset.class_weights()
            train_loader = DataLoader(dataset=mednli_train_dataset, batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=mednli_val_dataset, batch_size=1, collate_fn=collate_fn)
            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)
    if mancon_corpus:
        if man_con_train_x is not None:
            df_mancon_train = remove_tokens_get_sentence_sbert(man_con_train_x, man_con_train_y)
            df_mancon_val = remove_tokens_get_sentence_sbert(man_con_test_x, man_con_test_y)
            mancon_train_dataset = ClassifierDataset(df_mancon_train, tokenizer=covid_bert_tokenizer)
            mancon_val_dataset = ClassifierDataset(df_mancon_val, tokenizer=covid_bert_tokenizer)
            class_weights = mancon_train_dataset.class_weights()
            train_loader = DataLoader(dataset=mancon_train_dataset, batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=mancon_val_dataset, batch_size=1, collate_fn=collate_fn)
            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)
    return sbert_model
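# A hypothetical invocation of train_sbert_model, assuming the MultiNLI splits
# have already been loaded as numpy arrays; the argument values here are
# illustrative, not from the original:
sbert_model = train_sbert_model(
    "deepset/covid_bert_base",
    multi_nli=True,
    multi_nli_train_x=multi_nli_train_x,
    multi_nli_train_y=multi_nli_train_y,
    multi_nli_test_x=multi_nli_test_x,
    multi_nli_test_y=multi_nli_test_y,
    batch_size=8,
    num_epochs=2,
)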
import os

import torch
from torch import cuda
from torch.nn import CrossEntropyLoss
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification

os.environ["HF_HOME"] = "/scratch/huggingface_cache/"
os.makedirs(f'/scratch/devanshg27/{EXPERIMENT_ID}')

device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # , use_fast=True)
config = AutoConfig.from_pretrained(model_checkpoint)
# hack to change num_labels of the pretrained model: save it without the
# classification head, then add a new classification head while loading
config.num_labels = 2
model = AutoModel.from_pretrained(model_checkpoint)
model.save_pretrained(f'/scratch/devanshg27_temp_{EXPERIMENT_ID}')
model = AutoModelForSequenceClassification.from_pretrained(
    f'/scratch/devanshg27_temp_{EXPERIMENT_ID}', config=config)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = torch.nn.DataParallel(model)
model = model.to(device)


class GLUECoSNLIProcessor(processors['xnli']):
    def get_labels(self):
        return ["contradiction", "entailment"]
if encoder_type == 'nmt':
    # model_name_or_dir = f'{exp_folder}/hf'
    BATCH_SIZE = 2000
    LAYER_ID = 4
    tokenizer_hf = FSMTTokenizer.from_pretrained(model_name_or_dir)
    model_hf = FSMTForConditionalGeneration.from_pretrained(model_name_or_dir)
    model_hf = model_hf.cuda()
    encoder_hf = model_hf.base_model.encoder
    encoder_hf.device = model_hf.device
elif encoder_type == 'bert':
    # model_name_or_dir = 'xlm-roberta-base'
    BATCH_SIZE = 2000  # probably can do 512
    LAYER_ID = 7
    tokenizer_hf = AutoTokenizer.from_pretrained(model_name_or_dir)
    encoder_hf = AutoModel.from_pretrained(model_name_or_dir)
    encoder_hf = encoder_hf.cuda()

encoded_sent = extract_reps_sent(data=data,
                                 tokenizer_hf=tokenizer_hf,
                                 encoder_hf=encoder_hf,
                                 batch_size=BATCH_SIZE,
                                 layer_id=LAYER_ID)
pickle_dump_to_file(encoded_sent, savefile)
# Code adapted from: https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb
import sys

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

LMTokenizer = AutoTokenizer.from_pretrained(sys.argv[1])
LMModel = AutoModel.from_pretrained(sys.argv[1])

device = 'cuda' if cuda.is_available() else 'cpu'

train_dataset = pd.read_csv('./train.csv', sep=',', names=['CGT', 'CDT', 'CC', 'label'])
testing_dataset = pd.read_csv('./validation.csv', sep=',', names=['CGT', 'CDT', 'CC', 'label'])

MAX_LEN = 512
TRAIN_BATCH_SIZE = int(sys.argv[2])
VALID_BATCH_SIZE = int(sys.argv[2])
LEARNING_RATE = float(sys.argv[3])
drop_out = float(sys.argv[4])
EPOCHS = 10
tokenizer = LMTokenizer
def __init__(self, config, device, num_genres=None):
    super().__init__()
    self.config = config
    self.device = device

    self.num_genres = num_genres if num_genres else len(config['genres'])
    self.max_seg_len = config['max_segment_len']
    self.max_span_width = config['max_span_width']
    assert config['loss_type'] in ['marginalized', 'hinge']
    if config['coref_depth'] > 1 or config['higher_order'] == 'cluster_merging':
        assert config['fine_grained']  # Higher-order is in slow fine-grained scoring

    # Model
    self.dropout = nn.Dropout(p=config['dropout_rate'])
    if config['hidden_dropout_prob'] >= 0:
        self.bert = AutoModel.from_pretrained(
            config['bert_pretrained_name_or_path'],
            hidden_dropout_prob=config['hidden_dropout_prob'])
    else:
        self.bert = AutoModel.from_pretrained(config['bert_pretrained_name_or_path'])

    self.bert_emb_size = self.bert.config.hidden_size
    self.span_emb_size = self.bert_emb_size * 3
    if config['use_features']:
        self.span_emb_size += config['feature_emb_size']
    self.pair_emb_size = self.span_emb_size * 3
    if config['use_metadata']:
        self.pair_emb_size += 2 * config['feature_emb_size']
    if config['use_features']:
        self.pair_emb_size += config['feature_emb_size']
    if config['use_segment_distance']:
        self.pair_emb_size += config['feature_emb_size']

    self.emb_span_width = self.make_embedding(self.max_span_width) if config['use_features'] else None
    self.emb_span_width_prior = self.make_embedding(self.max_span_width) if config['use_width_prior'] else None
    self.emb_antecedent_distance_prior = self.make_embedding(10) if config['use_distance_prior'] else None
    self.emb_genre = self.make_embedding(self.num_genres)
    self.emb_same_speaker = self.make_embedding(2) if config['use_metadata'] else None
    self.emb_segment_distance = self.make_embedding(config['max_training_sentences']) if config['use_segment_distance'] else None
    self.emb_top_antecedent_distance = self.make_embedding(10)
    self.emb_cluster_size = self.make_embedding(10) if config['higher_order'] == 'cluster_merging' else None

    self.mention_token_attn = self.make_ffnn(self.bert_emb_size, 0, output_size=1) if config['model_heads'] else None
    self.span_emb_score_ffnn = self.make_ffnn(self.span_emb_size, [config['ffnn_size']] * config['ffnn_depth'], output_size=1)
    self.span_width_score_ffnn = self.make_ffnn(config['feature_emb_size'], [config['ffnn_size']] * config['ffnn_depth'], output_size=1) if config['use_width_prior'] else None
    self.coarse_bilinear = self.make_ffnn(self.span_emb_size, 0, output_size=self.span_emb_size)
    self.antecedent_distance_score_ffnn = self.make_ffnn(config['feature_emb_size'], 0, output_size=1) if config['use_distance_prior'] else None
    self.coref_score_ffnn = self.make_ffnn(self.pair_emb_size, [config['ffnn_size']] * config['ffnn_depth'], output_size=1) if config['fine_grained'] else None

    self.gate_ffnn = self.make_ffnn(2 * self.span_emb_size, 0, output_size=self.span_emb_size) if config['coref_depth'] > 1 else None
    self.span_attn_ffnn = self.make_ffnn(self.span_emb_size, 0, output_size=1) if config['higher_order'] == 'span_clustering' else None
    self.cluster_score_ffnn = self.make_ffnn(3 * self.span_emb_size + config['feature_emb_size'], [config['cluster_ffnn_size']] * config['ffnn_depth'], output_size=1) if config['higher_order'] == 'cluster_merging' else None

    self.update_steps = 0  # Internal use for debug
    self.debug = False
import csv

import torch
from torch import cuda
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased').to(device)

file = open('train.csv')
cr = csv.reader(file)
lines = list(cr)
file.close()

xtrain = []
ytrain = []
for i in lines:
    input_ids = torch.tensor(tokenizer.encode(i[-2])).unsqueeze(0)
    outputs = model(input_ids.to(device))
    last_hidden_states = outputs[0]
    # use the embedding of the first ([CLS]) token as the sentence feature
    xtrain.append(last_hidden_states.tolist()[0][0])
    ytrain.append(int(i[-1]))

xtest = []
ytest = []
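# The snippet stops after initializing xtest/ytest; a minimal sketch of the
# presumable continuation, fitting the random forest on the extracted [CLS]
# features. The classifier hyperparameters here are assumptions, not from the
# original:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(xtrain, ytrain)
# ... populate xtest/ytest the same way from a held-out CSV, then:
# print(clf.score(xtest, ytest))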
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

data = pd.read_csv(
    r"E:\Projects\Emotion_detection_gihan\finbert_experiments\financial phrasebank\processed_fpbank.csv"
)
sentences = list(data["sentence"])
labels = list(data["sentiment_id"])

# Sentences we want sentence embeddings for
# sentences = ['This framework generates embeddings for each input sentence',
#              'Sentences are passed as a list of string.',
#              'The quick brown fox jumps over the lazy dog.']

model_name = "ProsusAI/finbert"
# model_name = "bert-base-uncased"

# Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize sentences
encoded_input = tokenizer(sentences,
                          padding=True,
                          truncation=True,
                          max_length=128,
                          return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
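# `mean_pooling` is called above but not defined in this snippet; a minimal
# sketch of the standard masked mean pooling it presumably refers to (as used
# in sentence-transformers examples):
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element: token-level embeddings
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)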
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size],
                                            device=device).repeat(batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        print("Going through model with sequence of shape", sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence),
                                                 repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary
import pickle
import os.path
import pandas as pd

# Modules for Scraping
from importlib import import_module

# Modules for Running Predictions
import sys
import time
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import AutoModel, BertTokenizerFast
import torch
import torch.nn as nn
import numpy as np
import sqlite3 as sql

# These are constant variables
device = torch.device("cpu")
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768, 512)
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)
def load_pretrained_model(model_name, config, cache_dir):
    if model_name in ["gpt2", "distilgpt2", "gpt2-large"]:
        return GPT2ModelNoPastState.from_pretrained(model_name, config=config, cache_dir=cache_dir)
    return AutoModel.from_pretrained(model_name, config=config, cache_dir=cache_dir)
from fairseq.modules import (
    FairseqDropout,
    LayerDropModuleList,
    LayerNorm,
    PositionalEmbedding,
    SinusoidalPositionalEmbedding,
    TransformerDecoderLayer,
    TransformerEncoderLayer,
)
from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
from torch import Tensor
from transformers import AutoModel

DEFAULT_MAX_SOURCE_POSITIONS = 1024
DEFAULT_MAX_TARGET_POSITIONS = 1024

Pretrained_model = AutoModel.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English")
Pretrained_model.eval()


@register_model("transformer")
class TransformerModel(FairseqEncoderDecoderModel):
    """
    Transformer model from `"Attention Is All You Need" (Vaswani et al., 2017)
    <https://arxiv.org/abs/1706.03762>`_.

    Args:
        encoder (TransformerEncoder): the encoder
        decoder (TransformerDecoder): the decoder

    The Transformer model provides the following named architectures and
    command-line arguments:
# print("nonzero_index", nonzero_index) # nonzero_index = nonzero_index.squeeze(1) # print() # print("nonzero_index", list(nonzero_index.numpy())) # exit() ### load the pretrained model output_dir = "/tmp/test-mlm-wwm" config_file = os.path.join(output_dir, "tokenizer_config.json") model_path = os.path.join(output_dir) tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") model = AutoModel.from_pretrained(output_dir) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = model.to(device) ### load the data ### load sentences dataset_path = "/p/reviewde/data/ratebeer/sentences" id2sent_file = "id_to_sent.pickle" max_corpus_size = 100 embedding_cache_path = 'ratebeer-embeddings-size-{}.pkl'.format( max_corpus_size) # #Check if embedding cache path exists
args = parser.parse_args()

# Set defaults ------------------------------------------------------------
if torch.cuda.is_available():
    args.device = torch.device('cuda')
else:
    args.device = torch.device('cpu')

# Read MedMentions trn/dev/tst splits
split_trn = [p.rstrip() for p in open(cfg.MM_PMIDS_TRN).readlines()]
split_dev = [p.rstrip() for p in open(cfg.MM_PMIDS_DEV).readlines()]
split_tst = [p.rstrip() for p in open(cfg.MM_PMIDS_TST).readlines()]

# Load pretrained BERT model
tokenizer = AutoTokenizer.from_pretrained(cfg.BERT_MODEL)
encoder = AutoModel.from_pretrained(cfg.BERT_MODEL, output_hidden_states=True)
encoder.to(args.device)
encoder.eval()

# Read CUIs
UMLS = Entities()

# Read and convert MedMentions annotation examples
examples = read_mm_examples()

# Exclude the CUIs that do not have any name associated with them
print('=> Deleting all the CUIs without a name')
to_delete = []
total = len(UMLS.cuis)
for cui, e in UMLS.cuis.items():
    if len(e.names) == 0:
def __init__(self, model_name_or_path, temperature=0.05, pooling="mean"):
    super().__init__(temperature)
    self.bert_model = AutoModel.from_pretrained(model_name_or_path)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    self.pooling = pooling
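# The `pooling` flag is stored but not used within this snippet; a minimal
# sketch of how an encoding step could consume it ("mean" vs. "cls"), assuming
# the model and tokenizer shown above. The method name `encode` is an
# assumption, not from the original:
def encode(self, sentences):
    inputs = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    hidden = self.bert_model(**inputs)[0]  # token-level embeddings
    if self.pooling == "cls":
        return hidden[:, 0]
    # masked mean over tokens, ignoring padding
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    return (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)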
from train_model import *
from pprint import pprint

model = "craftassist/agent/models/semantic_parser/ttad_bert_updated/caip_test_model.pth"
args_path = "craftassist/agent/models/semantic_parser/ttad_bert_updated/caip_test_model_args.pk"
args = pickle.load(open(args_path, "rb"))

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_encoder_name)
full_tree, tree_i2w = json.load(open(args.tree_voc_file))
dataset = CAIPDataset(tokenizer, args, prefix="", full_tree_voc=(full_tree, tree_i2w))

enc_model = AutoModel.from_pretrained(args.pretrained_encoder_name)
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.is_decoder = True
bert_config.add_cross_attention = True
bert_config.vocab_size = len(tree_i2w) + 8
bert_config.num_hidden_layers = args.num_decoder_layers
dec_with_loss = DecoderWithLoss(bert_config, args, tokenizer)
encoder_decoder = EncoderDecoderWithLoss(enc_model, dec_with_loss, args)

map_location = None if torch.cuda.is_available() else torch.device("cpu")
encoder_decoder.load_state_dict(torch.load(model, map_location=map_location), strict=False)
encoder_decoder = encoder_decoder.cuda()
_ = encoder_decoder.eval()


def get_beam_tree(chat, noop_thres=0.95, beam_size=5, well_formed_pen=1e2):
def __init__(self, config, args):
    super().__init__()
    self.encoder = AutoModel.from_pretrained(args.model_name)
def __init__(self, pretrained):
    super().__init__()
    self.model = AutoModel.from_pretrained(pretrained)
parser.add_argument(
    '--random_init',
    action='store_true',
    default=False,
    help='Boolean indication whether to randomly initialize the model.')
args = parser.parse_args()
print(args)

print('Extracting Features')
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name)
config = AutoConfig.from_pretrained(args.pretrained_model_name, output_hidden_states=True)
if args.random_init:
    # random initialization of the model
    model = AutoModel.from_config(config)
else:
    model = AutoModel.from_pretrained(args.pretrained_model_name, config=config)

manifold_vectors = defaultdict(dict)
with open(args.tag_file) as f:
    for tag in f:
        tag = tag.strip().lower()
        for layer in range(1, config.num_hidden_layers + 1):
            manifold_vectors[layer][tag] = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

torch.set_grad_enabled(False)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"

# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

############ 1. basic usage ############
# Tokens come from a process that splits the input into sub-entities with interesting linguistic properties.
tokens = tokenizer.tokenize("This is an input example")
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input,
# not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with our input
# (tuple unpacking like this assumes an older transformers version or
# `return_dict=False`; in transformers >= 4.x the call returns a ModelOutput)
outputs, pooled = model(tokens_pt)
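# Equivalent one-step encoding: calling the tokenizer directly handles the
# special tokens and tensor conversion shown step by step above (a standard
# shortcut with the same tokenizer API, not part of the original walkthrough):
encoded = tokenizer("This is an input example", return_tensors="pt")
output = model(**encoded)
print("Token-level output shape: {}".format(output[0].shape))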
}
model_name = st.selectbox(
    "Select the model",
    ('scibert-nli', 'biobert-nli', 'covidbert-nli', 'clinicalcovidbert-nli'),
    index=2)
'#### Selected model:', model_name

EMBEDDINGS_PATH = f'{model_name}-embeddings.pkl'
path = os.path.join(MODELS_DIR, model_name)
if not os.path.exists(path):
    os.makedirs(path)
    tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
    model = AutoModel.from_pretrained(MODELS[model_name])
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    word_embedding_model = models.BERT(path, max_seq_length=512, do_lower_case=True)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    rmtree(path)
    model.save(path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

data_dir = 'data-aug-wsd/data/'
saved_model_dir = 'data-aug-wsd/saved_models/'
data_file = 'train_data_reduced_augmented_le5.jsonl'
experiment_dir = saved_model_dir + '/' + experiment_name + '-' + experiment_number + '/'

if __name__ == "__main__":
    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    context_encoder = AutoModel.from_pretrained(model_name).to(device)
    sense_encoder = AutoModel.from_pretrained(model_name).to(device)
    context_encoder_optimizer = AdamW(context_encoder.parameters(),
                                      lr=context_encoder_lr, correct_bias=False)
    sense_encoder_optimizer = AdamW(sense_encoder.parameters(),
                                    lr=sense_encoder_lr, correct_bias=False)
    cos_sim = torch.nn.CosineSimilarity()

    with open(data_dir + data_file, 'r') as f:
        data = f.readlines()
    data = list(map(lambda x: json.loads(x.rstrip('\n')), data))
    data = np.array(data)
LANG = "en" try: os.remove(f"./all_{LANG}/content_{LANG}.txt") except FileNotFoundError: pass for file in glob.iglob(f"./{LANG}/*"): os.remove(file) # model = "illuin/camembert-large-fquad" model = "camembert-base" # model = "bert-large-uncased-whole-word-masking-finetuned-squad" tokenizer = AutoTokenizer.from_pretrained(model) # bertizer = AutoModelForQuestionAnswering.from_pretrained(model) bertizer = AutoModel.from_pretrained(model) with open("covid_raw.json", "r") as file: dico = json.load(file) dico_splitted = {} for source, sub_dic in track(dico.items(), description="Entries..."): try: raw_text_fr = sub_dic["content_fr"] title_fr = sub_dic["title_fr"] except KeyError: continue splited_words_fr = np.array(raw_text_fr.split(" ")) splitted_chunk_words_fr = np.array_split( splited_words_fr, (len(splited_words_fr) // 200) + 1)
def build_transformer_base(self):
    """Build the transformer base model."""
    self.transformer = AutoModel.from_pretrained(
        self.config.transformer_base, config=self.transformer_config)
from transformers import AutoTokenizer, AutoModel
import os

# define the name of the directory to be created
DIRECTORY = "./bert-base-cased"

try:
    os.mkdir(DIRECTORY)
except OSError:
    print("Creation of the directory %s failed" % DIRECTORY)
else:
    print("Successfully created the directory %s " % DIRECTORY)

AutoTokenizer.from_pretrained("bert-base-cased").save_pretrained(DIRECTORY)
AutoModel.from_pretrained("bert-base-cased").save_pretrained(DIRECTORY)
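# Once saved, the weights and tokenizer files can be reloaded offline from the
# local directory (standard from_pretrained behavior with a local path):
tokenizer = AutoTokenizer.from_pretrained(DIRECTORY)
model = AutoModel.from_pretrained(DIRECTORY)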
def build_model(self):
    config = AutoConfig.from_pretrained(self.hparams.pretrain, output_hidden_states=True)
    model = AutoModel.from_pretrained(self.hparams.pretrain, config=config)
    return model
def __init__(self, num_labels):
    super(TextClassifier, self).__init__()
    self.pretrained_model = AutoModel.from_pretrained(model_checkpoint)
    # note: the classifier input size (256) must match the encoder's hidden
    # size; for a BERT-base-style checkpoint that would be 768
    self.classifier = nn.Sequential(nn.Dropout(p=0.2),
                                    nn.Linear(256, num_labels),
                                    nn.ReLU())
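# A minimal sketch of a matching forward pass, assuming [CLS] pooling and that
# `input_ids`/`attention_mask` come from the checkpoint's tokenizer; this
# method is not in the original snippet:
def forward(self, input_ids, attention_mask):
    outputs = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
    cls_embedding = outputs[0][:, 0]  # embedding of the first ([CLS]) token
    return self.classifier(cls_embedding)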
def main():
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # 3. Detecting last checkpoint and eventually continue from last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # 4. Load dataset
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full image path and the second column for the
    # captions (unless you specify column names for this with the `image_column` and `caption_column` arguments).
    #
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        dataset = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            keep_in_memory=False,
            data_dir=data_args.data_dir,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # 5. Load pretrained model, tokenizer, and feature extractor
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    # Load feature_extractor; in this script we only use this to get the mean and std for normalization.
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model = AutoModel.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    config = model.config

    def _freeze_params(module):
        for param in module.parameters():
            param.requires_grad = False

    if model_args.freeze_vision_model:
        _freeze_params(model.vision_model)

    if model_args.freeze_text_model:
        _freeze_params(model.text_model)

    # set seed for torch dataloaders
    set_seed(training_args.seed)

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = dataset["train"].column_names
    elif training_args.do_eval:
        column_names = dataset["validation"].column_names
    elif training_args.do_predict:
        column_names = dataset["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return

    # 6. Get the column names for input/target.
    dataset_columns = dataset_name_mapping.get(data_args.dataset_name, None)
    if data_args.image_column is None:
        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
    else:
        image_column = data_args.image_column
        if image_column not in column_names:
            raise ValueError(
                f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}"
            )
    if data_args.caption_column is None:
        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
    else:
        caption_column = data_args.caption_column
        if caption_column not in column_names:
            raise ValueError(
                f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}"
            )

    # 7. Preprocessing the datasets.
    # Initialize torchvision transforms and jit it for faster processing.
    image_transformations = Transform(
        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
    )
    image_transformations = torch.jit.script(image_transformations)

    # Preprocessing the datasets.
    # We need to tokenize input captions and transform the images.
    def tokenize_captions(examples):
        captions = [caption for caption in examples[caption_column]]
        text_inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True)
        examples["input_ids"] = text_inputs.input_ids
        examples["attention_mask"] = text_inputs.attention_mask
        return examples

    def transform_images(examples):
        images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples[image_column]]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    def filter_corrupt_images(examples):
        """remove problematic images"""
        valid_images = []
        for image_file in examples[image_column]:
            try:
                Image.open(image_file)
                valid_images.append(True)
            except Exception:
                valid_images.append(False)
        return valid_images

    if training_args.do_train:
        if "train" not in dataset:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = dataset["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        train_dataset = train_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        train_dataset = train_dataset.map(
            function=tokenize_captions,
            batched=True,
            remove_columns=[col for col in column_names if col != image_column],
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )
        # Transform images on the fly as doing it on the whole dataset takes too much time.
        train_dataset.set_transform(transform_images)

    if training_args.do_eval:
        if "validation" not in dataset:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = dataset["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        eval_dataset = eval_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        eval_dataset = eval_dataset.map(
            function=tokenize_captions,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[col for col in column_names if col != image_column],
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )
        # Transform images on the fly as doing it on the whole dataset takes too much time.
        eval_dataset.set_transform(transform_images)

    if training_args.do_predict:
        if "test" not in dataset:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = dataset["test"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
            test_dataset = test_dataset.select(range(max_eval_samples))
        test_dataset = test_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        test_dataset = test_dataset.map(
            function=tokenize_captions,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[col for col in column_names if col != image_column],
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on test dataset",
        )
        # Transform images on the fly as doing it on the whole dataset takes too much time.
        test_dataset.set_transform(transform_images)

    # 8. Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        data_collator=collate_fn,
    )

    # 9. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # 10. Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 11. Write Training Stats and push to hub.
    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "contrastive-image-text-modeling"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
def transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: str = None,
    mean_pool: bool = False,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Transformers library against the SentEval benchmark."""
    from transformers import AutoModel, AutoTokenizer

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Re-tokenize the input text using the pre-trained tokenizer
        batch = [" ".join(tokens) for tokens in batch]
        # HACK (John): This will save us in the case of tokenizers with no default max_length.
        # Why does this happen? Open an issue on Transformers.
        max_length = params.tokenizer.max_length if hasattr(params.tokenizer, "max_length") else 512
        inputs = params.tokenizer.batch_encode_plus(
            batch, pad_to_max_length=True, max_length=max_length, return_tensors="pt"
        )
        # Place all input tensors on same device as the model
        inputs = {name: tensor.to(params.device) for name, tensor in inputs.items()}

        sequence_output, pooled_output = model(**inputs)

        # If mean_pool, we take the average of the token-level embeddings, accounting for pads.
        # Otherwise, we take the pooled output for this specific model, which is typically the
        # embedding of a special token, like [CLS] or <s>, which is prepended to the
        # input during tokenization.
        if mean_pool:
            embeddings = torch.sum(
                sequence_output * inputs["attention_mask"].unsqueeze(-1), dim=1
            ) / torch.clamp(torch.sum(inputs["attention_mask"], dim=1, keepdim=True), min=1e-9)
        else:
            embeddings = pooled_output
        embeddings = embeddings.cpu().numpy()

        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Transformers tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    typer.secho(
        f"{SUCCESS} Tokenizer '{pretrained_model_name_or_path}' from Transformers loaded successfully.",
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Load the Transformers model
    model = AutoModel.from_pretrained(pretrained_model_name_or_path)
    model.to(device)
    model.eval()
    typer.secho(
        f'{SUCCESS} Model "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    params_senteval["tokenizer"] = tokenizer
    params_senteval["model"] = model
    params_senteval["device"] = device
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare, output_filepath)

    return
def test_rag_sequence_from_pretrained(self):
    rag_config = self.get_rag_config()
    rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base")
    rag_retriever = RagRetriever(
        rag_config,
        question_encoder_tokenizer=rag_question_encoder_tokenizer,
        generator_tokenizer=rag_decoder_tokenizer,
    )

    input_ids = rag_question_encoder_tokenizer(
        "who sings does he love me with reba", return_tensors="pt").input_ids
    decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids

    input_ids = input_ids.to(torch_device)
    decoder_input_ids = decoder_input_ids.to(torch_device)

    with tempfile.TemporaryDirectory() as tmp_dirname:
        rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator(
            "facebook/dpr-question_encoder-single-nq-base",
            "facebook/bart-large-cnn",
            retriever=rag_retriever,
            config=rag_config,
        ).to(torch_device)
        # check that the from_pretrained methods work
        rag_sequence.save_pretrained(tmp_dirname)
        rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever)
        rag_sequence.to(torch_device)

        with torch.no_grad():
            output = rag_sequence(
                input_ids,
                labels=decoder_input_ids,
            )

        loss_pretrained = output.loss
        del rag_sequence

    question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    rag_sequence = RagSequenceForGeneration(
        config=rag_config,
        question_encoder=question_encoder,
        generator=generator,
        retriever=rag_retriever)
    rag_sequence.to(torch_device)

    with torch.no_grad():
        output = rag_sequence(
            input_ids,
            labels=decoder_input_ids,
        )

    loss_init = output.loss

    self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4)
def __init__(self, classifier_dims, num_classes, gaussian_noise, dropout,
             internal_dims, n_layers, featurizer, n_tokens_in=64,
             n_tokens_out=16, use_as_super=False, **kwargs):
    embedding_dims = 768
    super(AlbertClassifer, self).__init__(classifier_dims, num_classes,
                                          embedding_dims, gaussian_noise,
                                          dropout, internal_dims, n_layers,
                                          featurizer, final_layer_builder,
                                          n_tokens_in, n_tokens_out, True,
                                          **kwargs)
    self.word_masking_proba = kwargs["word_masking_proba"] if "word_masking_proba" in kwargs else 0.0

    if not use_as_super:
        model = kwargs["model"] if "model" in kwargs else 'albert-base-v2'
        global_dir = get_global("models_dir")
        model = os.path.join(global_dir, model) if model in os.listdir(global_dir) else model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)
        print("Pick stored Model", model, "Model Class = ", type(self.model),
              "Tokenizer Class = ", type(self.tokenizer))

        if featurizer == "cnn":
            self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                              classifier_dims, internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "gru":
            self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                            classifier_dims, internal_dims, n_layers,
                                            gaussian_noise, dropout)
        elif featurizer == "basic":
            self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                              classifier_dims, internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "transformer":
            self.attention_drop_proba = kwargs["attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
            n_encoders = kwargs.pop("n_encoders", n_layers)
            n_decoders = kwargs.pop("n_decoders", n_layers)
            self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                    classifier_dims, internal_dims,
                                                    n_encoders, n_decoders,
                                                    gaussian_noise, dropout,
                                                    self.attention_drop_proba)
        else:
            raise NotImplementedError()

        self.final_layer = fb_1d_loss_builder(classifier_dims, n_tokens_out, num_classes, dropout, **kwargs)

    if "stored_model" in kwargs:
        load_stored_params(self, kwargs["stored_model"])

    self.word_masking = WordMasking(tokenizer=self.tokenizer, **kwargs)
    self.reg_layers = get_regularization_layers(self)