def main(model_path, model_type, sentence_corpus, output_path):
    #### Read sentence corpus. output: list of sentences ####
    sentences = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in sentences for item in row]
    print(sentences[:10])

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    sentences_embedding = embedder.encode(sentences)
    read.save_in_pickle(os.path.join(output_path, "embeddings.pkl"),
                        sentences_embedding)
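# Minimal usage sketch for the function above (assumptions: the model name and
# directories below are hypothetical; `sentence_corpus` must contain input.tsv
# and `output_path` must exist so embeddings.pkl can be written there).
if __name__ == "__main__":
    main(model_path="bert-base-nli-mean-tokens",  # hypothetical model path/name
         model_type="sbert",                      # any value other than "bert" loads a plain SentenceTransformer
         sentence_corpus="data/corpus_dir",       # hypothetical directory with input.tsv
         output_path="data/output_dir")           # hypothetical directory for embeddings.pkl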
def main(model_path, model_type, extra_dataset):
    # Read the dataset
    train_batch_size = 64
    num_epochs = 20
    model_save_path = model_path + '_continue_training_' + datetime.now(
    ).strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    # triplet_margin is expected to be defined at module level
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Configure the training
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
def calc(self, text1, text2):
    transformer = models.BERT(
        'cl-tohoku/bert-base-japanese-whole-word-masking')
    pooling = models.Pooling(transformer.get_word_embedding_dimension(),
                             pooling_mode_mean_tokens=True,
                             pooling_mode_cls_token=False,
                             pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[transformer, pooling])
    sentences = [text1, text2]
    embeddings = model.encode(sentences)
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)

    # Use BERT for mapping tokens to embeddings
    logger.warning("Loading Bert Model")
    word_embedding_model = models.BERT('bert-base-uncased', max_seq_length=510)
    logger.warning("Model is loaded")

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    if args.use_tpu:
        logger.warning("TPU training")
        device = xm.xla_device()
        args.n_gpu = 1
    elif args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        logger.warning("Training dataset is loaded")
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss)
        logger.info(" average loss = %s", tr_loss)
def load_model(path):
    checkpoint_files = os.listdir(path)
    if 'pytorch_model.bin' in checkpoint_files:
        word_embedding_model = models.BERT(path)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        return SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    return SentenceTransformer(path)
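# Minimal usage sketch for load_model (assumption: "checkpoints/my-bert" is a
# hypothetical directory holding either a raw HuggingFace BERT checkpoint with
# pytorch_model.bin, which gets mean pooling added on top, or an
# already-saved SentenceTransformer).
encoder = load_model("checkpoints/my-bert")
vectors = encoder.encode(["first sentence", "second sentence"])
print(len(vectors))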
def __init__(self, base, alpha=0.21):
    super().__init__()
    self.base = base
    model_path = "models/covidbert-nli"
    # for downloading/loading the model check:
    # https://github.com/gsarti/covid-papers-browser
    # /blob/master/scripts/download_model.py
    if (path.exists(model_path) and path.isdir(model_path)):
        word_embedding_model = models.BERT(model_path,
                                           max_seq_length=128,
                                           do_lower_case=True)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.model = model
    else:
        print("Installing bert model...")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        model = AutoModel.from_pretrained("gsarti/covidbert-nli")
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        # Build the SentenceTransformer directly
        word_embedding_model = models.BERT(model_path,
                                           max_seq_length=128,
                                           do_lower_case=True)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.model = model
        print("Bert model is installed")
    self.alpha = alpha
def GetDevEmbedding(self, test=False):
    """Only used by WriteDPRCP."""
    if test:
        primary_texts = self.test_primary_texts
        if not self.symmetric:
            secondary_texts = self.test_secondary_texts
    else:
        primary_texts = self.dev_primary_texts
        if not self.symmetric:
            secondary_texts = self.dev_secondary_texts

    if self.world.tag == 'inat' or self.world.tag == 'celeba':
        image_embedder = ImageEmbedder(self.world.tag, None)
        image_embedder.init_model()
        logging.info(
            "Getting {} embedding".format('test' if test else 'dev'))
        logging.info("Primary:")
        primary_embs = image_embedder.embed(primary_texts)
        if not self.symmetric:
            logging.info("Secondary:")
            secondary_embs = image_embedder.embed(secondary_texts)
    else:
        word_embedding_model = models.BERT('bert-base-uncased')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        logging.info(
            "Getting {} embedding".format('test' if test else 'dev'))
        logging.info("Primary:")
        primary_embs = np.array(model.encode(primary_texts))
        if not self.symmetric:
            logging.info("Secondary:")
            secondary_embs = np.array(model.encode(secondary_texts))

    # Normalize
    for i in range(primary_embs.shape[0]):
        primary_embs[i, :] /= np.linalg.norm(primary_embs[i, :])
    if not self.symmetric:
        for i in range(secondary_embs.shape[0]):
            secondary_embs[i, :] /= np.linalg.norm(secondary_embs[i, :])

    if test:
        self.test_primary_embeddings = primary_embs
        if not self.symmetric:
            self.test_secondary_embeddings = secondary_embs
    else:
        self.dev_primary_embeddings = primary_embs
        if not self.symmetric:
            self.dev_secondary_embeddings = secondary_embs
def get_model(self):
    if self.model_type == 'electra':
        return ELECTRA(self.model_name,
                       max_seq_length=self.max_seq_length,
                       do_lower_case=self.do_lower_case)
    elif self.model_type == 'bert':
        return models.BERT(self.model_name,
                           max_seq_length=self.max_seq_length,
                           do_lower_case=self.do_lower_case)
    else:
        raise AttributeError("Not supported")
def get_sbert():
    global model
    if model is None:
        transformer = models.BERT(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        pooling = models.Pooling(transformer.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=True,
                                 pooling_mode_cls_token=False,
                                 pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[transformer, pooling])
    return model
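# Minimal usage sketch for get_sbert (assumption: the module defines
# `model = None` at top level so the function can lazily build and cache the
# Japanese SBERT model on first call).
sbert = get_sbert()
emb = sbert.encode(["これは例文です。", "別の文です。"])
print(len(emb))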
def load_sentence_transformer(
        name: str = 'gsarti/scibert-nli',
        max_seq_length: int = 128,
        do_lower_case: bool = True) -> SentenceTransformer:
    """ Loads a SentenceTransformer from HuggingFace AutoModel bestiary """
    word_embedding_model = models.BERT(name,
                                       max_seq_length=max_seq_length,
                                       do_lower_case=do_lower_case)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])
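# Minimal usage sketch for load_sentence_transformer (assumption: the defaults
# are used, so 'gsarti/scibert-nli' is fetched from the HuggingFace hub).
scibert = load_sentence_transformer()
print(scibert.encode(["Transformer models embed sentences."])[0][:5])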
def __post_init__(self):
    word_embedding_model = models.BERT(
        'gsarti/biobert-nli',
        max_seq_length=128,
        do_lower_case=True
    )
    # apply pooling to get one fixed vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False
    )
    self.model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
def train_sbert(model_name, model_save_path):
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    train_num_labels = nli_reader.get_num_labels()

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'),
                                  model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(
        len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    # Reload the stored model and evaluate it on the STS benchmark test set
    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                                 model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)
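# Minimal usage sketch for train_sbert (assumptions: load_dataset() above
# returns the NLI and STS readers; the model name and output directory below
# are illustrative, and a full training run is triggered).
train_sbert(model_name="allenai/scibert_scivocab_uncased",
            model_save_path="output/training_nli_scibert")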
def __init__(self, model_dir: str, vocab: Optional[pd.DataFrame] = None) -> None:
    word_embedding_model = models.BERT(model_dir)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=False,
        pooling_mode_cls_token=True,
        pooling_mode_max_tokens=False)
    self.encoder = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    if vocab is not None:
        self.vocab2index(vocab)
    else:
        self.codes = []
        self.concept_names = []
        self.tree_index = None
def test_bert_wkpooling(self):
    word_embedding_model = models.BERT(
        'bert-base-uncased', model_args={'output_hidden_states': True})
    pooling_model = models.WKPooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])

    scores = [
        0.6906377742193329, 0.9910573945907297, 0.8395676755959804,
        0.7569234597143, 0.8324509121875274
    ]
    for sentences, score in zip(WKPoolingTest.sentence_pairs, scores):
        embedding = model.encode(sentences, convert_to_numpy=True)
        similarity = 1 - scipy.spatial.distance.cosine(
            embedding[0], embedding[1])
        assert abs(similarity - score) < 0.01
def initialize(self):
    if self.model == 'USE':
        encoder = hub.load(
            "https://tfhub.dev/google/universal-sentence-encoder/2")
    elif self.model == 'scibert_scivocab_uncased':
        # provide the path to the downloaded scibert model
        word_embedding_model = models.BERT(
            './../rev_sig/codes/models/scibert_scivocab_uncased/')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        encoder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    return encoder
def main(model_path, model_type, sentence_corpus, query):
    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    corpus_embeddings = read.read_from_pickle(
        os.path.join(sentence_corpus, "embeddings.pkl"))
    corpus = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in corpus for item in row]

    query_embedding = embedder.encode([query])

    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 5
    distances = scipy.spatial.distance.cdist(query_embedding,
                                             corpus_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:closest_n]:
        print(sentences[idx].strip(), "(Score: %.4f)" % (1 - distance))
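# Minimal usage sketch for the query function above (assumptions: the directory
# is the hypothetical output of the corpus-embedding script earlier in this
# collection, so it already contains input.tsv and embeddings.pkl; the query
# string is illustrative).
main(model_path="bert-base-nli-mean-tokens",
     model_type="sbert",
     sentence_corpus="data/corpus_dir",
     query="How are sentence embeddings compared?")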
def GetInitialEmbedding(self, encode_batch_size):
    if self.world.tag == 'inat' or self.world.tag == 'celeba':
        image_embedder = ImageEmbedder(self.world.tag, None)
        image_embedder.init_model()
        logging.info("Getting initial embedding")
        logging.info("Primary:")
        self.train_primary_embeddings = image_embedder.embed(
            self.train_primary_texts)
        if not self.symmetric:
            logging.info("Secondary:")
            self.train_secondary_embeddings = image_embedder.embed(
                self.train_secondary_texts)
    else:
        word_embedding_model = models.BERT('bert-base-uncased')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        logging.info("Getting initial embedding")
        logging.info("Primary:")
        self.train_primary_embeddings = np.array(
            model.encode(self.train_primary_texts,
                         batch_size=encode_batch_size))
        if not self.symmetric:
            logging.info("Secondary:")
            self.train_secondary_embeddings = np.array(
                model.encode(self.train_secondary_texts,
                             batch_size=encode_batch_size))
    self.NormalizeEmbeddings()
    return
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-cased', max_seq_length=510)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        if args.eval_during_train:
            logging.info("Read STSbenchmark dev dataset")
            dev_data = load_and_cache_examples(args,
                                               patent_reader,
                                               model,
                                               evaluate=True)
        else:
            dev_data = None
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss,
                        dev_dataset=dev_data)
        logger.info(" average loss = %s", tr_loss)
# model.save_pretrained('models/')    # not sure if this is the right way to save tuned models, or the right use of the path
# tokenizer.save_pretrained('models/')

# Select one Transformer
model_name = 'allenai/scibert_scivocab_uncased'  # likewise, not sure this is the right way to reference the model

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('../datasets/AllNLI')
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'models/training_nli_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use the SciBERT model for mapping tokens to embeddings
word_embedding_model = models.BERT(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=train_num_labels)
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
nli_reader = ChineseDataReader(args.data_dir)
train_num_labels = nli_reader.get_num_labels()
model_save_path = args.output_dir

if args.do_train:
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(args.model_name_or_path)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read train dataset")
    train_data = SentencesDataset(nli_reader.get_train_examples(args.data_dir),
                                  model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
def is_correct(meddra_code, candidates, topk=1):
    for candidate in candidates[:topk]:
        if check_label(candidate, meddra_code):
            return 1
    return 0


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--model_dir')
    parser.add_argument('--data_folder')
    parser.add_argument('--vocab')
    parser.add_argument('--k', type=int, default=5)
    args = parser.parse_args()

    word_embedding_model = models.BERT(args.model_dir)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=False,
        pooling_mode_cls_token=True,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    ################
    entities = read_dataset(args.data_folder)
    ################
    entity_texts = [e['entity_text'].lower() for e in entities]
    labels = [e['label'] for e in entities]
    ##################
    vocab = read_vocab(args.vocab)
    codes = vocab.label.values
def get_vector(record_texts_dict: dict,
               embedding_size,
               record_num: int,
               device,
               save_path='',
               s_text_emb_method='glove'):
    record_embeddings = None
    if s_text_emb_method == 'glove':
        print('----glove vector----')
        stop_word_list = get_stop_word(
            stop_word_path='./resource/stop_words.txt')
        glove_dict = get_glove_dict(
            glove_dict_path='./resource/glove/glove.6B.' +
            str(embedding_size) + 'd.txt')
        record_embeddings = np.zeros((record_num, embedding_size))
        t_count = 0
        # print(item_num)
        for i in tqdm(range(record_num)):
            item_emb = np.zeros(embedding_size)
            try:
                word_str = str(record_texts_dict[i])
                word_list = word_str.split(" ")
                # print(word_list)
                t_div = 1
                for word in word_list:
                    if word not in stop_word_list:
                        try:
                            word_glove_vector = glove_dict[word]
                            item_emb = item_emb + word_glove_vector
                        except KeyError:
                            continue
                        t_div += 1
                    else:
                        continue
                # print(t_div, item_emb, item_emb / t_div)
                record_embeddings[i] = item_emb / t_div  # normalise
                t_count += 1
            except KeyError:
                continue
    elif s_text_emb_method == 'sbert':
        print('----sentence-bert vector----')
        # Sentence-BERT:
        # Sentence Embeddings using Siamese BERT-Networks https://arxiv.org/abs/1908.10084
        # https://github.com/UKPLab/sentence-transformers
        # google/bert_uncased_L-2_H-128_A-2   (BERT-Tiny)
        # google/bert_uncased_L-4_H-256_A-4   (BERT-Mini)
        # google/bert_uncased_L-4_H-512_A-8   (BERT-Small)
        # google/bert_uncased_L-8_H-512_A-8   (BERT-Medium)
        # google/bert_uncased_L-12_H-768_A-12 (BERT-Base)
        word_embedding_model = models.BERT(
            'google/bert_uncased_L-12_H-256_A-4', max_seq_length=510)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        bert_model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model], device=device)
        one_req_num = 500
        record_list = list(record_texts_dict.values())
        req_times = int(math.ceil(len(record_list) / one_req_num))
        for ii in tqdm(range(req_times)):
            if ii == 0:
                record_embeddings = bert_model.encode(
                    record_list[ii * one_req_num:(ii + 1) * one_req_num])
            elif ii < req_times - 1:
                record_embeddings = np.vstack(
                    (record_embeddings,
                     bert_model.encode(record_list[ii * one_req_num:(ii + 1) *
                                                   one_req_num])))
            else:
                record_embeddings = np.vstack(
                    (record_embeddings,
                     bert_model.encode(record_list[ii * one_req_num:])))
    else:
        print('Do not support', s_text_emb_method, 'text embedding method.')
    if save_path != '':
        np.save(save_path, record_embeddings)
    return record_embeddings
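# Minimal usage sketch for get_vector (assumptions: a toy two-record dict;
# 'sbert' mode is used so the GloVe resource files are not needed and
# embedding_size is ignored; the "cpu" device is illustrative).
toy_records = {0: "first clinical note text", 1: "second clinical note text"}
embs = get_vector(toy_records,
                  embedding_size=256,
                  record_num=2,
                  device="cpu",
                  save_path="",
                  s_text_emb_method="sbert")
print(len(embs))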
# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('datasets/AllNLI')
sts_reader = STSDataReader('datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'output/training_nli_bert-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Use BERT for mapping tokens to embeddings
# Using manually downloaded model data:
word_embedding_model = models.BERT('../models/bert-base-multilingual-cased/')
# Or you can let the library handle the downloading and caching for you:
# word_embedding_model = models.BERT('bert-base-multilingual-cased')


def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b
    index=2)

'#### Selected model:', model_name

EMBEDDINGS_PATH = f'{model_name}-embeddings.pkl'
path = os.path.join(MODELS_DIR, model_name)
if not os.path.exists(path):
    os.makedirs(path)
    tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
    model = AutoModel.from_pretrained(MODELS[model_name])
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    word_embedding_model = models.BERT(path,
                                       max_seq_length=512,
                                       do_lower_case=True)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    rmtree(path)
    model.save(path)
    print(f'Model {model_name} available in {path}')

# Displaying data
"--output_dir", type=str, help="Directory where the models are saved." ) args = parser.parse_args() path = os.path.join(args.output_dir, MODELS_PATH, args.model) if not os.path.exists(path): os.makedirs(path) if args.model not in list(MODELS_PRETRAINED) + list(MODELS_FINETUNED): raise AttributeError("Model should be selected in the list: " + ", ".join(list(MODELS_PRETRAINED) + list(MODELS_FINETUNED)) ) tokenizer = AutoTokenizer.from_pretrained(MODELS[args.model]) model = AutoModel.from_pretrained(MODELS[args.model]) model.save_pretrained(path) tokenizer.save_pretrained(path) if args.model in MODELS_FINETUNED.keys(): # Build the SentenceTransformer directly word_embedding_model = models.BERT( path, max_seq_length=args.max_seq_length, do_lower_case=args.do_lower_case ) pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) rmtree(path) model.save(path) print(f'Model {args.model} available in', path)
args = parser.parse_args()

path = os.path.join(MODELS_PATH, args.model)
if not os.path.exists(path):
    os.makedirs(path)

if args.model == 'scibert':
    # Used to fine-tune SciBERT from default embeddings
    tokenizer = AutoTokenizer.from_pretrained(
        "allenai/scibert_scivocab_cased")
    model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    print('SciBERT Transformer model available in', path)
elif args.model == 'scibert-nli':
    # Already-trained SciBERT
    tokenizer = AutoTokenizer.from_pretrained("gsarti/scibert-nli")
    model = AutoModel.from_pretrained("gsarti/scibert-nli")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    word_embedding_model = models.BERT(path)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    rmtree(path)
    model.save(path)
    print('SciBERT SentenceTransformer model available in', path)
else:
    raise AttributeError("Model should be selected in the list: " +
                         ", ".join(MODELS))
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('datasets/AllNLI')
sts_reader = STSDataReader('datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'output/training_nli_bert-' + datetime.now().strftime(
    "%Y-%m-%d_%H:%M:%S")

# Use BERT for mapping tokens to embeddings
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
def load_model(use_covidbert=False):
    """Function that loads and returns the CovidBERT model"""

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
    #     print("Loading tokenizer...")
    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
    #     print("Finished loading the model successfully!")
    #     model = SentenceTransformer(model_path)

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("Loading tokenizer...")
    #     print("\n")
    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("\n")
    #     print("Finished loading the model successfully!")
    #
    #     # Save the model to model path
    #     model_path = os.path.join("models", "clinicalcovid")
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model.save_pretrained(model_path)
    #     tokenizer.save_pretrained(model_path)
    #     model = SentenceTransformer(model_path)

    # Load CovidBERT
    if use_covidbert:
        print("Loading model...")
        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
        print("Loading tokenizer...")
        print("\n")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        print("\n")
        print("Finished loading the model successfully!")

        # Save the model to model path
        model_path = os.path.join("models", "gsarticovid")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Successfully saved model to {model_path}")

        print("Loading Sentence Transformer now!")
        word_embedding_model = models.BERT(
            model_path,
            # max_seq_length=args.max_seq_length,
            # do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        rmtree(model_path)
        model.save(model_path)
        print("Finished building Sentence Transformer!")

    # Load regular BERT
    else:
        print("Loading BERT")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("Finished loading BERT")

    return model, tokenizer
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples(
        'train.csv', 2000000), model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples(
        'validation.csv', 10000), model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                       0.1)  # 10% of train data

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on the triplet test set
    #
    ##############################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)
    model.evaluate(evaluator)
#!/usr/bin/env python3
# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
import sys

PATH = sys.argv[1]
print('Saving model to %s' % PATH)

tokenizer = AutoTokenizer.from_pretrained("gsarti/scibert-nli")
model = AutoModelWithLMHead.from_pretrained("gsarti/scibert-nli")
model.save_pretrained(PATH)
tokenizer.save_pretrained(PATH)

embedding = models.BERT(PATH, max_seq_length=128, do_lower_case=True)
pooling_model = models.Pooling(embedding.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[embedding, pooling_model])
model.save(PATH)
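# Minimal usage sketch for reloading the exported model (assumptions: the
# script above was already run with a path argument; "/tmp/scibert-nli" below
# is illustrative and stands in for that argument).
from sentence_transformers import SentenceTransformer

reloaded = SentenceTransformer("/tmp/scibert-nli")
print(reloaded.encode(["SciBERT sentence embeddings"])[0][:5])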