def pretrained_model_score(self, model_name, expected_score):
    model = SentenceTransformer(model_name)
    sts_reader = STSDataReader('../examples/datasets/stsbenchmark')

    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    score = model.evaluate(evaluator) * 100
    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
    assert abs(score - expected_score) < 0.1
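# A minimal usage sketch, assuming pretrained_model_score is defined on a
# unittest.TestCase subclass with the usual sentence_transformers imports in
# scope; the expected score (77.12) is an illustrative assumption, not a
# verified benchmark number.
import unittest

class PretrainedSTSbTest(unittest.TestCase):
    def test_bert_base_nli_mean_tokens(self):
        self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12)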
def main():
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sts_reader = STSDataReader('datasets/stsbenchmark')

    test_data = SentencesDataset(examples=sts_reader.get_examples('sts-test.csv'),
                                 model=model,
                                 dataset_cache_id='sts-eval')
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=16)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)
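# A hedged entry-point sketch for the function above, assuming the snippet is
# run as a standalone script.
if __name__ == '__main__':
    main()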
def nlptrain(premodel, ver, tr_data, te_data):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    train_batch_size = 16
    num_epochs = 4
    model_save_path = ver
    sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(premodel)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read train dataset")
    train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    return ['model saved in ' + ver + ' directory']
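# A usage sketch for nlptrain(); the checkpoint name is a real pretrained
# model, but the output directory is an illustrative assumption, and the CSV
# names are borrowed from the related snippet below.
nlptrain(premodel='roberta-large-nli-stsb-mean-tokens',
         ver='output/kt_model_v1',
         tr_data='kt1.csv',
         te_data='kt1.csv')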
import logging
from datetime import datetime

from sentence_transformers import SentencesDataset, SentenceTransformer, LoggingHandler, models
from sentence_transformers.readers import STSDataReader

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_roberta-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-base')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
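# A quick sanity-check sketch of the composed model: encode() should return
# one fixed-size vector per sentence (768 dimensions for roberta-base). The
# example sentence is illustrative.
embedding = model.encode(['A quick sanity check sentence.'])[0]
print(embedding.shape)  # expected: (768,)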
import logging
from datetime import datetime

from sentence_transformers import SentencesDataset, SentenceTransformer, LoggingHandler, losses
from sentence_transformers.readers import STSDataReader
from torch.utils.data import DataLoader

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
#model_name = 'bert-base-nli-stsb-mean-tokens'
model_name = "../saved_models"
train_batch_size = 32
num_epochs = 4
model_save_path = 'output/quora_continue_training-' + model_name + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('../data/quora', normalize_scores=True,
                           s1_col_idx=4, s2_col_idx=5, score_col_idx=6, max_score=1)

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Convert the dataset to a DataLoader ready for training
logging.info("Read Quora train dataset")
train_data = SentencesDataset(sts_reader.get_examples('train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read Quora dev dataset")
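# The snippet above breaks off after the dev-set log line; a hedged sketch of
# the usual continuation, mirroring the training loop used elsewhere in this
# collection. The 'dev.csv' file name is an assumption.
import math
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)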
import logging
from datetime import datetime

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
model_name = 'roberta-large-nli-stsb-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/train1-' + model_name + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model
model = SentenceTransformer('output/retkt')

# Convert the dataset to a DataLoader ready for training
logging.info("Read train dataset")
train_data = SentencesDataset(sts_reader.get_examples('kt1.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('kt1.csv'), model=model)
import os
import logging
from datetime import datetime

dirname = os.path.dirname(os.path.dirname(__file__))

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = os.path.join(dirname, 'output/training_stsbenchmark_bert-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
sts_reader = STSDataReader(os.path.join(dirname, 'data/sts-b'), normalize_scores=True)

# Use BERT for mapping tokens to embeddings
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
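# The script above is cut off mid-setup; a hedged sketch of the remaining
# steps, following the same pattern as the nlptrain() function earlier in this
# collection (DataLoader, losses, EmbeddingSimilarityEvaluator, and math are
# assumed to be imported).
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)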
from sentence_transformers import SentencesDataset, SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader
from torch.utils.data import DataLoader

model = SentenceTransformer('bert-base-nli-mean-tokens')
train_batch_size = 16  # assumed value; the original snippet leaves this undefined

sts_reader = STSDataReader('stsbenchmark_data', normalize_scores=True)
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
print(train_data)
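# A small follow-on sketch: print(train_data) only shows the dataset object,
# so inspecting one parsed example is often more useful. The .texts and .label
# fields are InputExample attributes in this sentence_transformers version.
example = sts_reader.get_examples('sts-train.csv')[0]
print(example.texts)  # the two sentences of the pair
print(example.label)  # normalized similarity score in [0, 1]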
"""
This example loads a pre-trained model and evaluates it on the STSbenchmark dataset
"""
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader
import numpy as np
import logging

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer('bert-base-nli-mean-tokens')
sts_reader = STSDataReader('datasets/stsbenchmark')

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)
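# A follow-on usage sketch: the evaluated model can embed new sentences
# directly; the example sentences are illustrative.
sentences = ['A man is eating food.', 'A man is eating a piece of bread.']
for sentence, embedding in zip(sentences, model.encode(sentences)):
    print(sentence, embedding.shape)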