def pretrained_model_score(self, model_name, expected_score):
    """Evaluate a pretrained SentenceTransformer on the STSbenchmark test split.

    Downloads the STSbenchmark TSV if it is not cached locally, builds
    InputExamples for the test split, runs an EmbeddingSimilarityEvaluator,
    and asserts that the score (correlation * 100) reaches `expected_score`
    or is within 0.1 of it.

    Args:
        model_name: Name or path of the pretrained model to load.
        expected_score: Expected evaluation score on the 0-100 scale.
    """
    model = SentenceTransformer(model_name)
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

    # Only the test split is evaluated; the original version also collected
    # train/dev sample lists that were never used (dead computation).
    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'test':
                score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
                test_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=score))

    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_samples, name='sts-test')
    score = model.evaluate(evaluator) * 100
    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
    assert score > expected_score or abs(score - expected_score) < 0.1
def evaluate_stsb_test(self, model, expected_score):
    """Score `model` on the cached STS test samples and assert it meets expectations."""
    sts_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        self.test_samples, name='sts-test')
    achieved = model.evaluate(sts_evaluator) * 100
    print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(
        achieved, expected_score))
    # Pass when the score beats the expectation, or is within 0.1 of it.
    assert achieved > expected_score or abs(achieved - expected_score) < 0.1
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    """Build training data (and optionally a dev evaluator) for the posts-ranking task.

    Reads a JSON file of questions with answers, turns each kept
    (question, answer) pair into an InputExample whose label decreases
    linearly with the answer's position, then wraps everything in a
    DataLoader + cosine-similarity loss.

    Relies on module-level globals: max_size, posts_rank_str, num_epochs,
    batch_size, evaluation_steps.

    Args:
        fl: JSON file name inside data_dir.
        data_dir: Directory containing the data file.
        model: SentenceTransformer used by SentencesDataset.
        validate: When equal to the global posts_rank_str, a 10% dev split
            and an EmbeddingSimilarityEvaluator are created.
        is_test: When True, return only the raw InputExample list.

    Returns:
        The InputExample list when is_test, otherwise a tuple
        (train_dataloader, train_loss, evaluator, warmup_steps).
    """
    train_posts_ranking = []
    disbn = []  # answer-rank distribution; collected but not used in this function
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    for obj in data:
        answers = obj['answers']
        filtered_answers = []
        votes = 1000000  # sentinel larger than any realistic vote count
        # Keep only answers whose vote count strictly decreases while scanning.
        # NOTE(review): this presumably assumes `answers` is ordered by votes
        # descending — confirm against the data producer.
        for answer in answers:
            my_votes = answer['a_votes']
            if my_votes < votes:
                votes = my_votes
                filtered_answers.append(answer)
        if len(filtered_answers) > 1:
            # Label 1.0 for the first kept answer, decreasing linearly to
            # 1/len for the last one.
            rank = len(filtered_answers)
            for answer in filtered_answers:
                dist = rank / len(filtered_answers)
                disbn.append(answer['a_rank'])
                rank = rank - 1
                train_posts_ranking.append(
                    InputExample(texts=[obj['q_text'], answer['a_text']],
                                 label=dist))
    random.shuffle(train_posts_ranking)
    print("data size " + str(len(train_posts_ranking)))
    if is_test:
        # Test path returns the full (untruncated) example list.
        return train_posts_ranking
    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]
    evaluator = None
    if posts_rank_str == validate:
        # Hold out 10% as a dev set and build a similarity evaluator on it.
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')
    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up
    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)
    print('R: Number of training examples: ', len(train_posts_ranking))
    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies the length by 10, so in-epoch
    # evaluation would effectively never trigger — was `* 0.1` intended?
    # The same pattern appears in the sibling create_* functions.
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)
    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
def fit_model(trial, train_fold, val_fold, fold_index):
    """Train one CV fold with Optuna-suggested hyperparameters.

    Returns the validation-set evaluator score (0.0 when the score is NaN).
    """
    print("######################")
    print("start of fold_index:", fold_index)
    print("len(train_fold)", len(train_fold))
    print("len(val_fold)", len(val_fold))

    # Sample this trial's hyperparameters.
    hp_batch = trial.suggest_int("train_batch_size", 4, 50)
    hp_epochs = trial.suggest_int("num_epochs", 1, 4)
    hp_lr = trial.suggest_uniform("lr", 2e-6, 2e-4)
    hp_eps = trial.suggest_uniform("eps", 1e-7, 1e-5)
    hp_weight_decay = trial.suggest_uniform("weight_decay", 0.001, 0.1)
    hp_warmup_mul = trial.suggest_uniform("warmup_steps_mul", 0.1, 0.5)

    sbert = SentenceTransformer(model_name)

    # InputExamples feed the DataLoader directly (SentencesDataset wrapping
    # is deprecated).
    fold_loader = DataLoader(train_fold, shuffle=True, batch_size=hp_batch)
    cosine_loss = losses.CosineSimilarityLoss(model=sbert)
    n_warmup = math.ceil(len(train_fold) * hp_epochs / hp_batch * hp_warmup_mul)

    # Train the model (no in-training evaluation; scored once afterwards).
    sbert.fit(
        train_objectives=[(fold_loader, cosine_loss)],
        evaluator=None,
        epochs=hp_epochs,
        warmup_steps=n_warmup,
        optimizer_params={
            "lr": hp_lr,
            "eps": hp_eps,
            "correct_bias": False
        },
        weight_decay=hp_weight_decay,
    )

    # Score this fold on its validation split.
    fold_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        val_fold, name="val_set", main_similarity=SimilarityFunction.COSINE)
    score = fold_evaluator(sbert)
    print("######################################################")
    print("test result:", score)
    print("######################################################")
    return 0.0 if math.isnan(score) else score
def create_hirerachy_examples(fl, data_dir, model, validate=None, is_test=False):
    """Build training data (and optionally a dev evaluator) for the class-hierarchy task.

    Converts class-pair distances into similarity labels (larger distance ->
    label closer to 0), caps the dataset at 100k examples, and wraps it in a
    DataLoader + cosine-similarity loss.

    Relies on module-level globals: max_size, hierarchy_str, num_epochs,
    batch_size, evaluation_steps.

    Args:
        fl: JSON file name inside data_dir.
        data_dir: Directory containing the data file.
        model: SentenceTransformer used by SentencesDataset.
        validate: When equal to the global hierarchy_str, a 10% stratified
            dev split and an EmbeddingSimilarityEvaluator are created.
        is_test: When True, return only the raw InputExample list.

    Returns:
        The InputExample list when is_test, otherwise a tuple
        (train_dataloader, train_loss, evaluator, warmup_steps).
    """
    train_hierarchy_samples = []
    disbn = []  # raw distances, used as stratification labels below
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    # First pass: find the maximum distance for normalization.
    max_distance = 0
    for obj in data:
        if obj['distance'] > max_distance:
            max_distance = obj['distance']
    for obj in data:
        # flip the meaning of similarity, since the more distant the two classes, the closer to 0 it should be
        # NOTE(review): denominator is (max_distance - 1), not max_distance;
        # this raises ZeroDivisionError when max_distance == 1 and yields
        # labels slightly above 1 for distance == 0 — confirm intent.
        dist = (max_distance - obj['distance']) / (max_distance - 1)
        train_hierarchy_samples.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
        disbn.append(obj['distance'])
    # NOTE(review): only the samples are shuffled — after this point disbn no
    # longer lines up with train_hierarchy_samples, so the stratify=disbn
    # below stratifies on mismatched labels. Verify whether both should be
    # shuffled in lockstep.
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]
    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]
    if is_test:
        return train_hierarchy_samples
    evaluator = None
    if hierarchy_str == validate:
        # Hold out a stratified 10% dev set and build an evaluator on it.
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')
    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up
    train_data_hierarchy = SentencesDataset(train_hierarchy_samples,
                                            model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)
    print('H: Number of training examples: ', len(train_hierarchy_samples))
    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies by 10 — likely meant `* 0.1`;
    # same pattern as the sibling create_* functions.
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    """Build training data (and optionally a dev evaluator) for the class-usage task.

    Min-max-normalizes pair distances into similarity labels in [0, 1]
    (largest distance -> 0, smallest -> 1) and wraps the examples in a
    DataLoader + cosine-similarity loss.

    Relies on module-level globals: max_size, usage_str, num_epochs,
    batch_size, evaluation_steps.

    Args:
        fl: JSON file name inside data_dir.
        data_dir: Directory containing the data file.
        model: SentenceTransformer used by SentencesDataset.
        validate: When equal to the global usage_str, a 10% dev split and an
            EmbeddingSimilarityEvaluator are created.
        is_test: When True, return only the raw InputExample list.

    Returns:
        The InputExample list when is_test, otherwise a tuple
        (train_dataloader, train_loss, evaluator, warmup_steps).
    """
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    # First pass: find the distance range for min-max normalization.
    min_d = 10000000  # sentinel larger than any expected distance
    max_d = 0
    for obj in data:
        dist = obj['distance']
        if dist < min_d:
            min_d = dist
        if dist > max_d:
            max_d = dist
    for obj in data:
        # NOTE(review): raises ZeroDivisionError when all distances are equal
        # (max_d == min_d) — confirm the data guarantees a spread.
        dist = (max_d - obj['distance']) / (max_d - min_d)
        train_usage.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
    random.shuffle(train_usage)
    if is_test:
        # Test path returns the full (untruncated) example list.
        return train_usage
    if max_size:
        train_usage = train_usage[:max_size]
    evaluator = None
    if usage_str == validate:
        # Hold out 10% as a dev set and build an evaluator on it.
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up
    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)
    print('U: Number of training examples: ', len(train_usage))
    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies by 10 — likely meant `* 0.1`;
    # same pattern as the sibling create_* functions.
    evaluation_steps = math.ceil(len(train_usage) / 0.1)
    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps
def test_model(model_dir, languages, crossings, params):
    """Evaluate the saved model on mono-, cross-lingual and combined STS-B test splits.

    The per-split evaluator scores are stored into `params`, printed, and
    dumped to <model_dir>/test_results.json.
    """
    # Load all data incl. crossings and sanity-check the expected split sizes.
    test_data_stsb_cross = load_test_data(languages, crossings)
    assert len(test_data_stsb_cross) == 4
    assert len(test_data_stsb_cross[0]) == 1379

    lang_1_stsb = test_data_stsb_cross[0]
    lang_2_stsb = test_data_stsb_cross[1]
    assert len(lang_1_stsb) == len(lang_2_stsb) == 1379

    # Test data of the crossings only.
    lang_cross_stsb = list(
        itertools.chain.from_iterable(test_data_stsb_cross[2:]))
    assert len(lang_cross_stsb) == 1379 * 2

    # Test data of lang1, lang2 and the crossings combined.
    lang_all_stsb = list(itertools.chain.from_iterable(test_data_stsb_cross))
    assert len(lang_all_stsb) == 1379 * 4

    # Convert every split to sentence_transformers InputExamples up front,
    # pairing each with its evaluator name and the params key for its result.
    splits = [
        (to_input_example(lang_1_stsb), "test_lang_1_stsb",
         "lang_1_test_result_spearman"),
        (to_input_example(lang_2_stsb), "test_lang_2_stsb",
         "lang_2_test_result_spearman"),
        (to_input_example(lang_cross_stsb), "test_lang_cross_stsb",
         "lang_cross_test_result_spearman"),
        (to_input_example(lang_all_stsb), "test_lang_all_stsb",
         "lang_all_test_result_spearman"),
    ]

    # Load the trained model from disk and score each split in turn.
    model = SentenceTransformer(model_dir)
    for examples, eval_name, result_key in splits:
        split_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            examples,
            name=eval_name,
            main_similarity=SimilarityFunction.COSINE)
        params[result_key] = split_evaluator(model)

    params["languages"] = languages
    print("test_results:", params)
    with open(os.path.join(model_dir, "test_results.json"), "w") as outfile:
        json.dump(params, outfile)
# NOTE(review): fragment — the enclosing load_data() definition and its loop
# begin above this excerpt; `sample`, `train_examples`, `validation_examples`
# and `batch_size` are defined there.
    else:
        # Samples not selected for training go to the validation pool.
        validation_examples.append(sample)
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=batch_size)
    return train_dataloader, validation_examples


if __name__ == "__main__":
    args = parser.parse_args()
    # Build the model + loss matching the requested encoder style.
    model, train_loss = construct_model(args.base_model, args.encoder_style)
    train_dataloader, validation_examples = load_data(
        args.training_data_file, batch_size=args.batch_size)
    # Bi-encoders use an embedding-similarity evaluator; cross-encoders use a
    # correlation evaluator over predicted scores.
    if args.encoder_style == BIENCODER:
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            validation_examples)
    else:
        evaluator = CECorrelationEvaluator.from_input_examples(
            validation_examples)
    if args.encoder_style == BIENCODER:
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=args.num_epochs,
                  warmup_steps=args.warmup_steps,
                  evaluator=evaluator,
                  evaluation_steps=500,
                  optimizer_params={'lr': args.lr},
                  output_path=args.output_training_directory,
                  use_amp=True)
    elif args.encoder_style == CROSSENCODER:
        model.fit(train_dataloader=train_dataloader,
                  evaluator=evaluator,
                  # NOTE(review): call truncated in this excerpt; the
                  # remaining keyword arguments follow in the full file.
# NOTE(review): fragment — this excerpt opens mid-call; the receiver is
# presumably `pooling_model = models.Pooling(` defined above.
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_data) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model, evaluating on the dev set every 1000 steps.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
# NOTE(review): fragment — `train_samples`, `model`, `label2int`,
# `sts_dataset_path` and `train_batch_size` are defined above this excerpt;
# the final model.fit(...) call is truncated at the end.
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
# Softmax classification loss over the NLI label set.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int))

#Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(row['score']) / 5.0  #Normalize score to range 0 ... 1
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, batch_size=train_batch_size, name='sts-dev')

# Configure the training
num_epochs = 1
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          # NOTE(review): call truncated in this excerpt; remaining keyword
          # arguments follow in the full file.
# NOTE(review): fragment — this excerpt opens mid-call; the receiver is
# presumably `sts_reader = STSBenchmarkDataReader(` defined above.
    os.path.join(script_folder_path, args.sts_corpus))

# Build one (optionally whitening-based) similarity evaluator per STS task,
# grouped into `evaluators` by the task's 5-character prefix.
for idx, target in enumerate(target_eval_files):
    output_filename_eval = os.path.join(
        script_folder_path, args.sts_corpus + target + "-test.csv")
    if args.whitening:
        evaluators[target[:5]].append(
            WhiteningEmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                measure_data_num=target_eval_data_num[idx],
                embed_dim=args.embed_dim,
                name=target,
                main_similarity=SimilarityFunction.COSINE))
    else:
        evaluators[target[:5]].append(
            EmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                name=target,
                main_similarity=SimilarityFunction.COSINE))

# Run each task's evaluator group sequentially, averaging its scores, and
# log all per-task results plus the overall mean (scaled to 0-100).
all_results = []
logger_text = ""
for task, sequential_evaluator in evaluators.items():
    result = model.evaluate(
        SequentialEvaluator(
            sequential_evaluator,
            main_score_function=lambda scores: np.mean(scores)))
    logger_text += "%.2f \t" % (result * 100)
    all_results.append(result * 100)
logger.info(" \t".join(target_eval_tasks) + " \tOverall.")
logger.info(logger_text + "%.2f" % np.mean(all_results))
# print(sorted_df.info()) # print(sorted_df.head()) # print(sorted_df['Topic'].value_counts()) train, test = train_test_split(sorted_df, stratify=sorted_df['Topic']) test, val = train_test_split(test, stratify=test['Topic']) print("Getting the bert-base-nli-mean-tokens model.") model = SentenceTransformer("bert-base-nli-mean-tokens") print("Read AIML QA dataset") train_dataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size) print("Calculate loss") train_loss = losses.CosineSimilarityLoss(model=model) print("Create evaluator") evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val) # Train the model warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up print("training the model...") model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, evaluation_steps=1000, warmup_steps=warmup_steps, output_path=model_save_path) print("complete") # Development set: Measure correlation between cosine score and gold labels print("evaluating trained model...")
# NOTE(review): fragment — this excerpt opens inside a logging.info(...) call
# whose opening parenthesis is above; `silver_data`, `silver_scores`,
# `gold_samples`, `dev_samples`, `bi_encoder`, `batch_size`, `num_epochs`
# and `bi_encoder_path` are also defined above.
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
# Silver examples pair augmented sentences with their predicted scores.
silver_samples = list(
    InputExample(texts=[data[0], data[1]], label=score)
    for data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               evaluation_steps=1000,
               warmup_steps=warmup_steps,
               output_path=bi_encoder_path)

######################################################################
# NOTE(review): fragment — this excerpt opens mid-call (presumably
# `pooling_model = models.Pooling(` above) and the final model.fit(...) call
# is truncated at the end.
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'),
                                 model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
# Softmax classification loss over the NLI labels.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read STSbenchmark dev dataset")
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'),
    batch_size=train_batch_size,
    name='sts-dev',
    main_similarity=args.main_similarity)

# Configure the training
num_epochs = 1
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          # NOTE(review): call truncated in this excerpt; remaining keyword
          # arguments follow in the full file.
script_folder_path = os.path.dirname(os.path.realpath(__file__)) #Limit torch to 4 threads torch.set_num_threads(4) #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13' # Load a named sentence model (based on BERT). This will download the model from our server. # Alternatively, you can also pass a filepath to SentenceTransformer() model = SentenceTransformer(model_name) sts_corpus = "../datasets/stsbenchmark/" target_eval_files = set(['sts','sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r']) evaluators = [] #evaluators has a list of different evaluator classes we call periodically sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus)) for target in target_eval_files: output_filename_eval = os.path.join(script_folder_path,sts_corpus + target + "-test.csv") evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target)) evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)) model.evaluate(evaluator)
# NOTE(review): fragment — `dev_pair_corpus`, `num_sim_dev_pairs`,
# `train_pair_corpus`, `args` and `device` are defined above this excerpt.
print("Number of dissimilar pairs in dev data: ",
      len(dev_pair_corpus) - num_sim_dev_pairs)
print("total validation data size: ", len(dev_pair_corpus))

# modeling
model_variant = args.variant
print("model variant is: ", model_variant)
path = args.output_path
print("output path: ", path)

# word_embedding_model = models.Transformer("roberta-base")
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
#                                pooling_mode_mean_tokens=True,
#                                pooling_mode_cls_token=False,
#                                pooling_mode_max_tokens=False)
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model = SentenceTransformer(model_variant)  # fine-tune on top of SBERT
model.to(device)

train_dataloader = DataLoader(train_pair_corpus, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_pair_corpus,
                                                             name='dev')

# The `4` here matches the epochs=4 passed to fit below.
warmup_steps = math.ceil(len(train_dataloader) * 4 *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=4,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=path)
# Evaluation script: loads a (possibly CLI-specified) SentenceTransformer and
# scores it on the STSbenchmark test split.
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Model name is the first CLI argument; defaults to a pretrained NLI model.
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, '../datasets/stsbenchmark'))

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))
model.evaluate(evaluator)
# NOTE(review): fragment — `data_dir`, `model`, `args`,
# `test_hierarchy_samples`, `test_usage`, `test_linked_posts` and
# `test_class_posts` are defined above this excerpt; the last evaluator
# construction is truncated at the end.
test_posts_ranking = create_posts_ranking(
    'stackoverflow_data_ranking_v2_testing.json',
    data_dir,
    model,
    validate=None,
    is_test=True)
test_search = create_search(
    'stackoverflow_matches_codesearchnet_5k_test_collection.tsv',
    'stackoverflow_matches_codesearchnet_5k_test_queries.tsv',
    'stackoverflow_matches_codesearchnet_5k_test_blanca-qidpidtriples.train.tsv',
    data_dir,
    model,
    validate=None,
    is_test=True)

# Regression-style tasks use similarity evaluators; binary tasks use
# classification evaluators. Each writes its results to the output dir.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_hierarchy_samples, name='test-hierarchy-samples')
test_evaluator(model, output_path=args.output_dir)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_posts_ranking, name='test-post-ranking')
test_evaluator(model, output_path=args.output_dir)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_usage, name='test-usage')
test_evaluator(model, output_path=args.output_dir)
test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    test_linked_posts, name='test-linked-posts')
test_evaluator(model, output_path=args.output_dir)
test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    test_class_posts, name='test-class-posts')
test_evaluator(model, output_path=args.output_dir)
test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    # NOTE(review): call truncated in this excerpt; the arguments follow in
    # the full file.
# NOTE(review): fragment — `train_samples`, `test_samples`,
# `word_embedding_model`, `train_batch_size`, `num_epochs` and OUTPUT_MODEL
# are defined above this excerpt; the final fit(...) call is truncated.
import math
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Cap the training set at 100k samples.
train_dataloader = DataLoader(train_samples[:100000],
                              shuffle=True,
                              batch_size=train_batch_size)
# train_dataloader = DataLoader(train_samples[:80000], shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=word_embedding_model)

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read SNLI benchmark dev dataset")
# NOTE(review): the evaluator is built from test_samples but named
# 'snli-test' and used during training — confirm dev/test split intent.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples, name='snli-test')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("checkpoint_save_steps: {}".format(10 * len(train_dataloader)))

# Train the model
word_embedding_model.fit(train_objectives=[(train_dataloader, train_loss)],
                         evaluator=evaluator,
                         epochs=num_epochs,
                         evaluation_steps=10000,
                         warmup_steps=warmup_steps,
                         output_path=OUTPUT_MODEL,
                         # NOTE(review): call truncated in this excerpt; the
                         # remaining keyword arguments follow in the full file.
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' set_seed(args) # Read the dataset train_batch_size = args.batch_size model_save_path = args.model_path model = SentenceTransformer(model_save_path) folder = '../datasets/temp-sts/STS-data' #'STS2012-gold','STS2013-gold','STS2014-gold','STS2015-gold', names = [ 'STS2012-gold', 'STS2013-gold', 'STS2014-gold', 'STS2015-gold', 'STS2016-gold', 'SICK-data' ] for name in names: sts_reader = STSDataReader(os.path.join(folder, name)) test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples('all.tsv'), batch_size=train_batch_size, name=name + '-test') test_evaluator(model, output_path=model_save_path)
def main(args):
    """Train a SentenceTransformer on a custom gzip'd TSV similarity dataset.

    Either continues from a pretrained model (args.pretrained) or builds a
    fresh Transformer + mean-pooling model, splits the data 80/20 into
    train/validation, and fine-tunes with cosine-similarity loss.

    Args:
        args: Parsed CLI namespace providing pretrained, save_path and
            data_path attributes.
    """
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        # Continue training an existing model; save path is derived from its name.
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")
    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # NOTE(review): dividing by 10 implies raw scores span 0-10 —
            # confirm against the custom dataset's scale.
            score = float(row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
    from sklearn.model_selection import train_test_split
    # 80/20 train/validation split of the full example list.
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)
    # import ipdb; ipdb.set_trace()
    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # import ipdb; ipdb.set_trace()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
def test(model_save_path, sts_dataset_path, train_batch_size):
    """Load the saved model and score it on the STS test split.

    Evaluation results are written into the model directory.
    """
    samples = read_dataset(sts_dataset_path, "test")
    trained_model = SentenceTransformer(model_save_path)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        samples, batch_size=train_batch_size, name='sts-test')
    evaluator(trained_model, output_path=model_save_path)