# Multi-task training: NLI (SoftmaxLoss) + STSbenchmark (CosineSimilarityLoss).
num_epochs = 4

# Warm up the learning rate over the first 10% of STS training steps.
warmup_steps = math.ceil(len(train_data_sts) * num_epochs / batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Here we define the two train objectives: train_dataloader_nli with train_loss_nli (i.e., SoftmaxLoss for NLI data)
# and train_dataloader_sts with train_loss_sts (i.e., CosineSimilarityLoss for STSbenchmark data)
# You can pass as many (dataloader, loss) tuples as you like. They are iterated in a round-robin way.
train_objectives = [(train_dataloader_nli, train_loss_nli), (train_dataloader_sts, train_loss_sts)]

# Train the model
model.fit(train_objectives=train_objectives,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
# NOTE(review): the test evaluator is built here but the evaluate() call is not
# visible in this chunk — presumably it follows below; verify against the full file.
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
def main():
    """Fine-tune a SentenceTransformer bi-encoder for conversation response
    ranking (CRR) with a configurable negative-sampling strategy, then
    evaluate full retrieval (R@10 / R@1) on the test split.

    Results are logged to wandb and written to --output_dir.
    """
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True, help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True, help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True, help="the folder to output predictions")
    parser.add_argument("--negative_sampler", default="random", type=str, required=False, help="negative sampling procedure to use ['random', 'bm25', 'sentence_transformer']")
    parser.add_argument("--anserini_folder", default="", type=str, required=True, help="Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sentence_bert_ns_model", default="all-MiniLM-L6-v2", type=str, required=False, help="model to use for sentenceBERT negative sampling.")
    parser.add_argument('--denoise_negatives', dest='denoise_negatives', action='store_true')
    parser.add_argument('--no-denoise_negatives', dest='denoise_negatives', action='store_false')
    parser.set_defaults(denoise_negatives=False)
    parser.add_argument("--num_ns_for_denoising", default=100, type=int, required=False, help="Only used for --denoise_negatives. Number of total of samples to retrieve and get the bottom 10.")
    parser.add_argument("--generative_sampling_model", default="all-MiniLM-L6-v2", type=str, required=False, help="model to use for generating negative samples on the go.")
    parser.add_argument('--remove_cand_subsets', dest='remove_cand_subsets', action='store_true')
    parser.add_argument('--dont_remove_cand_subsets', dest='remove_cand_subsets', action='store_false')
    parser.set_defaults(remove_cand_subsets=True)

    # Which part of the context we use to sample negatives.
    parser.add_argument('--last_utterance_only', dest='last_utterance_only', action='store_true')
    parser.add_argument('--all_utterances', dest='last_utterance_only', action='store_false')
    parser.set_defaults(last_utterance_only=False)

    # External corpus to augment negative sampling
    parser.add_argument('--external_corpus', dest='use_external_corpus', action='store_true')
    parser.add_argument('--dont_use_external_corpus', dest='use_external_corpus', action='store_false')
    parser.set_defaults(use_external_corpus=False)

    # Training procedure
    parser.add_argument("--num_epochs", default=3, type=int, required=False, help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False, help="Training batch size.")

    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False, help="Bert model to use (default = bert-base-cased).")
    parser.add_argument("--loss", default='MultipleNegativesRankingLoss', type=str, required=False, help="Loss function to use ['MultipleNegativesRankingLoss', 'TripletLoss', 'MarginMSELoss']")

    ## Wandb project name
    parser.add_argument("--wandb_project", default='train_sentence_transformer', type=str, required=False, help="name of the wandb project")
    parser.add_argument("--seed", default=42, type=int, required=False, help="Random seed.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    max_seq_length = 300
    if args.transformer_model == 'all-mpnet-base-v2' or args.transformer_model == 'msmarco-bert-base-dot-v5':
        # Pre-built sentence-transformers checkpoints: load directly.
        model = SentenceTransformer(args.transformer_model)
        model.max_seq_length = max_seq_length
    else:
        # Plain HF transformer: add dialogue separator tokens and mean pooling.
        word_embedding_model = models.Transformer(args.transformer_model, max_seq_length=max_seq_length)
        tokens = ['[UTTERANCE_SEP]', '[TURN_SEP]', '[AUG]']
        word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    eval_only = False
    if eval_only:
        logging.info("Skipping training (eval_only=True)")
    else:
        logging.info("Creating train CRR dataset for {} using {}.".format(args.task, args.negative_sampler))
        crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))
        train_data = crr_reader.get_examples("train.tsv", args.negative_sampler,
                                             args.anserini_folder, args.sentence_bert_ns_model,
                                             args.loss, args.output_dir, True, False,
                                             args.denoise_negatives, args.num_ns_for_denoising,
                                             args.generative_sampling_model,
                                             args.remove_cand_subsets,
                                             args.last_utterance_only,
                                             args.use_external_corpus)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)

        if args.loss == 'MultipleNegativesRankingLoss':
            train_loss = losses.MultipleNegativesRankingLoss(model=model, similarity_fct=util.dot_score)
        elif args.loss == 'MarginMSELoss':
            train_loss = losses.MarginMSELoss(model=model)
        elif args.loss == 'TripletLoss':
            train_loss = losses.TripletLoss(model=model)
        elif args.loss == 'ContrastiveLoss':
            train_loss = losses.ContrastiveLoss(model=model)
        elif args.loss == 'OnlineContrastiveLoss':
            train_loss = losses.OnlineContrastiveLoss(model=model)
        else:
            # FIX: previously an unknown --loss fell through silently and
            # crashed later with NameError on train_loss; fail fast instead.
            raise ValueError("Unsupported loss function: {}".format(args.loss))

    ns_description = args.negative_sampler
    if args.negative_sampler == 'sentence_transformer':
        ns_description += "_{}".format(args.sentence_bert_ns_model)
    if args.negative_sampler == 'generative':
        ns_description += "_{}".format(args.generative_sampling_model)

    wandb.init(project=args.wandb_project)
    wandb.config.update(args)

    if not eval_only:
        # This is the eval data for the training, not the actual evaluation.
        logging.info("Getting eval data")
        examples_dev = crr_reader.get_examples('valid.tsv', args.negative_sampler,
                                               args.anserini_folder, args.sentence_bert_ns_model,
                                               args.loss, args.output_dir, eval_data=True)
        # Keep 500 queries; each query contributes 1 positive + 10 negatives = 11 rows.
        examples_dev = examples_dev[0:(11 * 500)]
        eval_samples = []
        docs = []
        for i, example in enumerate(examples_dev):
            if (i + 1) % 11 == 0:
                eval_samples.append({'query': example.texts[0],
                                     'positive': [example.texts[1]],
                                     'negative': docs})
                docs = []
            else:
                docs.append(example.texts[2])
        evaluator = RerankingEvaluator(eval_samples, write_csv=True, similarity_fct=util.dot_score)

        warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.train_batch_size * 0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))
        logging.info("Fitting sentenceBERT for {}".format(args.task))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=args.num_epochs,
                  evaluation_steps=100,
                  steps_per_epoch=10000,
                  warmup_steps=warmup_steps,
                  output_path=args.output_dir + "{}_{}_ns_{}_loss_{}".format(args.transformer_model, args.task, ns_description, args.loss))

    logging.info("Evaluating for full retrieval of responses to dialogue.")
    train = pd.read_csv(args.data_folder + args.task + "/train.tsv", sep="\t")
    test = pd.read_csv(args.data_folder + args.task + "/test.tsv", sep="\t")

    ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(
        list(train["response"].values) + list(test["response"].values), 10,
        args.data_folder + args.task + "/test_sentenceBERTembeds", -1,
        args.output_dir + "{}_{}_ns_{}_loss_{}".format(args.transformer_model, args.task, ns_description, args.loss),
        use_cache_for_embeddings=False)

    ns_info = [
        (ns_test_sentenceBERT,
         ["cand_sentenceBERT_{}".format(i) for i in range(10)] + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"],
         'sentenceBERT')
    ]
    examples = []
    examples_cols = ["context", "relevant_response"] + \
        reduce(lambda x, y: x + y, [t[1] for t in ns_info])
    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_df = []
    for idx, row in enumerate(tqdm(test.itertuples(index=False), total=len(test))):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]
        for sampler, _, ns_name in ns_info:
            # FIX: the inner loop previously reused the name `ns` for each
            # candidate, shadowing the sampler variable of this loop.
            ns_candidates, scores, had_relevant, rank_relevant, _ = sampler.sample(context, [relevant_response])
            instance.extend(ns_candidates)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            r10 = 1 if had_relevant else 0
            r1 = 1 if rank_relevant == 0 else 0
            recall_df.append([r10, r1])
        examples.append(instance)

    recall_df = pd.DataFrame(recall_df, columns=["R@10", "R@1"])
    examples_df = pd.DataFrame(examples, columns=examples_cols)

    logging.info("R@10: {}".format(examples_df[[c for c in examples_df.columns if 'retrieved_relevant' in c]].sum() / examples_df.shape[0]))
    wandb.log({'R@10': (examples_df[[c for c in examples_df.columns if 'retrieved_relevant' in c]].sum() / examples_df.shape[0]).values[0]})
    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    logging.info("R@1: {}".format(examples_df[examples_df[rank_col] == 0].shape[0] / examples_df.shape[0]))
    wandb.log({'R@1': examples_df[examples_df[rank_col] == 0].shape[0] / examples_df.shape[0]})

    recall_df.to_csv(args.output_dir + "/recall_df_{}_{}_ns_{}_loss_{}.csv".format(
        args.transformer_model.replace("/", "-"), args.task,
        ns_description.replace("/", "-"), args.loss), index=False, sep="\t")
shuffle=True, batch_size=batch_size) train_loss = losses.CosineSimilarityLoss(model=model) logging.info("Read STSbenchmark dev dataset") evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev') # Configure the training num_epochs = 10 warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up logging.info("Warmup-steps: {}".format(warmup_steps)) # Train the model model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, warmup_steps=warmup_steps, output_path=model_save_path) ############################################################################## # # Load the stored model and evaluate its performance on STS benchmark dataset # ############################################################################## model = SentenceTransformer(model_save_path) test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples( test_samples, name='sts-test') model.evaluate(evaluator)
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size * 0.1)  #10% of train data for warm-up

# Stopping and Evaluating after 30% of training data (less than 1 epoch)
# We find from (Dodge et al.) that 20-30% is often ideal for convergence of random seed
steps_per_epoch = math.ceil(
    len(train_dataset) / train_batch_size * stop_after)

logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("Early-stopping: {}% of the training-data".format(
    int(stop_after * 100)))

# Train the model (steps_per_epoch truncates each epoch for early stopping).
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          steps_per_epoch=steps_per_epoch,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          output_path_ignore_not_empty=True)
        # (continuation of __init__ begun above this chunk)
        self.model = model
        self.queries = queries          # qid -> query text
        self.corpus = corpus            # doc id -> passage text
        self.triplets_file = triplets_file  # gzipped file of (qid, pos_id, neg_id) lines

    def __iter__(self):
        # Stream triplets lazily from disk; ids are resolved to texts here so
        # the full text corpus never needs to be duplicated in the file.
        with gzip.open(self.triplets_file, 'rt') as fIn:
            for line in fIn:
                qid, pos_id, neg_id = line.strip().split()
                query_text = self.queries[qid]
                pos_text = self.corpus[pos_id]
                neg_text = self.corpus[neg_id]
                yield InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        # Hard-coded triplet count of the training file — presumably the
        # MS MARCO triplets dump; TODO confirm it matches triplets_file.
        return 397226027

# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = TripletsDataset(model=model, queries=queries, corpus=corpus, triplets_file=train_filepath)
# shuffle=False because the dataset is an iterable stream, not indexable.
train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=ir_evaluator,
          epochs=1,
          warmup_steps=1000,
          output_path=model_save_path,
          evaluation_steps=5000,
          use_amp=True
          )
def train_nli():
    """Fine-tune a BERT-based SentenceTransformer on AllNLI with SoftmaxLoss
    and monitor label accuracy on the STSbenchmark dev split.
    """
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    # Label-accuracy evaluator: scores the SoftmaxLoss classification head
    # (Softmax_label is a project-local helper — TODO confirm its contract).
    evaluator = LabelAccuracyEvaluator(dev_dataloader,softmax_model = Softmax_label(model = model, sentence_embedding_dimension = model.get_sentence_embedding_dimension(), num_labels = train_num_labels))

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path
              )

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    #model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    #evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    # NOTE(review): this re-runs the *dev* label-accuracy evaluator; the test
    # dataloader built above is unused — looks intentional (similarity eval is
    # commented out) but verify.
    model.evaluate(evaluator)
# (continuation: inside a loop over language pairs whose header is above this chunk)
# Normalize pair order so the STS2017 filename lookup matches the archive.
lang1, lang2 = lang2, lang1
filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)
if filepath in filelist:
    filename = os.path.basename(filepath)
    sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []}

    # Read tab-separated (sent1, sent2, gold score) rows from the zip member.
    fIn = zip.open(filepath)
    for line in io.TextIOWrapper(fIn, 'utf8'):
        sent1, sent2, score = line.strip().split("\t")
        score = float(score)
        sts_data[filename]['sentences1'].append(sent1)
        sts_data[filename]['sentences2'].append(sent2)
        sts_data[filename]['scores'].append(score)

# One similarity evaluator per cross-lingual STS file.
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=inference_batch_size, name=filename, show_progress_bar=False)
    evaluators.append(test_evaluator)

# Train the model; the sequential evaluator's main score is the mean over all files.
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
                  epochs=num_epochs,
                  warmup_steps=num_warmup_steps,
                  evaluation_steps=num_evaluation_steps,
                  output_path=output_path,
                  save_best_model=True,
                  optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
                  )
# Alternative loss kept for experimentation:
#train_loss = losses.BatchSemiHardTripletLoss(model=model)

logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name='trec-dev')

logging.info("Performance before fine-tuning:")
dev_evaluator(model)

# Warm up the learning rate over the first 10% of the training steps.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

# Train the model; collect the fit configuration first for readability.
fit_config = dict(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)
model.fit(**fit_config)

##############################################################################
#
# Load the stored model and evaluate its performance on TREC dataset
#
##############################################################################

logging.info("Evaluating model on test set")
test_evaluator = TripletEvaluator.from_input_examples(test_set, name='trec-test')
model.evaluate(test_evaluator)
def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    """Fine-tune a SentenceTransformer bi-encoder for pairwise text matching
    and sweep cosine-similarity thresholds on the test set.

    :param path_train: training CSV with columns text_a, text_b, label
    :param path_valid: validation CSV with the same columns
    :param path_test: test CSV with the same columns
    :param path_error: output CSV path for misclassified test rows
    :param epochs_num: number of training epochs
    :param warmup_steps_num: LR warm-up steps
    :param evaluation_steps_num: run the evaluator every N steps
    """
    # Progress-bar helper
    bar = progressbar

    # Define the model
    #model = SentenceTransformer('bert-large-nli-stsb-mean-tokens',device='cuda:1')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens', device='cuda:6')
    #model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens',device='cuda:4')

    # Truncate over-long fields at read time.
    def auto_truncate(val):
        return val[:1500]

    train_data = pd.read_csv(path_train, converters={'text_a': auto_truncate, 'text_b': auto_truncate})
    valid_data = pd.read_csv(path_valid, converters={'text_a': auto_truncate, 'text_b': auto_truncate})
    test_data = pd.read_csv(path_test, converters={'text_a': auto_truncate, 'text_b': auto_truncate})

    # Training set -> InputExamples
    # (removed the per-row time.sleep() and the leftover `print(InputExample)`
    # debug statement that printed the class object on every row)
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        text_a = str(train_data.iloc[i]['text_a'])
        text_b = str(train_data.iloc[i]['text_b'])
        label_data = float(train_data.iloc[i]['label'])
        train_examples.append(InputExample(texts=[text_a, text_b], label=label_data))

    # Validation set
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        sentence_a.append(valid_data.iloc[i]['text_a'])
        sentence_b.append(valid_data.iloc[i]['text_b'])
        label_valid.append(float(valid_data.iloc[i]['label']))

    # Define the evaluator
    #evaluator = evaluation.EmbeddingSimilarityEvaluator(sentence_a, sentence_b, label_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(sentence_a, sentence_b, label_valid)

    # Dataset, dataloader and loss
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
    train_loss = losses.CosineSimilarityLoss(model)

    # Time the training. FIX: time.clock() was removed in Python 3.8 —
    # use the monotonic perf_counter instead.
    start_time = time.perf_counter()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ========================================= evaluation ===================================================
    # Re-read the test set and force both text columns to str.
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # One prediction list per candidate threshold, each seeded with four
    # 0.001 pseudo-counts (TP/FP/TN/FN accumulators — see compute_score).
    list_num = 38
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [0.001, 0.001, 0.001, 0.001] for i in range(list_num)}

    label_list = []
    score = 0.20

    # Rows the model got wrong, collected in a plain list.
    # FIX: DataFrame.append was removed in pandas 2.0 — build once at the end.
    error_rows = []

    # Iterate over the test set.
    for i in bar.progressbar(range(len(test_data))):
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'], convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'], convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0]
        cos_scores = cos_scores.cpu()

        # Gold label
        label = int(test_data.iloc[i]['label'])
        label_list.append(label)

        # Record misclassified rows (fixed 0.80 decision threshold).
        pred_test = 1 if cos_scores >= 0.80 else 0
        if pred_test != label:
            error_rows.append({'id': i,
                               'text_a': test_data.iloc[i]['text_a'],
                               'text_b': test_data.iloc[i]['text_b'],
                               'cos_scores': cos_scores})

        # Update the per-threshold prediction accumulators.
        statistics_pred(score, label, cos_scores, prefix, test_map)
        #compute_pred(score,cos_scores,prefix,test_map)

    error_csv = pd.DataFrame(error_rows, columns=('id', 'text_a', 'text_b', 'cos_scores'))
    error_csv.to_csv(path_error, index=0)

    # Sweep thresholds and keep the best F1.
    max_f1 = 0
    target_threshold = 0.01
    target_precision = 0.01
    target_recall = 0.01
    threshold = 0.20
    for i in range(len(test_map.keys())):
        precision, recall, f1 = compute_score(test_map[prefix + str(i)][0],
                                              test_map[prefix + str(i)][1],
                                              test_map[prefix + str(i)][2],
                                              test_map[prefix + str(i)][3])
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_precision = precision
            target_recall = recall
        print('The score > {} result is precision: {}, | recall:{}, | f1: {}'.
              format(round(threshold, 2), precision, recall, f1))
        threshold += 0.02

    # Final summary. FIX: `path_a` was undefined (NameError) — report the
    # test-set path that was actually evaluated.
    print('================dataset_name==================', path_test)
    print(
        '================threshold:{}, target_precision:{}, target_recall:{}, max_f1:{}'
        .format(target_threshold, target_precision, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
                                      label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, batch_size=train_batch_size, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model; evaluate every 10% of an epoch.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False  #Set to True, if your GPU supports FP16 operations
)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    # (loop body continues beyond this chunk)
    for row in reader:
# Pad the IR corpus with random distractor questions up to max_corpus_size.
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
for qid in other_qid_list[0:max(0, max_corpus_size - len(ir_corpus))]:
    ir_corpus[qid] = distraction_questions[qid]

#Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR
# metrices. For our use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs)

evaluators.append(ir_evaluator)

# Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1])
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators, main_score_function=lambda scores: scores[-1])

logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model with two round-robin objectives (MNRL + contrastive).
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss),
                            (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss)],
          evaluator=seq_evaluator,
          epochs=num_epochs,
          warmup_steps=1000,
          output_path=model_save_path,
          output_path_ignore_not_empty=True)
# TSDAE objective: denoising auto-encoder with tied encoder/decoder weights.
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

evaluation_steps = 1000
logging.info("Training sentences: {}".format(len(train_sentences)))
logging.info("Performance before training")
dev_evaluator(model)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    output_path=model_save_path,
    weight_decay=0,
    warmup_steps=100,
    optimizer_params={'lr': 3e-5},
    use_amp=True  #Set to True, if your GPU supports FP16 cores
)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """
    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32

    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override: If False, refuse to overwrite an existing output_path
    :return:
    """
    top_ks = [5, 10, 25, 50]
    # cuda_device = -1

    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False
    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    # Guard against clobbering a previous run unless --override is given.
    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env: fall back to <bert_dir>/<name> if the path
    # does not exist locally.
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path, max_seq_length=max_token_length)
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')
    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training: fit on the train split, evaluate on the test split
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=train_epochs,  # try 1-4
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=
        evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
        output_path=output_path,
        output_path_ignore_not_empty=True)

    logger.info('Training done')
                             model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_data) * args.num_epochs / args.batch_size * 0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=args.num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=args.ckpt_path)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(args.ckpt_path)
# Vietnamese STS test split.
test_data = SentencesDataset(
    examples=sts_reader.get_examples("sts-test_vi.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size)
                       max_sentence_length=256)
train_data.add_dataset([[sent] for sent in train_sentences_wikipedia],
                       max_sentence_length=256)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
# Student is trained to reproduce the teacher's embeddings (MSE distillation).
train_loss = losses.MSELoss(model=student_model)

# We create an evaluator, that measure the Mean Squared Error (MSE) between the teacher and the student embeddings
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model)

# Train the student model to imitate the teacher
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(
                      [dev_evaluator_sts, dev_evaluator_mse]),
                  epochs=1,
                  warmup_steps=1000,
                  evaluation_steps=5000,
                  output_path=output_path,
                  save_best_model=True,
                  optimizer_params={
                      'lr': 1e-4,
                      'eps': 1e-6,
                      'correct_bias': False
                  },
                  use_amp=True)
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               evaluation_steps=1000,
               warmup_steps=warmup_steps,
               output_path=bi_encoder_path)

#################################################################################
#
# Evaluate cross-encoder and Augmented SBERT performance on STS benchmark dataset
#
#################################################################################

# load the stored augmented-sbert model
bi_encoder = SentenceTransformer(bi_encoder_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples, name='sts-test')
test_evaluator(bi_encoder, output_path=bi_encoder_path)
# Fine-tune a RoBERTa-large bi-encoder on pre-serialized MS MARCO dataloaders.
torch.cuda.empty_cache()

my_model_path = 'msmarco/models/test_model5'
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

# Dataloaders were pickled to disk by an earlier preparation step.
dev_dataloader = torch.load(os.path.join('msmarco/models/test_model5', 'dev_dataloader.pth'))
train_dataloader = torch.load(os.path.join('msmarco/models/test_model5', 'train_dataloader.pth'))

# Main optimization target is the binary evaluator (scores[0]).
evaluator1 = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)
evaluator2 = EmbeddingSimilarityEvaluator(dev_dataloader)
evaluator = SequentialEvaluator([evaluator1, evaluator2], main_score_function = lambda scores: scores[0])

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-6, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.CosineSimilarityLoss(model=model_1)

num_epochs = 100
warmup_steps = math.ceil(len(train_dataloader.dataset)*num_epochs / train_dataloader.batch_size*0.1)  #10% of train data for warm-up

model_1.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            steps_per_epoch=1000,
            warmup_steps=warmup_steps,
            optimizer_class=optimizer_class,
            optimizer_params=optimizer_params,
            output_path=os.path.join(my_model_path, 'model_lre06_not_od'))  # works only when you have an evaluator

model_1.save(os.path.join(my_model_path, 'model_lre06_not_od_final'))