def main(model_path, model_type, extra_dataset):
    """Continue training a sentence-embedding model with a triplet objective.

    Reads 'train.tsv' and 'dev.tsv' through a ``TripletReader`` rooted at
    ``extra_dataset``, trains with ``losses.TripletLoss`` for 20 epochs at
    batch size 64, evaluates on the dev triplets once per epoch, and saves to
    ``model_path + '_continue_training_' + <timestamp>``.

    Args:
        model_path: Model name or path to start from. It is also the prefix of
            the output directory.
        model_type: ``"bert"`` (case-insensitive) builds the model from
            ``models.BERT(model_path)`` plus mean pooling; any other value
            loads ``model_path`` directly with ``SentenceTransformer``.
        extra_dataset: Dataset folder handed to ``TripletReader``.
    """
    train_batch_size = 64
    num_epochs = 20
    model_save_path = model_path + '_continue_training_' + datetime.now(
    ).strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    # Build the model exactly once. Previously the model assembled here was
    # thrown away and `SentenceTransformer(model_path)` was loaded a second
    # time, so `model_type` had no effect on what was trained.
    if model_type.lower() == "bert":
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    # NOTE(review): `triplet_margin` is not defined inside this function; it
    # must exist at module level or this line raises NameError - confirm.
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Warm up over 10% of the total number of training steps.
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model; `evaluation_steps` equals the number of batches per
    # epoch, so the dev evaluator runs once per epoch.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
def train_contrastive_model(self, slang_ind, params=None, fold_name='default'):
    """Fine-tune 'bert-base-nli-mean-tokens' with a triplet loss for one fold.

    Calls ``self.prep_contrastive_training`` first, then reads
    'contrastive_train.csv' and 'contrastive_dev.csv' from
    ``<self.out_dir>/<fold_name>/SBERT_data/`` and saves the trained model
    under that same folder.

    Args:
        slang_ind: Forwarded unchanged to ``self.prep_contrastive_training``.
        params: Optional training settings. Recognised keys are
            'train_batch_size', 'num_epochs', 'triplet_margin' and 'outpath'.
            Any key that is missing falls back to its default, so a partial
            dict is accepted.
        fold_name: Sub-folder name of the fold being trained.
    """
    defaults = {
        'train_batch_size': 16,
        'num_epochs': 4,
        'triplet_margin': 1,
        'outpath': 'SBERT_contrastive'
    }
    # Merge caller overrides on top of the defaults instead of requiring the
    # caller to supply every key (a partial dict used to raise KeyError).
    settings = dict(defaults)
    if params is not None:
        settings.update(params)
    params = settings

    self.prep_contrastive_training(slang_ind, fold_name=fold_name)

    out_dir = self.out_dir + '/' + fold_name + '/SBERT_data/'
    triplet_reader = TripletReader(out_dir,
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   s3_col_idx=2,
                                   delimiter=',',
                                   has_header=True)
    output_path = out_dir + params['outpath']

    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

    train_data = SentencesDataset(
        examples=triplet_reader.get_examples('contrastive_train.csv'),
        model=sbert_model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=params['train_batch_size'])
    train_loss = losses.TripletLoss(
        model=sbert_model, triplet_margin=params['triplet_margin'])

    dev_data = SentencesDataset(
        examples=triplet_reader.get_examples('contrastive_dev.csv'),
        model=sbert_model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=params['train_batch_size'])
    evaluator = TripletEvaluator(dev_dataloader)

    # Warm up over 10% of the total number of training steps.
    warmup_steps = int(
        len(train_data) * params['num_epochs'] / params['train_batch_size'] *
        0.1)

    # Train the model.
    # NOTE(review): evaluation_steps is the number of dev *examples*, not a
    # number of batches - confirm this evaluation interval is intended.
    sbert_model.fit(train_objectives=[(train_dataloader, train_loss)],
                    evaluator=evaluator,
                    epochs=params['num_epochs'],
                    evaluation_steps=len(dev_data),
                    warmup_steps=warmup_steps,
                    output_path=output_path)
def run_triplets_model(train_triplets, val_cluster_data, test_cluster_data, output_path, train_batch_size,
                       eval_steps, num_epochs, warmup_frac, use_model_device,
                       model_name='distilbert-base-uncased', out_features=256):
    """Train a transformer + mean-pooling + dense model with a triplet loss.

    Args:
        train_triplets: Training examples wrapped in a shuffling DataLoader.
        val_cluster_data: Examples used to build the validation evaluator.
        test_cluster_data: Examples used to build the test evaluator.
        output_path: Passed to the pre-training evaluation and to ``fit``.
        train_batch_size: Batch size of the training DataLoader.
        eval_steps: Forwarded to ``fit`` as ``evaluation_steps``.
        num_epochs: Number of training epochs.
        warmup_frac: Fraction of the total training steps used for warm-up.
        use_model_device: Forwarded to ``ClusterEvaluator.from_input_examples``.
        model_name: Name of the transformer used for token embeddings.
        out_features: Output size of the final dense layer.
    """
    # Register the run with the experiment tracker before any other work.
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_triplets')

    has_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if has_cuda else 'cpu')
    if has_cuda:
        print('CUDA is available and using device: ' + str(device))
    else:
        print('CUDA not available, using device: ' + str(device))

    # Token embeddings -> mean-pooled sentence vector -> tanh-activated dense projection.
    token_encoder = models.Transformer(model_name)
    mean_pooler = models.Pooling(
        token_encoder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    projection = models.Dense(
        in_features=mean_pooler.get_sentence_embedding_dimension(),
        out_features=out_features,
        activation_function=nn.Tanh(),
    )
    model = CustomSentenceTransformer(modules=[token_encoder, mean_pooler, projection])

    train_dataloader = DataLoader(train_triplets, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    # Warm-up covers `warmup_frac` of all optimisation steps.
    total_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(total_steps * warmup_frac)

    # Score the untrained model first so there is a baseline to compare against.
    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    fit_kwargs = dict(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        test_evaluator=test_evaluator,
        epochs=num_epochs,
        evaluation_steps=eval_steps,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
    model.fit(**fit_kwargs)
torch.cuda.empty_cache() my_model_path = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_results/test_wiki' model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth')) train_dataloader = torch.load( os.path.join(my_model_path, 'train_dataloader.pth')) evaluator = TripletEvaluator(dev_dataloader) optimizer_class = transformers.AdamW optimizer_params = {'lr': 2e-4, 'eps': 1e-6, 'correct_bias': False} train_loss = losses.TripletLoss(model=model_wiki) num_epochs = 4 warmup_steps = math.ceil( len(train_dataloader.dataset) * num_epochs / train_dataloader.batch_size * 0.05) #5% of train data for warm-up model_wiki.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, steps_per_epoch=8000, warmup_steps=warmup_steps, optimizer_class=optimizer_class, optimizer_params=optimizer_params, output_path=os.path.join( my_model_path,
# Dump the validation triplets, one tab-separated triple per line.
with open(my_loc + '/proc_data/val-triples.csv', 'w', encoding='utf-8') as f:
    f.writelines('%s\t%s\t%s\n' % (trs[0], trs[1], trs[2])
                 for trs in val_trips)

# Training settings.
num_epochs = 2
batch_size = 8
model_save_path = my_loc + '/models/finetune_%s_%s' % (fname, emb_type)

# Both the train and validation triple files are read from proc_data/.
triplet_reader = TripletReader(dataset_folder=my_loc + '/proc_data/')
model = SentenceTransformer(emb_type)

# Training side: shuffled batches and a triplet loss with margin 1.
train_data = SentencesDataset(
    triplet_reader.get_examples('train-triples.csv'), model)
train_dataloader = DataLoader(
    train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.TripletLoss(model=model, triplet_margin=1)

# Validation side: fixed order, wrapped in a triplet evaluator.
dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('val-triples.csv'), model=model)
dev_dataloader = DataLoader(
    dev_data, shuffle=False, batch_size=batch_size)
evaluator = TripletEvaluator(dev_dataloader)

# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
def train(triplet_data_dir, output):
    """Train BERT + mean pooling on CSV triplets, then score the saved model.

    Reads 'train.csv' and 'validation.csv' from ``triplet_data_dir`` for
    training and periodic evaluation, saves the model to ``output`` followed
    by a timestamp, reloads it from disk and evaluates it on 'test.csv'.

    Args:
        triplet_data_dir: Folder handed to ``TripletReader``.
        output: Prefix of the output path; a timestamp is appended.
    """
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    batch_size = 16
    # Comma-separated files with a header row; the three sentences sit in
    # columns 1, 2 and 3.
    reader = TripletReader(
        triplet_data_dir,
        s1_col_idx=1,
        s2_col_idx=2,
        s3_col_idx=3,
        delimiter=',',
        quoting=csv.QUOTE_MINIMAL,
        has_header=True,
    )
    save_dir = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    epochs = 1

    def _make_loader(st_model, examples, shuffle):
        """Wrap examples in a SentencesDataset and a DataLoader."""
        dataset = SentencesDataset(examples=examples, model=st_model)
        loader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)
        return dataset, loader

    # BERT token embeddings followed by mean pooling into one sentence vector.
    bert = models.BERT('bert-base-uncased')
    pooling = models.Pooling(
        bert.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    model = SentenceTransformer(modules=[bert, pooling])

    logging.info("Read Triplet train dataset")
    train_set, train_loader = _make_loader(
        model, reader.get_examples('train.csv', 2000000), True)
    triplet_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    _, dev_loader = _make_loader(
        model, reader.get_examples('validation.csv', 10000), False)
    dev_evaluator = TripletEvaluator(dev_loader)

    # Warm up over 10% of the total number of training steps.
    warmup = int(len(train_set) * epochs / batch_size * 0.1)

    # Train the model
    model.fit(
        train_objectives=[(train_loader, triplet_loss)],
        evaluator=dev_evaluator,
        epochs=epochs,
        evaluation_steps=1000,
        warmup_steps=warmup,
        output_path=save_dir,
    )

    # Reload the stored model from disk and evaluate it on the held-out
    # triplets in test.csv.
    model = SentenceTransformer(save_dir)
    _, test_loader = _make_loader(model, reader.get_examples('test.csv'), False)
    model.evaluate(TripletEvaluator(test_loader))
def main():
    """Train a sentence-transformer response ranker and measure full retrieval.

    Steps, all driven by command-line arguments:

    1. Build the sentence encoder (a ready-made SentenceTransformer for two
       known model names, otherwise a transformer with three added special
       tokens plus mean pooling).
    2. Build training examples with ``CRRBenchmarkDataReader`` and fit the
       model with the requested loss, evaluating with a re-ranking evaluator
       built from 'valid.tsv'.
    3. Retrieve 10 candidates per test context with the trained model, log
       R@10 and R@1 to ``logging`` and wandb, and write per-instance recall
       to a tab-separated file in ``--output_dir``.

    Paths below ``--data_folder`` and ``--output_dir`` are joined with
    ``os.path.join`` so both folders work with or without a trailing slash.
    """
    import os  # function-scope import keeps this block self-contained

    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")
    parser.add_argument("--negative_sampler", default="random", type=str, required=False,
                        help="negative sampling procedure to use ['random', 'bm25', 'sentence_transformer']")
    parser.add_argument("--anserini_folder", default="", type=str, required=True,
                        help="Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sentence_bert_ns_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for sentenceBERT negative sampling.")

    parser.add_argument('--denoise_negatives', dest='denoise_negatives', action='store_true')
    parser.add_argument('--no-denoise_negatives', dest='denoise_negatives', action='store_false')
    parser.set_defaults(denoise_negatives=False)
    parser.add_argument("--num_ns_for_denoising", default=100, type=int, required=False,
                        help="Only used for --denoise_negatives. Number of total of samples to retrieve and get the bottom 10.")

    parser.add_argument("--generative_sampling_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for generating negative samples on the go.")

    parser.add_argument('--remove_cand_subsets', dest='remove_cand_subsets', action='store_true')
    parser.add_argument('--dont_remove_cand_subsets', dest='remove_cand_subsets', action='store_false')
    parser.set_defaults(remove_cand_subsets=True)

    # Which part of the context we use to sample negatives.
    parser.add_argument('--last_utterance_only', dest='last_utterance_only', action='store_true')
    parser.add_argument('--all_utterances', dest='last_utterance_only', action='store_false')
    parser.set_defaults(last_utterance_only=False)

    # External corpus to augment negative sampling
    parser.add_argument('--external_corpus', dest='use_external_corpus', action='store_true')
    parser.add_argument('--dont_use_external_corpus', dest='use_external_corpus', action='store_false')
    parser.set_defaults(use_external_corpus=False)

    # Training procedure
    parser.add_argument("--num_epochs", default=3, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")

    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False,
                        help="Bert model to use (default = bert-base-cased).")
    parser.add_argument("--loss", default='MultipleNegativesRankingLoss', type=str, required=False,
                        help="Loss function to use ['MultipleNegativesRankingLoss', 'TripletLoss', 'MarginMSELoss']")

    # Wandb project name
    parser.add_argument("--wandb_project", default='train_sentence_transformer', type=str, required=False,
                        help="name of the wandb project")
    parser.add_argument("--seed", default=42, type=int, required=False,
                        help="Random seed.")

    args = parser.parse_args()

    # Training is always performed; flip this to reuse an already trained model.
    eval_only = False

    # Reject an unsupported loss now, not after the (potentially slow)
    # construction of the training data, where it used to surface as an
    # unbound `train_loss`.
    supported_losses = ('MultipleNegativesRankingLoss', 'MarginMSELoss', 'TripletLoss',
                        'ContrastiveLoss', 'OnlineContrastiveLoss')
    if not eval_only and args.loss not in supported_losses:
        parser.error("unsupported --loss {!r}; expected one of {}".format(args.loss, list(supported_losses)))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    max_seq_length = 300
    if args.transformer_model in ('all-mpnet-base-v2', 'msmarco-bert-base-dot-v5'):
        model = SentenceTransformer(args.transformer_model)
        model.max_seq_length = max_seq_length
    else:
        word_embedding_model = models.Transformer(args.transformer_model, max_seq_length=max_seq_length)
        # Register the dialogue markers as special tokens and grow the
        # embedding matrix to match the enlarged vocabulary.
        tokens = ['[UTTERANCE_SEP]', '[TURN_SEP]', '[AUG]']
        word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    if eval_only:
        logging.info("Skipping training (eval_only=True)")
    else:
        logging.info("Creating train CRR dataset for {} using {}.".format(args.task, args.negative_sampler))
        crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))

        train_data = crr_reader.get_examples("train.tsv", args.negative_sampler,
                                             args.anserini_folder, args.sentence_bert_ns_model, args.loss,
                                             args.output_dir, True, False,
                                             args.denoise_negatives, args.num_ns_for_denoising,
                                             args.generative_sampling_model,
                                             args.remove_cand_subsets,
                                             args.last_utterance_only,
                                             args.use_external_corpus)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)

        # `args.loss` was validated above, so the attribute lookup is safe.
        # Only MultipleNegativesRankingLoss takes the dot-product similarity.
        loss_kwargs = {}
        if args.loss == 'MultipleNegativesRankingLoss':
            loss_kwargs['similarity_fct'] = util.dot_score
        train_loss = getattr(losses, args.loss)(model=model, **loss_kwargs)

    ns_description = args.negative_sampler
    if args.negative_sampler == 'sentence_transformer':
        ns_description += "_{}".format(args.sentence_bert_ns_model)
    if args.negative_sampler == 'generative':
        ns_description += "_{}".format(args.generative_sampling_model)

    # One location for the trained model, shared by training and retrieval.
    model_output_path = os.path.join(
        args.output_dir,
        "{}_{}_ns_{}_loss_{}".format(args.transformer_model, args.task, ns_description, args.loss))

    wandb.init(project=args.wandb_project)
    wandb.config.update(args)

    if not eval_only:
        # This is the eval data for the training, not the actual evaluation.
        logging.info("Getting eval data")
        examples_dev = crr_reader.get_examples('valid.tsv', args.negative_sampler,
                                               args.anserini_folder, args.sentence_bert_ns_model, args.loss,
                                               args.output_dir, eval_data=True)
        examples_dev = examples_dev[0:(11 * 500)]

        # Every run of 11 consecutive examples becomes one re-ranking sample:
        # the first 10 contribute their third text as negatives, the 11th
        # contributes the query and the positive.
        eval_samples = []
        docs = []
        for i, example in enumerate(examples_dev):
            if (i + 1) % 11 == 0:
                eval_samples.append({'query': example.texts[0],
                                     'positive': [example.texts[1]],
                                     'negative': docs})
                docs = []
            else:
                docs.append(example.texts[2])
        evaluator = RerankingEvaluator(eval_samples, write_csv=True, similarity_fct=util.dot_score)

        # Warm up over 10% of the total number of training steps.
        warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.train_batch_size * 0.1)
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Fitting sentenceBERT for {}".format(args.task))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=args.num_epochs,
                  evaluation_steps=100,
                  steps_per_epoch=10000,
                  warmup_steps=warmup_steps,
                  output_path=model_output_path)

    logging.info("Evaluating for full retrieval of responses to dialogue.")

    task_dir = os.path.join(args.data_folder, args.task)
    train = pd.read_csv(os.path.join(task_dir, "train.tsv"), sep="\t")
    test = pd.read_csv(os.path.join(task_dir, "test.tsv"), sep="\t")

    # Candidate pool is every train and test response; the sampler loads the
    # model that was just written to `model_output_path`.
    ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(
        list(train["response"].values) + list(test["response"].values), 10,
        os.path.join(task_dir, "test_sentenceBERTembeds"), -1,
        model_output_path,
        use_cache_for_embeddings=False)

    ns_info = [
        (ns_test_sentenceBERT,
         ["cand_sentenceBERT_{}".format(i) for i in range(10)]
         + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"],
         'sentenceBERT')
    ]
    examples = []
    examples_cols = ["context", "relevant_response"] + \
        [col for _, cols, _ in ns_info for col in cols]

    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_rows = []
    for row in tqdm(test.itertuples(index=False), total=len(test)):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]

        for sampler, _, _ in ns_info:
            candidates, _scores, had_relevant, rank_relevant, _ = sampler.sample(context, [relevant_response])
            # A distinct name for the candidates keeps the sampler variable
            # from being overwritten inside its own loop.
            instance.extend(candidates)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            r10 = 1 if had_relevant else 0
            r1 = 1 if rank_relevant == 0 else 0
            recall_rows.append([r10, r1])
        examples.append(instance)

    recall_df = pd.DataFrame(recall_rows, columns=["R@10", "R@1"])
    examples_df = pd.DataFrame(examples, columns=examples_cols)

    n_examples = examples_df.shape[0]
    retrieved_cols = [c for c in examples_df.columns if 'retrieved_relevant' in c]
    recall_at_10 = examples_df[retrieved_cols].sum() / n_examples
    logging.info("R@10: {}".format(recall_at_10))
    wandb.log({'R@10': recall_at_10.values[0]})

    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    recall_at_1 = examples_df[examples_df[rank_col] == 0].shape[0] / n_examples
    logging.info("R@1: {}".format(recall_at_1))
    wandb.log({'R@1': recall_at_1})

    # '/' in model names would otherwise be read as a sub-directory.
    recall_file = "recall_df_{}_{}_ns_{}_loss_{}.csv".format(
        args.transformer_model.replace("/", "-"), args.task,
        ns_description.replace("/", "-"), args.loss)
    recall_df.to_csv(os.path.join(args.output_dir, recall_file), index=False, sep="\t")