def cli_main():
    """Fine-tune a multilingual bi-encoder with MultipleNegativesRankingLoss.

    Trains on LCCC dialogue pairs and evaluates retrieval quality on STC
    via an InformationRetrievalEvaluator, saving checkpoints under a
    timestamped output directory.
    """
    # Multilingual pretrained options mentioned upstream:
    #   xlm-r-40langs-bert-base-nli-stsb-mean-tokens (general-purpose)
    #   distilbert-multilingual-nli-stsb-quora-ranking (tuned for IR)
    st_model = SentenceTransformer(
        'distilbert-multilingual-nli-stsb-quora-ranking')

    num_epochs = 10
    train_batch_size = 64

    # Timestamped checkpoint directory, created up front.
    save_path = os.path.join(
        cur_dir,
        'output/training_MultipleNegativesRankingLoss-'
        + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    os.makedirs(save_path, exist_ok=True)

    colab_dir = "/content/drive/My Drive/data/nlp"

    # Training data: LCCC conversation pairs wrapped in a SentencesDataset.
    lccc_path = os.path.join(colab_dir, "LCCC-large.json")
    samples = get_data(lccc_path)
    dataset = SentencesDataset(samples, model=st_model)
    loader = DataLoader(dataset, shuffle=True, batch_size=train_batch_size)
    loss = losses.MultipleNegativesRankingLoss(st_model)

    # Evaluation: duplicate-question information retrieval on STC.
    stc_path = os.path.join(colab_dir, "STC.json")
    max_ir_num = 5000
    max_corpus_size = 100000
    ir_queries, ir_corpus, ir_relevant_docs = get_iq_corpus(
        stc_path, max_ir_num, max_corpus_size)

    evaluators = [evaluation.InformationRetrievalEvaluator(
        ir_queries, ir_corpus, ir_relevant_docs)]

    # SequentialEvaluator runs all evaluators; the last score drives
    # checkpoint selection (scores[-1]).
    seq_evaluator = evaluation.SequentialEvaluator(
        evaluators, main_score_function=lambda scores: scores[-1])

    # Baseline numbers before any fine-tuning.
    logging.info("Evaluate model without training")
    seq_evaluator(st_model, epoch=0, steps=0, output_path=save_path)

    # Fine-tune.
    st_model.fit(train_objectives=[(loader, loss)],
                 evaluator=seq_evaluator,
                 epochs=num_epochs,
                 warmup_steps=1000,
                 output_path=save_path,
                 output_path_ignore_not_empty=True)
def train():
    """Train an MS MARCO bi-encoder from a plain transformer checkpoint.

    Builds a SentenceTransformer (transformer + mean pooling) from scratch,
    streams (qid, positive_pid, negative_pid) triplets from disk, and trains
    with MultipleNegativesRankingLoss, evaluating retrieval on a dev split.
    """
    # Construct the bi-encoder: transformer encoder followed by pooling.
    encoder = models.Transformer(model_name, max_seq_length=350)
    pooler = models.Pooling(encoder.get_word_embedding_dimension())
    bi_encoder = SentenceTransformer(modules=[encoder, pooler])

    # Checkpoint path encodes model name and start time.
    save_path = ('output/training_ms-marco_bi-encoder-'
                 + model_name.replace("/", "-") + '-'
                 + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    # Triplets file: each row is (qid, positive_pid, negative_pid).
    triplets_path = os.path.join(
        data_folder, 'msmarco-qidpidtriples.rnd-shuf.train-eval.tsv')

    # Dev-time retrieval evaluator built from held-out queries.
    queries = read_queries()
    corpus = read_corpus()
    dev_queries, dev_corpus, dev_rel_docs = prepare_data_for_evaluation(
        queries, corpus)
    dev_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries, dev_corpus, dev_rel_docs, name='ms-marco-train_eval')

    # Dataset streams triplets from disk; shuffle=False presumably because
    # an IterableDataset cannot be index-shuffled by the DataLoader — verify.
    dataset = TripletsDataset(model=bi_encoder,
                              queries=queries,
                              corpus=corpus,
                              triplets_file=triplets_path)
    loader = DataLoader(dataset, shuffle=False, batch_size=train_batch_size)
    loss = losses.MultipleNegativesRankingLoss(model=bi_encoder)

    # Fine-tune with mixed precision; evaluate every 5000 steps.
    bi_encoder.fit(train_objectives=[(loader, loss)],
                   evaluator=dev_evaluator,
                   epochs=1,
                   warmup_steps=1000,
                   output_path=save_path,
                   evaluation_steps=5000,
                   use_amp=True)
# NOTE(review): fragment — the enclosing function and loop header are not
# visible in this chunk, and the final model.fit(...) call is truncated
# mid-argument-list.
# Route each question either into the IR corpus (if its qid is needed for
# evaluation) or into a pool of distractors.
if qid in ir_needed_qids:
    ir_corpus[qid] = question
else:
    distraction_questions[qid] = question
# Now, also add some irrelevant (distractor) questions to fill the corpus
# up to max_corpus_size, chosen at random.
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
for qid in other_qid_list[0:max(0, max_corpus_size - len(ir_corpus))]:
    ir_corpus[qid] = distraction_questions[qid]

# Given queries, a corpus and a mapping with relevant documents, the
# InformationRetrievalEvaluator computes different IR metrics. For this
# use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs)
evaluators.append(ir_evaluator)

# Create a SequentialEvaluator: it runs all evaluators in order, and the
# model is optimized with respect to the score of the last one (scores[-1]).
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators, main_score_function=lambda scores: scores[-1])

# Baseline evaluation before any fine-tuning.
logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model with two objectives.
# NOTE(review): "ConstrativeLoss" looks like a misspelling of
# "ContrastiveLoss" in the variable names defined elsewhere — confirm.
# The call below is cut off in this chunk.
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss,
                             train_loss_MultipleNegativesRankingLoss),
                            (train_dataloader_ConstrativeLoss,
# NOTE(review): fragment — the enclosing function and the loop reading
# (qid, pos_id, neg_id) rows are not visible; the TripletsDataset class at
# the end continues past this chunk.
# Record the positive passage as relevant for this query.
if qid not in dev_rel_docs:
    dev_rel_docs[qid] = set()
dev_rel_docs[qid].add(pos_id)
# Cap the number of negative passages added per query.
if num_negatives[qid] < num_max_dev_negatives:
    dev_corpus[neg_id] = corpus[neg_id]
    num_negatives[qid] += 1

logging.info("Dev queries: {}".format(len(dev_queries)))
logging.info("Dev Corpus: {}".format(len(dev_corpus)))

# Create the evaluator that is called during training.
ir_evaluator = evaluation.InformationRetrievalEvaluator(dev_queries,
                                                        dev_corpus,
                                                        dev_rel_docs,
                                                        name='ms-marco-train_eval')

# Training file: qidpidtriples rows of (qid, positive_pid, negative_pid).
# Downloaded on first use.
train_filepath = os.path.join(data_folder,
                              'msmarco-qidpidtriples.rnd-shuf.train.tsv.gz')
if not os.path.exists(train_filepath):
    logging.info("Download "+os.path.basename(train_filepath))
    util.http_get('https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train.tsv.gz',
                  train_filepath)

# We load the qidpidtriples file on-the-fly via a custom IterableDataset.
# NOTE(review): class definition continues beyond this chunk (__iter__ etc.
# not visible here).
class TripletsDataset(IterableDataset):
    def __init__(self, model, queries, corpus, triplets_file):
        # model: bi-encoder used to build InputExamples downstream — confirm.
        self.model = model
        self.queries = queries          # qid -> query text
        self.corpus = corpus            # pid -> passage text
        self.triplets_file = triplets_file
# NOTE(review): fragment — the enclosing function, the loop that defines
# qid/query, and the definitions of qry_idt/doc_idt prefixes are not
# visible in this chunk.
dev_queries[qid] = qry_idt + query

# Read the passage collection, prefixing each passage with the document
# identifier token. Keep a passage if it is needed for evaluation, or to
# pad the dev corpus up to dev_corpus_max_size (<= 0 disables the cap).
# NOTE(review): `len(dev_corpus) <= dev_corpus_max_size` lets the corpus
# grow one past the cap, and padding picks passages in file order rather
# than randomly — confirm whether that is intended.
with gzip.open('../data/collection-rnd.tsv.gz', 'rt') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        passage = doc_idt + passage
        if pid in needed_pids or dev_corpus_max_size <= 0 or len(
                dev_corpus) <= dev_corpus_max_size:
            dev_corpus[pid] = passage

logging.info("Train size: {}".format(len(train_queries)))
logging.info("Dev queries: {}".format(len(dev_queries)))
logging.info("Dev Corpus: {}".format(len(dev_corpus)))

# Retrieval evaluator over the dev split.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    dev_queries, dev_corpus, dev_rel_docs)

# Training objects: queries dataset, shuffled loader, NT-Xent triplet loss.
train_dataset = QueriesDataset(train_queries, corpus, model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = NTXentLossTriplet(model, scale=20)

# Train the model; evaluate roughly three times per epoch.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=ir_evaluator,
          epochs=num_epoch,
          warmup_steps=1000,
          output_path=model_save_path,
          evaluation_steps=math.ceil(len(train_dataloader) / 3) + 1,
          use_amp=True)
# NOTE(review): fragment — the enclosing loop header that defines qid/pid
# (and the preceding `if qid not in dev_rel_docs:` guard, if any) is not
# visible in this chunk.
dev_rel_docs[qid] = set()
dev_rel_docs[qid].add(pid)
needed_pids.add(pid)
needed_qids.add(qid)

# Read passages; keep those needed for evaluation, plus filler passages up
# to corpus_max_size (<= 0 disables the cap).
# NOTE(review): `passage = passage` below is a no-op — probably a leftover
# from a prefixing step (cf. the doc_idt variant elsewhere); confirm.
with open(collection_filepath, encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        passage = passage
        if pid in needed_pids or corpus_max_size <= 0 or len(
                corpus) <= corpus_max_size:
            corpus[pid] = passage.strip()

## Run evaluator (stand-alone evaluation, not inside model.fit)
logging.info("Queries: {}".format(len(dev_queries)))
logging.info("Corpus: {}".format(len(corpus)))

ir_evaluator = evaluation.InformationRetrievalEvaluator(
    dev_queries,
    corpus,
    dev_rel_docs,
    show_progress_bar=True,
    corpus_chunk_size=100000,
    precision_recall_at_k=[10, 100],
    name="msmarco dev")

ir_evaluator(model)
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):
    """Fine-tune a bi-encoder for concept normalization with a batch-hard
    soft-margin triplet loss.

    Builds a SentenceTransformer (transformer + mean pooling) from
    `model_name`, samples label-balanced batches from the training set, and
    evaluates retrieval on the dev split before and during training.
    Checkpoints go to `output_path` suffixed with a timestamp.
    """
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    # Timestamped output directory.
    output_path = (output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    os.makedirs(output_path, exist_ok=True)

    # `model_name` may be any huggingface/transformers checkpoint, e.g.
    # bert-base-uncased, roberta-base, xlm-roberta-base,
    # microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext.
    logging.info("Loading training dataset")
    labeled_examples = read_dataset(train_data_path)

    # Encoder + mean pooling (CLS/max pooling disabled) to get one fixed
    # sized sentence vector.
    encoder = models.Transformer(model_name)
    pooler = models.Pooling(
        encoder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    bi_encoder = SentenceTransformer(modules=[encoder, pooler])
    # Concept mentions are short; truncate aggressively.
    bi_encoder.max_seq_length = 16

    logging.info("Read concept normalization training dataset")
    # Label-balanced sampling: `samples_per_label` examples of each label
    # per pass; drop_last keeps batches full for the batch-hard loss.
    sampler = SentenceLabelDataset(
        examples=labeled_examples, samples_per_label=samples_per_label)
    loader = DataLoader(sampler,
                        batch_size=train_batch_size,
                        drop_last=True)

    # Triplet-loss variants available: BatchHardTripletLoss,
    # BatchHardSoftMarginTripletLoss, BatchSemiHardTripletLoss,
    # BatchAllTripletLoss. Soft-margin batch-hard is used here.
    loss_fn = losses.BatchHardSoftMarginTripletLoss(bi_encoder)

    logging.info("Read concept normalization val dataset")
    dev_queries = read.read_from_json(
        os.path.join(evaluator_path, "dev_queries"))
    dev_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    dev_relevant = read.read_from_json(
        os.path.join(evaluator_path, "dev_relevant_docs"))
    dev_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries,
        dev_corpus,
        dev_relevant,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)

    # Baseline retrieval numbers before fine-tuning.
    logging.info("Performance before fine-tuning:")
    dev_evaluator(bi_encoder)

    # No warmup (a 10%-of-train-data schedule was tried and disabled).
    warmup_steps = 0

    # Fine-tune.
    bi_encoder.fit(
        train_objectives=[(loader, loss_fn)],
        evaluator=dev_evaluator,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )