Example #1
def cli_main():
    # Multilingual pretrained model mentioned by the author in the issues: xlm-r-40langs-bert-base-nli-stsb-mean-tokens
    # Multilingual pretrained model for information retrieval tasks: distilbert-multilingual-nli-stsb-quora-ranking
    model = SentenceTransformer(
        'distilbert-multilingual-nli-stsb-quora-ranking')

    num_epochs = 10
    train_batch_size = 64
    model_save_path = os.path.join(
        cur_dir, 'output/training_MultipleNegativesRankingLoss-' +
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    os.makedirs(model_save_path, exist_ok=True)

    colab_dir = "/content/drive/My Drive/data/nlp"
    data_file = os.path.join(colab_dir, "LCCC-large.json")
    train_samples = get_data(data_file)

    # After reading the train_samples, we create a SentencesDataset and a DataLoader
    train_dataset = SentencesDataset(train_samples, model=model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)
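    # Each train sample for MultipleNegativesRankingLoss is a positive pair;
    # the other pairs in the batch serve as in-batch negatives. Illustrative
    # shape only (not the actual LCCC data):
    #   InputExample(texts=["How do I learn Python?",
    #                       "What is the best way to learn Python?"])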

    ###### Duplicate Questions Information Retrieval ######
    evaluators = []
    data_file = os.path.join(colab_dir, "STC.json")
    max_ir_num = 5000
    max_corpus_size = 100000
    ir_queries, ir_corpus, ir_relevant_docs = get_iq_corpus(
        data_file, max_ir_num, max_corpus_size)

    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        ir_queries, ir_corpus, ir_relevant_docs)
    evaluators.append(ir_evaluator)
    seq_evaluator = evaluation.SequentialEvaluator(
        evaluators, main_score_function=lambda scores: scores[-1])

    logging.info("Evaluate model without training")
    seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=seq_evaluator,
              epochs=num_epochs,
              warmup_steps=1000,
              output_path=model_save_path,
              output_path_ignore_not_empty=True)
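
get_data is not shown in this example. A minimal sketch of what it might return, assuming LCCC-large.json stores text pairs (the file layout and the helper body are assumptions, not part of the original snippet):

import json
from sentence_transformers import InputExample

def get_data(data_file):
    # Load [[text_a, text_b], ...] pairs and wrap them as InputExample pairs,
    # the format SentencesDataset and MultipleNegativesRankingLoss consume.
    with open(data_file, encoding='utf8') as fIn:
        pairs = json.load(fIn)
    return [InputExample(texts=[a, b]) for a, b in pairs]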
Example #2
def train():
    # We construct the SentenceTransformer bi-encoder from scratch
    word_embedding_model = models.Transformer(model_name, max_seq_length=350)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    model_save_path = 'output/training_ms-marco_bi-encoder-' + model_name.replace(
        "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Read our training file. qidpidtriples consists of triplets (qid, positive_pid, negative_pid)
    train_filepath = os.path.join(
        data_folder, 'msmarco-qidpidtriples.rnd-shuf.train-eval.tsv')

    # Create the evaluator that is called during training
    queries = read_queries()
    corpus = read_corpus()
    dev_queries, dev_corpus, dev_rel_docs = prepare_data_for_evaluation(
        queries, corpus)
    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries, dev_corpus, dev_rel_docs, name='ms-marco-train_eval')

    # For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
    train_dataset = TripletsDataset(model=model,
                                    queries=queries,
                                    corpus=corpus,
                                    triplets_file=train_filepath)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=False,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model=model)

    # print(next(iter(train_dataloader)))
    # return

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=ir_evaluator,
              epochs=1,
              warmup_steps=1000,
              output_path=model_save_path,
              evaluation_steps=5000,
              use_amp=True)
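
read_queries, read_corpus, and prepare_data_for_evaluation are not shown here. A plausible sketch of the two readers, assuming the standard MS MARCO "id<TAB>text" TSV layout (the file names are assumptions):

def read_queries():
    # Map qid -> query text.
    queries = {}
    with open(os.path.join(data_folder, 'queries.train.tsv'), encoding='utf8') as fIn:
        for line in fIn:
            qid, text = line.strip().split("\t")
            queries[qid] = text
    return queries

def read_corpus():
    # Map pid -> passage text.
    corpus = {}
    with open(os.path.join(data_folder, 'collection.tsv'), encoding='utf8') as fIn:
        for line in fIn:
            pid, passage = line.strip().split("\t")
            corpus[pid] = passage
    return corpus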
Example #3
        if qid in ir_needed_qids:
            ir_corpus[qid] = question
        else:
            distraction_questions[qid] = question

# Now, also add some irrelevant questions to fill our corpus
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)

for qid in other_qid_list[0:max(0, max_corpus_size - len(ir_corpus))]:
    ir_corpus[qid] = distraction_questions[qid]

# Given queries, a corpus, and a mapping of relevant documents, the InformationRetrievalEvaluator
# computes different IR metrics. For our use case, MRR@k and Accuracy@k are relevant.
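# Minimal illustration of the dict shapes the evaluator expects (toy data,
# not part of this script):
toy_queries = {"q1": "How can I learn Python?"}               # qid -> query text
toy_corpus = {"d1": "What is the best way to learn Python?",
              "d2": "Paris is the capital of France."}        # docid -> passage text
toy_relevant_docs = {"q1": {"d1"}}                            # qid -> set of relevant docids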
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs)

evaluators.append(ir_evaluator)

# Create a SequentialEvaluator. It runs all evaluators in the list in sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1]).
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators, main_score_function=lambda scores: scores[-1])

logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss,
                             train_loss_MultipleNegativesRankingLoss),
                            (train_dataloader_ConstrativeLoss,
                             train_loss_ConstrativeLoss)],
          evaluator=seq_evaluator,
          epochs=num_epochs,
          warmup_steps=1000,
          output_path=model_save_path)
Example #4
        if qid not in dev_rel_docs:
            dev_rel_docs[qid] = set()

        dev_rel_docs[qid].add(pos_id)

        if num_negatives[qid] < num_max_dev_negatives:
            dev_corpus[neg_id] = corpus[neg_id]
            num_negatives[qid] += 1

logging.info("Dev queries: {}".format(len(dev_queries)))
logging.info("Dev Corpus: {}".format(len(dev_corpus)))


# Create the evaluator that is called during training
ir_evaluator = evaluation.InformationRetrievalEvaluator(dev_queries, dev_corpus, dev_rel_docs, name='ms-marco-train_eval')

# Read our training file. qidpidtriples consists of triplets (qid, positive_pid, negative_pid)
train_filepath = os.path.join(data_folder, 'msmarco-qidpidtriples.rnd-shuf.train.tsv.gz')
if not os.path.exists(train_filepath):
    logging.info("Download "+os.path.basename(train_filepath))
    util.http_get('https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train.tsv.gz', train_filepath)


# We load the qidpidtriples file on-the-fly using a custom IterableDataset class
class TripletsDataset(IterableDataset):
    def __init__(self, model, queries, corpus, triplets_file):
        self.model = model
        self.queries = queries
        self.corpus = corpus
        self.triplets_file = triplets_file
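
The class is truncated above. A sketch of the iteration logic it presumably needs, assuming each gzipped TSV row is "qid<TAB>pos_pid<TAB>neg_pid" as the qidpidtriples comment describes (gzip and InputExample imports assumed):

    def __iter__(self):
        # Stream triplets from disk and yield (query, positive, negative) texts.
        with gzip.open(self.triplets_file, 'rt') as fIn:
            for line in fIn:
                qid, pos_id, neg_id = line.strip().split()
                yield InputExample(texts=[self.queries[qid],
                                          self.corpus[pos_id],
                                          self.corpus[neg_id]])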
Example #5
                dev_queries[qid] = qry_idt + query

    with gzip.open('../data/collection-rnd.tsv.gz', 'rt') as fIn:
        for line in fIn:
            pid, passage = line.strip().split("\t")
            passage = doc_idt + passage

            if pid in needed_pids or dev_corpus_max_size <= 0 or len(
                    dev_corpus) <= dev_corpus_max_size:
                dev_corpus[pid] = passage

    logging.info("Train size: {}".format(len(train_queries)))
    logging.info("Dev queries: {}".format(len(dev_queries)))
    logging.info("Dev Corpus: {}".format(len(dev_corpus)))

    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries, dev_corpus, dev_rel_docs)

    train_dataset = QueriesDataset(train_queries, corpus, model=model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = NTXentLossTriplet(model, scale=20)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=ir_evaluator,
              epochs=num_epoch,
              warmup_steps=1000,
              output_path=model_save_path,
              evaluation_steps=math.ceil(len(train_dataloader) / 3) + 1,
              use_amp=True)
Example #6
            dev_rel_docs[qid] = set()
        dev_rel_docs[qid].add(pid)

        needed_pids.add(pid)
        needed_qids.add(qid)

# Read passages
with open(collection_filepath, encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")

        if pid in needed_pids or corpus_max_size <= 0 or len(
                corpus) <= corpus_max_size:
            corpus[pid] = passage.strip()

## Run evaluator
logging.info("Queries: {}".format(len(dev_queries)))
logging.info("Corpus: {}".format(len(corpus)))

ir_evaluator = evaluation.InformationRetrievalEvaluator(
    dev_queries,
    corpus,
    dev_rel_docs,
    show_progress_bar=True,
    corpus_chunk_size=100000,
    precision_recall_at_k=[10, 100],
    name="msmarco dev")

ir_evaluator(model)
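
ir_evaluator(model) returns the evaluator's main score and, when an output path is given, also writes a per-metric CSV there. A usage sketch (the output directory is an assumption):

# score = ir_evaluator(model, output_path="output/")
# logging.info("Main IR score: {:.4f}".format(score))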
Example #7
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    output_path = output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    os.makedirs(output_path, exist_ok=True)

    # You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/'

    ### Create a torch.DataLoader that passes training batch instances to our model

    logging.info("Loading training dataset")
    train_set = read_dataset(train_data_path)

    # Load pretrained model
    word_embedding_model = models.Transformer(model_name)
    # tokenizer_args={"additional_special_tokens": ['<e>', '</e>']})

    # word_embedding_model.auto_model.resize_token_embeddings(
    #     len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    # pooling_mode_mean_mark_tokens=True)
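
    # For intuition: mean pooling averages the token embeddings while ignoring
    # padding. A sketch of the computation (not the library internals):
    #   mask = attention_mask.unsqueeze(-1).float()
    #   embedding = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)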

    # dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=2048, activation_function=nn.Tanh())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.max_seq_length = 16

    logging.info("Read concept normalization training dataset")

    #### Try different sample sizes ####

    train_data_sampler = SentenceLabelDataset(
        examples=train_set, samples_per_label=samples_per_label)

    ##### Try whether to shuffle ##### By default, the sampler output should not be reshuffled every epoch

    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=train_batch_size,
                                  drop_last=True)

    ### Triplet losses ####################
    ### There are 4 triplet loss variants:
    ### - BatchHardTripletLoss
    ### - BatchHardSoftMarginTripletLoss
    ### - BatchSemiHardTripletLoss
    ### - BatchAllTripletLoss
    #######################################
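    # The batch triplet losses expect single sentences with integer class labels;
    # triplets are then mined inside each batch from those labels. Illustrative
    # shape only (not this script's data):
    #   InputExample(texts=["some mention text"], label=42)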

    # train_loss = losses.BatchAllTripletLoss(model=model)
    #train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
    train_loss = losses.BatchHardSoftMarginTripletLoss(model)
    #train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

    # evaluator = []

    logging.info("Read concept normalization val dataset")

    ir_queries = read.read_from_json(
        os.path.join(evaluator_path, "dev_queries"))
    ir_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    ir_relevant_docs = read.read_from_json(
        os.path.join(evaluator_path, "dev_relevant_docs"))
    ir_evaluator_n2c2_dev = evaluation.InformationRetrievalEvaluator(
        ir_queries,
        ir_corpus,
        ir_relevant_docs,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)

    # evaluator.append(ir_evaluator_n2c2_dev)
    # Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
    # We optimize the model with respect to the score from the last evaluator (scores[-1])
    # seq_evaluator = evaluation.SequentialEvaluator(evaluator, main_score_function=lambda scores: scores[1])

    logging.info("Performance before fine-tuning:")
    ir_evaluator_n2c2_dev(model)

    # warmup_steps = int(
    #     len(train_dataset) * num_epochs / train_batch_size * 0.1
    # )  # 10% of train data
    warmup_steps = 0

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        # evaluator = None,
        evaluator=ir_evaluator_n2c2_dev,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
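
A hypothetical invocation of model_training (every path and hyperparameter below is a placeholder, not from the original script):

model_training(
    train_data_path="data/train.json",
    evaluator_path="data/eval",
    model_name="bert-base-uncased",
    output_path="output/concept_norm_",
    train_batch_size=32,
    num_epochs=3,
    samples_per_label=2,
)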