Exemple #1
0
def get_loss(loss_type, model):
    """Build the loss object selected by ``loss_type`` for ``model``.

    Each supported name is the name of an attribute on ``losses``; that
    attribute is called with ``model=model`` and the result is returned.

    :param loss_type: name of the loss to build (compared with ``==``)
    :param model: forwarded as the ``model`` keyword argument of the loss
    :return: whatever the selected ``losses.<name>(model=model)`` call returns
    :raises ValueError: if ``loss_type`` equals none of the supported names
    """
    supported_names = (
        'BatchAllTripletLoss',
        'BatchHardSoftMarginTripletLoss',
        'BatchHardTripletLoss',
        'BatchSemiHardTripletLoss',
        'ContrastiveLoss',
        'CosineSimilarityLoss',
        'MegaBatchMarginLoss',
        'MultipleNegativesRankingLoss',
        'OnlineContrastiveLoss',
    )

    # Walk the names in order and look the attribute up only on a match, so
    # nothing on ``losses`` is touched for an unsupported ``loss_type``.
    for candidate in supported_names:
        if loss_type == candidate:
            loss_factory = getattr(losses, candidate)
            return loss_factory(model=model)

    raise ValueError('Invalid loss type')
Exemple #2
0
# Create data loader and loss for MultipleNegativesRankingLoss
# Both objectives in this section share `model` and `train_batch_size`, and
# both loaders shuffle their samples.
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
    batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(
    model)

# Create data loader and loss for OnlineContrastiveLoss
# NOTE: "ConstrativeLoss" in the variable names below is a misspelling of
# "ContrastiveLoss"; the names are left as-is because code outside this
# section may refer to them. `distance_metric` and `margin` are defined
# outside this section.
train_dataset_ConstrativeLoss = SentencesDataset(train_samples_ConstrativeLoss,
                                                 model=model)
train_dataloader_ConstrativeLoss = DataLoader(train_dataset_ConstrativeLoss,
                                              shuffle=True,
                                              batch_size=train_batch_size)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(
    model=model, distance_metric=distance_metric, margin=margin)

################### Development  Evaluators ##################
# We add 3 evaluators, that evaluate the model on Duplicate Questions pair classification,
# Duplicate Questions Mining, and Duplicate Questions Information Retrieval
evaluators = []

###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
# NOTE(review): the body of this `with` never reads `fIn`, and the three
# dev_* lists above are not filled here; it looks like code from a different
# script was pasted in. Confirm the intended body before relying on it.
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"),
          encoding='utf8') as fIn:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()

    # Each gold item is indexed as (document number, label).
    for item in gold:
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except Exception:
            # Best effort: skip items whose document cannot be looked up or
            # wrapped. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
print("finished", len(examples))

#%%
from torch.utils.data import DataLoader
# Wrap the collected examples for `ranker` and batch them (shuffled, 16 per batch).
train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
# NOTE(review): `train_loss` is built here but is not passed to `ranker.fit`
# below. Confirm whether `fit` is expected to receive it for this ranker type.
train_loss = losses.OnlineContrastiveLoss(model=ranker)

ranker.fit(train_dataloader=train_dl,
           epochs=20,
           output_path="ranker/constrastive_loss/",
           save_best_model=True)

# Pickle the trained ranker next to the fit output. The `with` block closes
# the file even if pickling fails, instead of leaking the handle.
with open("ranker/constrastive_loss/ranker_contrastive_loss_20_epochs.pkl",
          "wb") as ranker_file:
    pickle.dump(ranker, ranker_file)

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
Exemple #4
0
    def train(self, train_df, eval_df):
        """
        Fine-tune ``self.model`` on labelled text pairs and select a threshold.

        Both dataframes are validated before any work is done. When
        ``self.args.do_lower_case`` is set, the 'text_a' and 'text_b' columns
        of BOTH dataframes are lower-cased in place (the caller's frames are
        modified). After fitting, the evaluator's CSV in
        ``self.args.best_model_dir`` is read, sorted by ``self.score_type``
        (descending), and the ``self.threshold_type`` value of the best row is
        stored in ``self.threshold``.

        :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :return: the selected threshold (also stored in ``self.threshold``)
        :raises KeyError: if either dataframe lacks a required column
        """
        required_columns = ("text_a", "text_b", "labels")

        # Validate both inputs up front so a bad eval_df is reported before
        # train_df is modified. The evaluation check inspects eval_df itself
        # for every required column.
        if not all(column in train_df.columns for column in required_columns):
            raise KeyError(
                'Training data processing - Required columns not found!')
        if not all(column in eval_df.columns for column in required_columns):
            raise KeyError(
                'Evaluation data processing - Required columns not found!')

        # format training data
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

        # The row position (as a string) is passed as the first positional
        # argument of each InputExample.
        train_examples = [
            InputExample(str(i), texts=[text_a, text_b], label=label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(int),
                ))
        ]

        # format evaluation data
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

        evaluator = evaluation.BinaryClassificationEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"].astype(int)),
            batch_size=self.args.eval_batch_size)

        # Define train dataset, the dataloader and the train loss
        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        # Any loss_func other than 'MultipleNegativesRankingLoss' (including
        # None) falls back to OnlineContrastiveLoss with cosine distance.
        if self.args.loss_func == 'MultipleNegativesRankingLoss':
            train_loss = losses.MultipleNegativesRankingLoss(self.model)
        else:
            distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
            train_loss = losses.OnlineContrastiveLoss(
                model=self.model,
                distance_metric=distance_metric,
                margin=self.args.margin)

        # Tune the model
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.args.num_train_epochs,
            warmup_steps=self.args.warmup_steps,
            optimizer_params={'lr': self.args.learning_rate},
            weight_decay=self.args.weight_decay,
            evaluator=evaluator,
            evaluation_steps=self.args.evaluate_during_training_steps,
            max_grad_norm=self.args.max_grad_norm,
            output_path=self.args.best_model_dir,
            show_progress_bar=self.args.show_progress_bar)

        # Read the evaluator's CSV back and keep the threshold from the row
        # with the highest self.score_type.
        evaluation_file = os.path.join(self.args.best_model_dir,
                                       evaluator.csv_file)
        eval_results_df = pd.read_csv(evaluation_file)
        eval_results_df.sort_values(self.score_type,
                                    inplace=True,
                                    ascending=False,
                                    ignore_index=True)
        self.threshold = eval_results_df.loc[0, self.threshold_type]
        print(
            f'Set model threshold to {self.threshold} acquiring a {self.score_type} of {eval_results_df.loc[0, self.score_type]}'
        )

        return self.threshold
def _build_arg_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")
    parser.add_argument("--negative_sampler", default="random", type=str, required=False,
                        help="negative sampling procedure to use ['random', 'bm25', 'sentence_transformer']")
    parser.add_argument("--anserini_folder", default="", type=str, required=True,
                        help="Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sentence_bert_ns_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for sentenceBERT negative sampling.")

    parser.add_argument('--denoise_negatives', dest='denoise_negatives', action='store_true')
    parser.add_argument('--no-denoise_negatives', dest='denoise_negatives', action='store_false')
    parser.set_defaults(denoise_negatives=False)
    parser.add_argument("--num_ns_for_denoising", default=100, type=int, required=False,
                        help="Only used for --denoise_negatives. Number of total of samples to retrieve and get the bottom 10.")

    parser.add_argument("--generative_sampling_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for generating negative samples on the go.")

    parser.add_argument('--remove_cand_subsets', dest='remove_cand_subsets', action='store_true')
    parser.add_argument('--dont_remove_cand_subsets', dest='remove_cand_subsets', action='store_false')
    parser.set_defaults(remove_cand_subsets=True)

    # which part of the context we use to sample negatives.
    parser.add_argument('--last_utterance_only', dest='last_utterance_only', action='store_true')
    parser.add_argument('--all_utterances', dest='last_utterance_only', action='store_false')
    parser.set_defaults(last_utterance_only=False)

    # External corpus to augment negative sampling
    parser.add_argument('--external_corpus', dest='use_external_corpus', action='store_true')
    parser.add_argument('--dont_use_external_corpus', dest='use_external_corpus', action='store_false')
    parser.set_defaults(use_external_corpus=False)

    # Training procedure
    parser.add_argument("--num_epochs", default=3, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")
    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False,
                        help="Bert model to use (default = bert-base-cased).")
    parser.add_argument("--loss", default='MultipleNegativesRankingLoss', type=str, required=False,
                        help="Loss function to use ['MultipleNegativesRankingLoss', 'TripletLoss', 'MarginMSELoss']")

    # Wandb project name
    parser.add_argument("--wandb_project", default='train_sentence_transformer', type=str, required=False,
                        help="name of the wandb project")
    parser.add_argument("--seed", default=42, type=int, required=False,
                        help="Random seed.")
    return parser


def _build_model(transformer_model, max_seq_length):
    """Return the SentenceTransformer to train for ``transformer_model``.

    'all-mpnet-base-v2' and 'msmarco-bert-base-dot-v5' are loaded directly by
    name. Any other name is wrapped as a Transformer module followed by mean
    pooling, with three extra special tokens added to its tokenizer.
    """
    if transformer_model in ('all-mpnet-base-v2', 'msmarco-bert-base-dot-v5'):
        model = SentenceTransformer(transformer_model)
        model.max_seq_length = max_seq_length
        return model

    word_embedding_model = models.Transformer(transformer_model, max_seq_length=max_seq_length)
    tokens = ['[UTTERANCE_SEP]', '[TURN_SEP]', '[AUG]']
    word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    # The embedding matrix must grow to cover the tokens added above.
    word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])


def _build_train_loss(loss_name, model):
    """Return the training loss named ``loss_name``; raise ValueError if unknown."""
    if loss_name == 'MultipleNegativesRankingLoss':
        return losses.MultipleNegativesRankingLoss(model=model, similarity_fct=util.dot_score)

    model_only_losses = ('MarginMSELoss', 'TripletLoss', 'ContrastiveLoss', 'OnlineContrastiveLoss')
    if loss_name in model_only_losses:
        return getattr(losses, loss_name)(model=model)

    # Fail here with a clear message instead of leaving the loss unbound and
    # hitting a NameError when training starts.
    raise ValueError("Unsupported loss '{}'; expected one of {}".format(
        loss_name, ('MultipleNegativesRankingLoss',) + model_only_losses))


def main():
    """Train a sentence-transformer response ranker, then measure its retrieval recall.

    Steps: parse the command line, seed torch, build the model, train it on
    ``train.tsv`` with the chosen loss (using ``valid.tsv`` for the in-training
    reranking evaluator), then retrieve 10 candidates per ``test.tsv`` row with
    the trained model. R@10 and R@1 are logged (also to wandb) and per-row
    recall flags are written as a tab-separated CSV under ``--output_dir``.

    :raises ValueError: if ``--loss`` names a loss this script does not handle
    """
    # Local import so this function does not depend on the file's import block.
    import os

    args = _build_arg_parser().parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    model = _build_model(args.transformer_model, max_seq_length=300)

    eval_only = False
    if eval_only:
        logging.info("Skipping training (eval_only=True)")
    else:
        logging.info("Creating train CRR dataset for {} using {}.".format(args.task, args.negative_sampler))
        crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))
        train_data = crr_reader.get_examples("train.tsv", args.negative_sampler,
                                    args.anserini_folder, args.sentence_bert_ns_model, args.loss, args.output_dir,
                                    True, False,
                                    args.denoise_negatives, args.num_ns_for_denoising,
                                    args.generative_sampling_model,
                                    args.remove_cand_subsets,
                                    args.last_utterance_only,
                                    args.use_external_corpus)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)

    train_loss = _build_train_loss(args.loss, model)

    ns_description = args.negative_sampler
    if args.negative_sampler == 'sentence_transformer':
        ns_description += "_{}".format(args.sentence_bert_ns_model)

    if args.negative_sampler == 'generative':
        ns_description += "_{}".format(args.generative_sampling_model)

    # The model is saved here by fit() and loaded from here for the final
    # retrieval evaluation, so the path is computed once.
    model_output_path = args.output_dir+"{}_{}_ns_{}_loss_{}".format(
        args.transformer_model, args.task, ns_description, args.loss)

    wandb.init(project=args.wandb_project)
    wandb.config.update(args)

    if not eval_only:  # this is the eval data for the training, not the actual evaluation
        logging.info("Getting eval data")
        examples_dev = crr_reader.get_examples('valid.tsv',
            args.negative_sampler, args.anserini_folder, args.sentence_bert_ns_model, args.loss, args.output_dir, eval_data=True)
        examples_dev = examples_dev[0:(11*500)]
        # Examples are consumed in groups of 11: the first 10 contribute
        # texts[2] as negatives, the 11th supplies the query (texts[0]) and
        # the positive (texts[1]).
        # NOTE(review): assumes get_examples(eval_data=True) emits examples in
        # that grouping - confirm against CRRBenchmarkDataReader.
        eval_samples = []
        docs = []
        for i, example in enumerate(examples_dev):
            if (i+1) % 11 == 0:
                eval_samples.append({'query': example.texts[0],
                                     'positive': [example.texts[1]],
                                     'negative': docs})
                docs = []
            else:
                docs.append(example.texts[2])
        evaluator = RerankingEvaluator(eval_samples, write_csv=True, similarity_fct=util.dot_score)
        warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.train_batch_size*0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Fitting sentenceBERT for {}".format(args.task))

        model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=args.num_epochs,
            evaluation_steps=100,
            steps_per_epoch=10000,
            warmup_steps=warmup_steps,
            output_path=model_output_path)

    logging.info("Evaluating for full retrieval of responses to dialogue.")

    # os.path.join yields the same paths as plain concatenation when
    # --data_folder ends with a separator, and also works when it does not.
    task_dir = os.path.join(args.data_folder, args.task)
    train = pd.read_csv(os.path.join(task_dir, "train.tsv"), sep="\t")
    test = pd.read_csv(os.path.join(task_dir, "test.tsv"), sep="\t")

    ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(list(train["response"].values)+list(test["response"].values), 10,
                   os.path.join(task_dir, "test_sentenceBERTembeds"), -1,
                   model_output_path,
                   use_cache_for_embeddings=False)

    # Each entry: (sampler, output column names, short name).
    ns_info = [
        (ns_test_sentenceBERT,
        ["cand_sentenceBERT_{}".format(i) for i in range(10)] + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"],
        'sentenceBERT')
    ]
    examples = []
    examples_cols = ["context", "relevant_response"] + \
        [column for _, columns, _ in ns_info for column in columns]
    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_rows = []
    for row in tqdm(test.itertuples(index=False), total=len(test)):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]

        for sampler, _, _ in ns_info:
            ns_candidates, _, had_relevant, rank_relevant, _ = sampler.sample(context, [relevant_response])
            # extend() avoids reusing the sampler's loop variable for the
            # individual candidates.
            instance.extend(ns_candidates)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            r10 = 1 if had_relevant else 0
            r1 = 1 if rank_relevant == 0 else 0
            recall_rows.append([r10, r1])
        examples.append(instance)

    recall_df = pd.DataFrame(recall_rows, columns=["R@10", "R@1"])
    examples_df = pd.DataFrame(examples, columns=examples_cols)

    retrieved_cols = [c for c in examples_df.columns if 'retrieved_relevant' in c]
    recall_at_10 = examples_df[retrieved_cols].sum()/examples_df.shape[0]
    logging.info("R@10: {}".format(recall_at_10))
    wandb.log({'R@10': recall_at_10.values[0]})

    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    recall_at_1 = examples_df[examples_df[rank_col] == 0].shape[0]/examples_df.shape[0]
    logging.info("R@1: {}".format(recall_at_1))
    wandb.log({'R@1': recall_at_1})

    recall_df.to_csv(args.output_dir+"/recall_df_{}_{}_ns_{}_loss_{}.csv".format(args.transformer_model.replace("/", "-"), args.task, ns_description.replace("/", "-"), args.loss), index=False, sep="\t")