def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    tokenizer = BertTokenizerFast.from_pretrained(args.transformer_model)
    # Conversation Response Ranking datasets need special tokens
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]: 
        special_tokens_dict = {'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]'] }
        tokenizer.add_special_tokens(special_tokens_dict)        
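        # [UTTERANCE_SEP] and [TURN_SEP] mark utterance and turn boundaries in the
        # concatenated dialogue context, so the tokenizer must treat them as single tokens.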

    #Load datasets
    train = pd.read_csv(args.data_folder+args.task+"/train.tsv", sep="\t", 
                        nrows=args.sample_data if args.sample_data != -1 else None)
    valid = pd.read_csv(args.data_folder+args.task+"/valid.tsv", sep="\t",
                        nrows=args.sample_data if args.sample_data != -1 else None)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+args.task+"/anserini_train/", args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+args.task+"/train_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)        

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(list(valid[document_col].values) + list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+args.task+"/anserini_valid/", args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+args.task+"/valid_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)

    train = train.groupby(train.columns[0]).agg(list).reset_index()
    labels = []
    sample = 10000
    max_labels = 0
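    # Build one weak-supervision row per query: a 1.0 label for every relevant
    # document, followed by the negative sampler's scores for the sampled candidates.
    # max_labels tracks the widest row so the CSV header below has enough columns.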
    for idx, row in enumerate(tqdm(train[0:sample].itertuples(index=False), total=min(sample, len(train)))):
        query = row[0]
        relevant_documents = row[1]
        query_labels = []
        for relevant_document in relevant_documents:
            query_labels.append(1.0)
        ns_candidates, ns_scores, _, _, _= ns_train.sample(query, relevant_documents)
        for i, ns in enumerate(ns_candidates):
            query_labels.append(ns_scores[i])
        labels.append(query_labels)
        if max_labels < len(query_labels):
            max_labels = len(query_labels)
    df_labels = pd.DataFrame(labels, columns = ["candidate_{}".format(i) for i in range(max_labels)])
    df_labels.to_csv(args.output_dir+"/{}_weak_supervision.csv".format(args.task), index=False)
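
# A minimal sketch, assuming pandas is imported as pd as in these examples, for reading
# back the weak-supervision file written above; the "candidate_i" column layout follows
# the to_csv call in run_experiment.
def load_weak_supervision_labels(path):
    """Return one list of labels per query, dropping the NaN padding of shorter rows."""
    df = pd.read_csv(path)
    return [row.dropna().tolist() for _, row in df.iterrows()]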
# Example #2
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    #Load datasets
    ## Conversation Response Ranking
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]: 
        add_turn_separator = (args.task != "ubuntu_dstc8")  # Ubuntu data has several utterances from the same user in the context.
        train = preprocess_crr.read_crr_tsv_as_df(args.data_folder+args.task+"/train.tsv", args.sample_data, add_turn_separator)
        valid = preprocess_crr.read_crr_tsv_as_df(args.data_folder+args.task+"/valid.tsv", args.sample_data, add_turn_separator)
        special_tokens_dict = {'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]'] }
        tokenizer.add_special_tokens(special_tokens_dict)
    ## Similar Question Retrieval and Passage Retrieval
    elif args.task in ["qqp", "linkso", "trec2020pr"]:
        if args.sample_data == -1: args.sample_data=None            
        train = pd.read_csv(args.data_folder+args.task+"/train.tsv", sep="\t", nrows=args.sample_data)
        valid = pd.read_csv(args.data_folder+args.task+"/valid.tsv", sep="\t", nrows=args.sample_data)
    elif args.task=="scisumm":
        train, valid = preprocess_scisumm.transform_to_dfs("../data/Training-Set-2019/Task1/From-Training-Set-2018/")

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+"/"+args.task+"/anserini_train/", args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+"/"+args.task+"/train_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(list(valid[document_col].values) + list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+"/"+args.task+"/anserini_valid/", args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+"/"+args.task+"/valid_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(train, valid, valid,
                                tokenizer, ns_train, ns_val,
                                'classification', args.train_batch_size, 
                                args.val_batch_size, args.max_seq_len, 
                                args.sample_data, args.data_folder + args.task)

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders()


    #Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(model, train_loader, val_loader, test_loader, 
                                 args.num_ns_eval, "classification", tokenizer,
                                 args.validate_every_epochs, args.num_validation_instances,
                                 args.num_epochs, args.lr, args.sacred_ex)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder, args.task))
    trainer.fit()

    #Predict for test
    logging.info("Predicting")
    preds, labels = trainer.test()
    res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1',
                    'R_10@2',
                    'R_10@5',
                    'R_2@1'])
    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))

    #Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions.csv", index=False)

    labels_df = pd.DataFrame(labels, columns=["label_"+str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir+"/"+args.run_id+"/labels.csv", index=False)

    #Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(), args.output_dir+"/"+args.run_id+"/model")

    #In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:  
        logging.info("Predicting with dropout.")      
        preds, uncertainties, labels, foward_passes_preds = trainer.test_with_dropout(args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info("Test (w. dropout and {} foward passes) {} : {:4f}".format(args.num_foward_prediction_passes, metric, v))
        
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions_with_dropout.csv", index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
            preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions_with_dropout_f_pass_{}.csv".format(i), index=False)

        labels_df = pd.DataFrame(labels, columns=["label_"+str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir+"/"+args.run_id+"/labels.csv", index=False)
        
        uncertainties_df = pd.DataFrame(uncertainties, columns=["uncertainty_"+str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir+"/"+args.run_id+"/uncertainties.csv", index=False)

    return trainer.best_ndcg
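
# A minimal offline re-scoring sketch, assuming predictions.csv and labels.csv were written
# by run_experiment above and that evaluate_and_aggregate accepts the same list-of-lists
# format; rows are NaN-padded when queries have different candidate counts.
def rescore_saved_run(run_dir, metrics=None):
    preds = pd.read_csv(run_dir + "/predictions.csv").values.tolist()
    labels = pd.read_csv(run_dir + "/labels.csv").values.tolist()
    return results_analyses_tools.evaluate_and_aggregate(preds, labels, metrics or ['R_10@1'])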
# Example #3
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    # Conversation Response Ranking datasets need special tokens
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]:
        special_tokens_dict = {
            'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
        }
        tokenizer.add_special_tokens(special_tokens_dict)

    #Load datasets
    train = pd.read_csv(
        args.data_folder + args.task + "/train.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    valid = pd.read_csv(
        args.data_folder + args.task + "/valid.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(
            list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/train_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/valid_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, valid, tokenizer, ns_train, ns_val, 'classification',
        args.train_batch_size, args.val_batch_size, args.max_seq_len,
        args.sample_data, args.data_folder + args.task)

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
    )

    #Instantiate transformer model to be used
    model = pointwise_bert.BertForPointwiseLearning.from_pretrained(
        args.transformer_model,
        loss_function=args.loss_function,
        smoothing=args.smoothing)

    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model,
        train_loader,
        val_loader,
        test_loader,
        args.num_ns_eval,
        "classification",
        tokenizer,
        args.validate_every_epochs,
        args.num_validation_batches,
        args.num_epochs,
        args.lr,
        args.sacred_ex,
        args.validate_every_steps,
        validation_metric='R_10@1',
        num_training_instances=args.num_training_instances)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    #Predict for test
    logging.info("Predicting for the validation set.")
    preds, labels, softmax_logits = trainer.test()
    res = results_analyses_tools.evaluate_and_aggregate(
        preds, labels, ['R_10@1'])
    for metric, v in res.items():
        logging.info("Test {} : {:3f}".format(metric, v))
        wandb.log({'step': 0, "dev_" + metric: v})

    #Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(
        preds,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv",
                    index=False)

    softmax_df = pd.DataFrame(
        softmax_logits,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    softmax_df.to_csv(args.output_dir + "/" + args.run_id +
                      "/predictions_softmax.csv",
                      index=False)

    labels_df = pd.DataFrame(
        labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                     index=False)

    #Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(),
                   args.output_dir + "/" + args.run_id + "/model")

    #In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:
        logging.info("Predicting with MC dropout for the validation set.")
        preds, labels, softmax_logits, foward_passes_preds, uncertainties = trainer.test_with_dropout(
            args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(
            preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info(
                "Test (w. dropout and {} foward passes) {} : {:3f}".format(
                    args.num_foward_prediction_passes, metric, v))

        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" + args.run_id +
                        "/predictions_with_dropout.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" + args.run_id +
                          "/predictions_with_dropout_softmax.csv",
                          index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(
                args.output_dir + "/" + args.run_id +
                "/predictions_with_dropout_f_pass_{}.csv".format(i),
                index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                         index=False)

        uncertainties_df = pd.DataFrame(
            uncertainties,
            columns=["uncertainty_" + str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir + "/" + args.run_id +
                                "/uncertainties.csv",
                                index=False)

    return trainer.best_eval_metric
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task",
                        default=None,
                        type=str,
                        required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder to output raw negative_samples")
    parser.add_argument(
        "--anserini_folder",
        default="",
        type=str,
        required=False,
        help=
        "Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection"
    )
    parser.add_argument(
        "--sample_data",
        default=-1,
        type=int,
        required=False,
        help=
        "Amount of data to sample for training and eval. If no sampling required use -1."
    )
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        required=False,
                        help="random seed")
    parser.add_argument(
        "--num_ns_train",
        default=1,
        type=int,
        required=False,
        help="Number of negatively sampled documents to use during training")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s",
                        handlers=[logging.StreamHandler()])

    #Load datasets
    add_turn_separator = (
        args.task != "ubuntu_dstc8"
    )  # Ubuntu data has several utterances from the same user in the context.
    train = preprocess_crr.read_crr_tsv_as_df(
        args.data_folder + args.task + "/train.tsv", args.sample_data,
        add_turn_separator)
    valid = preprocess_crr.read_crr_tsv_as_df(
        args.data_folder + args.task + "/valid.tsv", args.sample_data,
        add_turn_separator)

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    ns_valid_random = negative_sampling.RandomNegativeSampler(
        list(train["response"].values) + list(valid["response"].values),
        args.num_ns_train)
    ns_valid_bm25 = negative_sampling.BM25NegativeSamplerPyserini(
        list(train["response"].values) + list(valid["response"].values),
        args.num_ns_train,
        args.data_folder + args.task + "/anserini_valid/",
        args.sample_data,
        args.anserini_folder,
        set_rm3=True)
    ns_valid_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(
        list(train["response"].values) + list(valid["response"].values),
        args.num_ns_train,
        args.data_folder + args.task + "/valid_sentenceBERTembeds",
        args.sample_data, args.data_folder + args.task +
        "/bert-base-cased_{}".format(args.task))  #pre-trained embedding

    examples = []
    examples_cols = ["context", "relevant_response"] + \
        ["cand_random_{}".format(i) for i in range(args.num_ns_train)] + \
        ["random_retrieved_relevant", "random_rank"]+  \
        ["cand_bm25_{}".format(i) for i in range(args.num_ns_train)] + \
        ["bm25_retrieved_relevant", "bm25_rank"]+  \
        ["cand_sentenceBERT_{}".format(i) for i in range(args.num_ns_train)] + \
        ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"]

    logging.info("Retrieving candidates using random, bm25 and sentenceBERT.")
    for idx, row in enumerate(
            tqdm(valid.itertuples(index=False), total=len(valid))):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]

        for ns_name, ns in [("random", ns_valid_random),
                            ("bm25", ns_valid_bm25),
                            ("sentenceBERT", ns_valid_sentenceBERT)]:
            ns_candidates, had_relevant, rank_relevant = ns.sample(
                context, relevant_response)
            for ns in ns_candidates:
                instance.append(ns)
            instance.append(had_relevant)
            instance.append(rank_relevant)
        examples.append(instance)

    examples_df = pd.DataFrame(examples, columns=examples_cols)
    examples_df.to_csv(args.output_dir +
                       "/_all_negative_samples_{}.csv".format(args.task),
                       index=False,
                       sep="\t")
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    # Load datasets
    ## Conversation Response Ranking
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]:
        add_turn_separator = (
            args.task != "ubuntu_dstc8"
        )  # Ubuntu data has several utterances from the same user in the context.
        train = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/train.tsv", args.sample_data,
            add_turn_separator)
        valid = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/valid.tsv", args.sample_data,
            add_turn_separator)
        special_tokens_dict = {
            'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
        }
        tokenizer.add_special_tokens(special_tokens_dict)
    ## Similar Question Retrieval and Passage Retrieval
    elif args.task in ["qqp", "linkso", "trec2020pr"]:
        if args.sample_data == -1: args.sample_data = None
        train = pd.read_csv(args.data_folder + args.task + "/train.tsv",
                            sep="\t",
                            nrows=args.sample_data)
        valid = pd.read_csv(args.data_folder + args.task + "/valid.tsv",
                            sep="\t",
                            nrows=args.sample_data)
    elif args.task == "scisumm":
        train, valid = preprocess_scisumm.transform_to_dfs(
            "../data/Training-Set-2019/Task1/From-Training-Set-2018/")
    elif args.task == "scisumm_ranked":
        train, valid, test = preprocess_scisumm_ranked.transform_to_dfs(
            args.path_to_ranked_file, args.path_to_ranked_test,
            args.path_to_ranked_dev)

    # Choose the negative candidate sampler
    document_col = train.columns[1]
    ns_train = None
    ns_val = None
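    # ns_train / ns_val stay None when an unrecognized sampler name is passed;
    # only 'random', 'bm25' and 'sentenceBERT' are handled below.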
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(
            list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + "/" + args.task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + "/" + args.task + "/train_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)
    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + "/" + args.task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + "/" + args.task + "/valid_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    # Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, test, tokenizer, ns_train, ns_val, 'classification',
        args.train_batch_size, args.val_batch_size, args.max_seq_len,
        args.sample_data, args.data_folder + "/" + args.task)
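    # Note: `test` above is only defined by the scisumm_ranked branch, so this
    # particular example assumes that task.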
    if args.task == "scisumm_ranked":
        with_ranked_list = True
    else:
        with_ranked_list = False
    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
        with_ranked_list)

    # Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(
        args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    # Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model, train_loader, val_loader, test_loader, args.num_ns_eval,
        "classification", tokenizer, args.validate_every_epochs,
        args.num_validation_instances, args.num_epochs, args.lr,
        args.sacred_ex)

    # Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    # Predict for test
    logging.info("Predicting")
    preds, labels, doc_ids, all_queries, preds_without_acc = trainer.validate()
    res = results_analyses_tools.evaluate_and_aggregate(
        preds, labels, [
            'R_10@1', 'R_10@2', 'R_10@5', 'R_2@1', 'accuracy_0.3',
            'accuracy_0.3_upto_1', 'precision_0.3', 'recall_0.3',
            'f_score_0.3', 'accuracy_0.4', 'accuracy_0.4_upto_1',
            'precision_0.4', 'recall_0.4', 'f_score_0.4', 'accuracy_0.5',
            'accuracy_0.5_upto_1', 'precision_0.5', 'recall_0.5', 'f_score_0.5'
        ])
    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))

    # Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(
        preds,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv",
                    index=False)

    labels_df = pd.DataFrame(
        labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                     index=False)

    new_preds = list((np.array(preds_without_acc) > 0.3).astype(int))
    d = {
        'query': all_queries,
        'doc_id': doc_ids,
        'label': new_preds,
        'similiarity': preds_without_acc
    }

    df_doc_ids = pd.DataFrame(d)
    df_doc_ids_ones = df_doc_ids[df_doc_ids['label'] == 1]
    df_doc_ids_ones = df_doc_ids_ones.groupby('query').agg(list).reset_index()
    df_doc_ids_non_ones = df_doc_ids.groupby('query').agg(list).reset_index()
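    # For queries where no candidate crossed the 0.3 threshold, fall back to the
    # single highest-similarity candidate so every query keeps at least one doc_id.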
    new_df = []
    for i, row in df_doc_ids_non_ones.iterrows():
        if all([v == 0 for v in row['label']]):
            # Sort descending so index 0 really is the highest-similarity candidate.
            highest_value = [
                x for _, x in sorted(zip(row['similiarity'], row['doc_id']),
                                     key=lambda pair: pair[0],
                                     reverse=True)
            ]
            highest_value_sim = sorted(row['similiarity'], reverse=True)

            row['label'] = [1]
            row['doc_id'] = [highest_value[0]]
            row['similiarity'] = [highest_value_sim[0]]

            new_df.append(row)

    result = pd.concat([df_doc_ids_ones, pd.DataFrame(new_df)])
    result.to_csv(args.output_dir + "/" + args.run_id + "/doc_ids_dev.csv",
                  index=False,
                  sep='\t')

    # predict on the test set
    preds, labels, doc_ids, all_queries, preds_without_acc = trainer.test()

    new_preds = list((np.array(preds_without_acc) > 0.3).astype(int))
    d = {
        'query': all_queries,
        'doc_id': doc_ids,
        'label': new_preds,
        'similiarity': preds_without_acc
    }

    df_doc_ids = pd.DataFrame(d)
    df_doc_ids_ones = df_doc_ids[df_doc_ids['label'] == 1]
    df_doc_ids_ones = df_doc_ids_ones.groupby('query').agg(list).reset_index()
    df_doc_ids_non_ones = df_doc_ids.groupby('query').agg(list).reset_index()
    new_df = []
    for i, row in df_doc_ids_non_ones.iterrows():
        if all([v == 0 for v in row['label']]):
            # Same fallback as above: pick the highest-similarity candidate (descending sort).
            highest_value = [
                x for _, x in sorted(zip(row['similiarity'], row['doc_id']),
                                     key=lambda pair: pair[0],
                                     reverse=True)
            ]
            highest_value_sim = sorted(row['similiarity'], reverse=True)

            row['label'] = [1]
            row['doc_id'] = [highest_value[0]]
            row['similiarity'] = [highest_value_sim[0]]

            new_df.append(row)

    result = pd.concat([df_doc_ids_ones, pd.DataFrame(new_df)])
    result.to_csv(args.output_dir + "/" + args.run_id + "/doc_ids_test.csv",
                  index=False,
                  sep='\t')

    # Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(),
                   args.output_dir + "/" + args.run_id + "/model")

    # In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:
        logging.info("Predicting with dropout.")
        preds, uncertainties, labels, foward_passes_preds = trainer.test_with_dropout(
            args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(
            preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info(
                "Test (w. dropout and {} foward passes) {} : {:4f}".format(
                    args.num_foward_prediction_passes, metric, v))

        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" + args.run_id +
                        "/predictions_with_dropout.csv",
                        index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(
                args.output_dir + "/" + args.run_id +
                "/predictions_with_dropout_f_pass_{}.csv".format(i),
                index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                         index=False)

        uncertainties_df = pd.DataFrame(
            uncertainties,
            columns=["uncertainty_" + str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir + "/" + args.run_id +
                                "/uncertainties.csv",
                                index=False)

    return trainer.best_ndcg
# Example #6
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output raw negative_samples")
    parser.add_argument("--anserini_folder", default="", type=str, required=True,
                        help="Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sample_data", default=-1, type=int, required=False,
                         help="Amount of data to sample for training and eval. If no sampling required use -1.")
    parser.add_argument("--seed", default=42, type=str, required=False,
                        help="random seed")
    parser.add_argument("--num_ns", default=1, type=int, required=False,
                        help="Number of negatively sampled documents to use during training")
    parser.add_argument("--sentence_bert_model", type=str, required=False, default="all-MiniLM-L6-v2",
                        help="Model to calculate sentence embeddings with for sentenceBERT negative sampling.")

    parser.add_argument("--num_expansion_terms", default=10, type=int, required=False,
                        help="expansion terms for rm3")
    parser.add_argument("--num_expansion_docs", default=10, type=int, required=False,
                        help="expansion docs for rm3")
    parser.add_argument("--original_query_weight", default=0.5, type=float, required=False,
                        help="original query weight for rm3")   

    args = parser.parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.StreamHandler()
        ]
    )
    #Load datasets
    train = pd.read_csv(args.data_folder+args.task+"/train.tsv", sep="\t", 
                        nrows=args.sample_data if args.sample_data != -1 else None)
    test = pd.read_csv(args.data_folder+args.task+"/test.tsv", sep="\t",
                        nrows=args.sample_data if args.sample_data != -1 else None)
    # embed()

    # ns_test_random = negative_sampling.RandomNegativeSampler(list(train["response"].values)+list(test["response"].values), args.num_ns)    
    # ns_test_bm25 = negative_sampling.BM25NegativeSamplerPyserini(list(train["response"].values)+list(test["response"].values), args.num_ns,
    #             args.data_folder+args.task+"/anserini_test_{}/".format(args.sample_data), args.sample_data, args.anserini_folder)
    ns_test_bm25_rm3 = negative_sampling.BM25NegativeSamplerPyserini(list(train["response"].values)+list(test["response"].values), args.num_ns,
               args.data_folder+args.task+"/anserini_test_{}/".format(args.sample_data), args.sample_data, args.anserini_folder, set_rm3=True, 
               num_expansion_docs=args.num_expansion_docs, num_expansion_terms=args.num_expansion_terms, original_query_weight=args.original_query_weight)
    # ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(list(train["response"].values)+list(test["response"].values), args.num_ns, 
    #                args.data_folder+args.task+"/test_sentenceBERTembeds", args.sample_data, args.sentence_bert_model,
    #                use_cache_for_embeddings=False)

    ns_info = [
        # (ns_test_random, ["cand_random_{}".format(i) for i in range(args.num_ns)] + ["random_retrieved_relevant", "random_rank"], 'random'),
        # (ns_test_bm25, ["cand_bm25_{}".format(i) for i in range(args.num_ns)] + ["bm25_retrieved_relevant", "bm25_rank"], 'bm25'),
        (ns_test_bm25_rm3,["cand_bm25rm3_{}".format(i) for i in range(args.num_ns)] + ["bm25rm3_retrieved_relevant", "bm25rm3_rank"], 'bm25rm3'),        
        # (ns_test_sentenceBERT, ["cand_sentenceBERT_{}".format(i) for i in range(args.num_ns)] + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"], 'sentenceBERT')
    ]

    examples = []
    examples_cols = ["context", "relevant_response"] + \
        reduce(lambda x,y:x+y, [t[1] for t in ns_info])
    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_df = []
    for idx, row in enumerate(tqdm(test.itertuples(index=False), total=len(test))):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]

        for ns, _ , ns_name in ns_info:
            ns_candidates, scores, had_relevant, rank_relevant, _ = ns.sample(context, [relevant_response])
            for ns in ns_candidates:
                instance.append(ns)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            r10 = 1 if had_relevant else 0
            r1 = 1 if rank_relevant == 0 else 0
            recall_df.append([r10, r1, ns_name])
        examples.append(instance)

    recall_df  = pd.DataFrame(recall_df, columns = ["R@10", "R@1", "NS"])
    # recall_df[recall_df["NS"]=="random"][["R@10", "R@1"]].to_csv(args.output_dir+"/recall_df_random_{}.csv".format(args.task), index=False, sep="\t")
    # recall_df[recall_df["NS"]=="bm25rm3"][["R@10", "R@1"]].to_csv(args.output_dir+"/recall_df_bm25rm3_{}.csv".format(args.task), index=False, sep="\t")
    examples_df = pd.DataFrame(examples, columns=examples_cols)
    print("R@10: {}".format(examples_df[[c for c in examples_df.columns if 'retrieved_relevant' in c]].sum()/examples_df.shape[0]))
    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    print("R@1: {}".format(examples_df[examples_df[rank_col]==0].shape[0]/examples_df.shape[0]))
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    #Load datasets
    train = pd.read_csv(
        args.data_folder + args.task + "/train_test.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    valid = pd.read_csv(
        args.data_folder + args.task + "/valid_test.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    special_tokens_dict = {
        'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    ns_train = negative_sampling.BM25NegativeSamplerPyserini(
        list(train[document_col].values), args.num_ns_train,
        args.data_folder + args.task + "/anserini_train/", args.sample_data,
        args.anserini_folder)

    ns_val_random = negative_sampling.RandomNegativeSampler(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval)
    ns_val_bm25 = negative_sampling.BM25NegativeSamplerPyserini(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval, args.data_folder + args.task + "/anserini_valid/",
        args.sample_data, args.anserini_folder)
    ns_val_bert_sentence = negative_sampling.SentenceBERTNegativeSampler(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval,
        args.data_folder + args.task + "/valid_sentenceBERTembeds",
        args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    cross_ns_val = {}
    cross_ns_train = {}
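    # Build one train/validation loader per evaluation sampling strategy; training
    # below always uses the BM25 loaders, while the random, BM25 and sentenceBERT
    # validation loaders are kept for the cross-NS predictions further down.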
    for (ns_name, ns_val) in [("random", ns_val_random), ("bm25", ns_val_bm25),
                              ("sentenceBERT", ns_val_bert_sentence)]:
        dataloader = dataset.QueryDocumentDataLoader(
            train, valid, valid, tokenizer, ns_train, ns_val, 'classification',
            args.train_batch_size, args.val_batch_size, args.max_seq_len,
            args.sample_data, args.data_folder + args.task)
        train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
        )
        cross_ns_val[ns_name] = val_loader
        cross_ns_train[ns_name] = train_loader

    #Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(
        args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model, cross_ns_train["bm25"], cross_ns_val["bm25"],
        cross_ns_val["bm25"], args.num_ns_eval, "classification", tokenizer,
        args.validate_every_epochs, args.num_validation_batches,
        args.num_epochs, args.lr, args.sacred_ex)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    #Cross-NS predictions
    for ns_index, ns_name in enumerate(["random", "bm25", "sentenceBERT"]):
        logging.info("Predicting for NS {}".format(ns_name))
        os.makedirs(args.output_dir + "/" + str(int(args.run_id) + ns_index),
                    exist_ok=True)
        with open(
                args.output_dir + "/" + str(int(args.run_id) + ns_index) +
                "/config.json", "w") as f:
            config_w = {'args': vars(args)}
            config_w['args']['test_dataset'] = args.task
            config_w['args']['train_negative_sampler'] = 'bm25'
            config_w['args']['test_negative_sampler'] = ns_name
            if 'sacred_ex' in config_w['args']:
                del config_w['args']['sacred_ex']
            json.dump(config_w, f, indent=4)
        # preds, labels, softmax_logits = trainer.test()
        trainer.num_validation_batches = -1  # no sample
        preds, labels, softmax_logits = trainer.predict(cross_ns_val[ns_name])

        #Saving predictions and labels to a file
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" +
                        str(int(args.run_id) + ns_index) + "/predictions.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" +
                          str(int(args.run_id) + ns_index) +
                          "/predictions_softmax.csv",
                          index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" +
                         str(int(args.run_id) + ns_index) + "/labels.csv",
                         index=False)

        #Saving model to a file
        if args.save_model:
            torch.save(
                model.state_dict(), args.output_dir + "/" +
                str(int(args.run_id) + ns_index) + "/model")

        #In case we want to get uncertainty estimations at prediction time
        if args.predict_with_uncertainty_estimation:
            logging.info("Predicting with dropout.")
            trainer.num_validation_batches = -1  # no sample
            preds, labels, softmax_logits, foward_passes_preds, uncertainties = \
                trainer.predict_with_uncertainty(cross_ns_val[ns_name], args.num_foward_prediction_passes)

            max_preds_column = max([len(l) for l in preds])
            preds_df = pd.DataFrame(preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(args.output_dir + "/" +
                            str(int(args.run_id) + ns_index) +
                            "/predictions_with_dropout.csv",
                            index=False)

            softmax_df = pd.DataFrame(softmax_logits,
                                      columns=[
                                          "prediction_" + str(i)
                                          for i in range(max_preds_column)
                                      ])
            softmax_df.to_csv(args.output_dir + "/" +
                              str(int(args.run_id) + ns_index) +
                              "/predictions_with_dropout_softmax.csv",
                              index=False)

            for i, f_pass_preds in enumerate(foward_passes_preds):
                preds_df = pd.DataFrame(f_pass_preds,
                                        columns=[
                                            "prediction_" + str(i)
                                            for i in range(max_preds_column)
                                        ])
                preds_df.to_csv(
                    args.output_dir + "/" + str(int(args.run_id) + ns_index) +
                    "/predictions_with_dropout_f_pass_{}.csv".format(i),
                    index=False)

            labels_df = pd.DataFrame(
                labels,
                columns=["label_" + str(i) for i in range(max_preds_column)])
            labels_df.to_csv(args.output_dir + "/" +
                             str(int(args.run_id) + ns_index) + "/labels.csv",
                             index=False)

            uncertainties_df = pd.DataFrame(
                uncertainties,
                columns=[
                    "uncertainty_" + str(i) for i in range(max_preds_column)
                ])
            uncertainties_df.to_csv(args.output_dir + "/" +
                                    str(int(args.run_id) + ns_index) +
                                    "/uncertainties.csv",
                                    index=False)

    #Cross-dataset predictions
    cross_datasets = set(["msdialog", "ubuntu_dstc8", "mantis"]) - set(
        [args.task])
    cross_datasets = sorted(list(cross_datasets))
    cross_data_val_dataloader = {}
    for cross_task in cross_datasets:
        # add_turn_separator is not defined elsewhere in this function, so derive it here
        # (Ubuntu data has several utterances from the same user in the context).
        add_turn_separator = (cross_task != "ubuntu_dstc8")
        train_cross = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + cross_task + "/train.tsv", args.sample_data,
            add_turn_separator)
        valid_cross = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + cross_task + "/valid.tsv", args.sample_data,
            add_turn_separator)
        ns_train_cross = negative_sampling.BM25NegativeSamplerPyserini(
            list(train_cross[document_col].values), args.num_ns_train,
            args.data_folder + cross_task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
        ns_val_bm25_cross = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid_cross[document_col].values) +
            list(train_cross[document_col].values), args.num_ns_eval,
            args.data_folder + cross_task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
        dataloader = dataset.QueryDocumentDataLoader(
            train_cross, valid_cross, valid_cross, tokenizer, ns_train_cross,
            ns_val_bm25_cross, 'classification', args.train_batch_size,
            args.val_batch_size, args.max_seq_len, args.sample_data,
            args.data_folder + cross_task)
        _, val_loader, _ = dataloader.get_pytorch_dataloaders()
        cross_data_val_dataloader[cross_task] = val_loader

    for task_index, cross_task in enumerate(cross_datasets):
        logging.info("Predicting for dataset {}".format(cross_task))
        os.makedirs(args.output_dir + "/" +
                    str(int(args.run_id) + ns_index + task_index + 1),
                    exist_ok=True)
        with open(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/config.json", "w") as f:
            config_w = {'args': vars(args)}
            config_w['args']['test_dataset'] = cross_task
            config_w['args']['train_negative_sampler'] = 'bm25'
            config_w['args']['test_negative_sampler'] = 'bm25'
            if 'sacred_ex' in config_w['args']:
                del config_w['args']['sacred_ex']
            json.dump(config_w, f, indent=4)
        # preds, labels, softmax_logits = trainer.test()
        trainer.num_validation_batches = -1  # no sample
        preds, labels, softmax_logits = trainer.predict(
            cross_data_val_dataloader[cross_task])

        #Saving predictions and labels to a file
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" +
                        str(int(args.run_id) + ns_index + task_index + 1) +
                        "/predictions.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" +
                          str(int(args.run_id) + ns_index + task_index + 1) +
                          "/predictions_softmax.csv",
                          index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" +
                         str(int(args.run_id) + ns_index + task_index + 1) +
                         "/labels.csv",
                         index=False)

        #Saving model to a file
        if args.save_model:
            torch.save(
                model.state_dict(), args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) + "/model")

        #In case we want to get uncertainty estimations at prediction time
        if args.predict_with_uncertainty_estimation:
            logging.info("Predicting with dropout.")
            preds, labels, softmax_logits, foward_passes_preds, uncertainties = \
                trainer.predict_with_uncertainty(cross_data_val_dataloader[cross_task], args.num_foward_prediction_passes)

            max_preds_column = max([len(l) for l in preds])
            preds_df = pd.DataFrame(preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(args.output_dir + "/" +
                            str(int(args.run_id) + ns_index + task_index + 1) +
                            "/predictions_with_dropout.csv",
                            index=False)

            softmax_df = pd.DataFrame(softmax_logits,
                                      columns=[
                                          "prediction_" + str(i)
                                          for i in range(max_preds_column)
                                      ])
            softmax_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/predictions_with_dropout_softmax.csv",
                index=False)

            for i, f_pass_preds in enumerate(foward_passes_preds):
                preds_df = pd.DataFrame(f_pass_preds,
                                        columns=[
                                            "prediction_" + str(i)
                                            for i in range(max_preds_column)
                                        ])
                preds_df.to_csv(
                    args.output_dir + "/" +
                    str(int(args.run_id) + ns_index + task_index + 1) +
                    "/predictions_with_dropout_f_pass_{}.csv".format(i),
                    index=False)

            labels_df = pd.DataFrame(
                labels,
                columns=["label_" + str(i) for i in range(max_preds_column)])
            labels_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/labels.csv",
                index=False)

            uncertainties_df = pd.DataFrame(
                uncertainties,
                columns=[
                    "uncertainty_" + str(i) for i in range(max_preds_column)
                ])
            uncertainties_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/uncertainties.csv",
                index=False)
    return 0.0
    def get_examples(self, filename, ns_name, anserini_folder, sent_bert_model, loss, output_dir, input_pair=True, eval_data=False,
        denoise_negatives=False, num_ns_for_denoising=100, generative_model = 'facebook/blenderbot-3B', remove_cand_subsets=True,
        last_utterance_only=False, use_external_corpus=False):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).
        """
        filepath = os.path.join(self.dataset_folder, filename)
        self.data = pd.read_csv(filepath, sep="\t")


        if denoise_negatives:
            num_ns = num_ns_for_denoising
        else:
            num_ns = 10

        candidates = list(self.data["response"].values)
        if use_external_corpus:
            external_datasets = [
                'movie-corpus',
                'wiki-corpus',
                'subreddit-Ubuntu',
                'subreddit-microsoft',
                'subreddit-apple',
                'subreddit-Database',
                'subreddit-DIY',
                'subreddit-electronics',
                'subreddit-ENGLISH',
                'subreddit-gis',
                'subreddit-Physics',
                'subreddit-scifi',
                'subreddit-statistics',
                'subreddit-travel',
                'subreddit-worldbuilding'
            ]
            for ds_name in external_datasets:
                corpus = Corpus(download(ds_name))
                corpus.print_summary_stats()
                for utt in corpus.iter_utterances():
                    if utt.text != "":
                        candidates.append(utt.text)

        if ns_name == "random" or eval_data:
            self.negative_sampler = negative_sampling.RandomNegativeSampler(candidates, num_ns)
        elif ns_name == "bm25":
            index_folder = "/anserini_train_-1/"
            if use_external_corpus:
                index_folder = index_folder.replace("train", "train_expanded_")
            self.negative_sampler = negative_sampling.BM25NegativeSamplerPyserini(candidates, num_ns,
                self.dataset_folder+index_folder, -1, anserini_folder)
        elif ns_name == "sentence_transformer":
            self.negative_sampler = negative_sampling.SentenceBERTNegativeSampler(candidates, num_ns, 
                self.dataset_folder+"/train_sentenceBERTembeds", -1, sent_bert_model, large_index=use_external_corpus)
        elif ns_name == "generative":
            self.negative_sampler = negative_sampling.GenerativeNegativeSamplerForDialogue(num_ns, generative_model)
            
        if loss == 'MarginMSELoss':
            self.negative_sampler.score_relevant_docs = True
        if loss == "ContrastiveLoss" and not eval_data:
            input_pair = False
        if loss == "OnlineContrastiveLoss" and not eval_data:
            input_pair = False
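        # MarginMSELoss needs the relevant document scored as well so the margin target
        # rel_score - score_ns can be computed below; the contrastive losses consume single
        # (context, candidate) pairs, hence input_pair is disabled for training data.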
        examples = []
        scores_df = []

        # Code used to annotate some samples
        # samples_to_annotate = []
        # self.data = self.data.sample(200, random_state=42)
        # self.negative_sampler.score_relevant_docs = True
        count_ns_part_of_context = 0
        for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
            context = row[0]
            if last_utterance_only:
                if 'msdialog' in self.dataset_folder:
                    context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
                else:
                    context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()

            relevant_response = row[1]
            if not input_pair:
                examples.append(InputExample(guid=filename+str(idx)+"_pos",
                    texts=[context, relevant_response], label=1.0))
            if ns_name == "bm25" and not eval_data:
                ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(context, [relevant_response], max_query_len = 512, normalize_scores = False, rel_doc_id = str(idx))
            else:
                ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(context, [relevant_response])
            rel_score = rel_scores[0]

            if denoise_negatives:
                zipped = zip(ns_candidates[-10:], ns_scores[-10:])
            else: 
                zipped = zip(ns_candidates, ns_scores)

            for ns, score_ns in zipped:
                if remove_cand_subsets and ns.replace("<<<AGENT>>>: ", "") in context:
                    count_ns_part_of_context+=1
                else: 
                    if input_pair:
                        examples.append(InputExample(texts=[context, relevant_response, ns], label=float(rel_score-score_ns)))
                        scores_df.append(rel_score-score_ns)
                        # samples_to_annotate.append([self.dataset_folder.split("/")[-1], ns_name, context, relevant_response, ns, rel_score, score_ns])
                    else:
                        examples.append(InputExample(guid=filename+str(idx)+"_neg", 
                            texts=[context, ns], label=0.0))
        logging.info("{} {} count of ns which are part of the context: {} out of {}.".format(self.dataset_folder.split("/")[-1],
         ns_name, count_ns_part_of_context, len(examples)))
        # print(pd.DataFrame(scores_df).describe())
        # pd.DataFrame(samples_to_annotate, columns=['task', 'ns', 'context', 'rel_response', 'negative_sample', 'rel_score', 'score_negative']).\
        #     to_csv(output_dir+"neg_samples_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]), index=False)        

        if loss == 'MarginMSELoss':
            pd.DataFrame(scores_df).to_csv(output_dir+"MarginScores_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]))
        return examples
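
# Hedged usage sketch for get_examples above: the reader class name and its constructor
# are assumptions (only the method signature is taken from the code), and the paths are
# placeholders.
#   reader = CRRDataReader(dataset_folder="data/mantis")
#   train_examples = reader.get_examples("train.tsv", "bm25", anserini_folder="Anserini/",
#                                        sent_bert_model="all-MiniLM-L6-v2",
#                                        loss="MarginMSELoss", output_dir="output/")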