Example 1
def eval_model_for_downstream(model_saved_path):
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 32
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
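    # Label namespace: "false" -> 0, "true" -> 1; "hidden" is remapped to index
    # -2, presumably so hidden/unlabeled examples can be masked out.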
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
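    # Single-logit head (num_labels=1) with sigmoid output: each paragraph is
    # scored independently for relevance rather than softmax-classified.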
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instance = bert_cs_reader.read(train_fitems_list)
    test_instances = bert_cs_reader.read(test_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    # train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))

    # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True)
    # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True)

    cur_test_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True)
    common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")

    print("Test write finished.")
    exit(0)
    # NOTE: the code below is unreachable while the exit(0) above is active,
    # and it depends on the commented-out dev/train iterators and eval_model
    # calls being restored (cur_dev_eval_results_list and
    # cur_train_eval_results_list are otherwise undefined).

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
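
    # For each k, keep the k highest-scoring paragraphs per question; the "UB"
    # variants also append the ground-truth documents from upstream retrieval
    # to estimate an upper bound on downstream performance.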
    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3,
        dev_list)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5,
        dev_list)

    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10,
        dev_list)

    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }

    print(logging_item)

    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
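Example 2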
def eval_model_for_downstream(model_saved_path):
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 64
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 264
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps
    tag = 'test'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    # train_ruleterm_doc_results = common.load_jsonl(
    #     config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl")
    # dev_ruleterm_doc_results = train_ruleterm_doc_results
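    # Each branch loads the upstream document-retrieval results for its split
    # and builds the paragraph-level forward items.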
    if tag == 'dev':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_DEV)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'dev',
            dev_ruleterm_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'train':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_TRAIN)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'train',
            dev_ruleterm_doc_results,
            is_training=True,
            debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'test':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_test.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_TEST)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'test',
            dev_ruleterm_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False)
    else:
        raise NotImplementedError()

    # dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair('train', dev_ruleterm_doc_results,
    #                                                               is_training=True, debug=debug_mode,
    #                                                               ignore_non_verifiable=False)

    # Just to show the information
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)
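    # (ratio=None presumably skips actual down-sampling and only prints the
    # positive/negative statistics.)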

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if debug_mode:
        num_train_optimization_steps = 100  # unused during evaluation; training leftover

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model,
                                       dev_iter,
                                       device_num,
                                       make_int=True,
                                       with_probs=True,
                                       show_progress=True)

    common.save_jsonl(cur_eval_results_list,
                      f"fever_p_level_{tag}_results.jsonl")

    if tag == 'test':
        exit(0)
    # common.save_jsonl(cur_eval_results_list, "fever_p_level_train_results_1.jsonl")

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
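
    # Sweep the probability threshold from 0.5 down to 0.005 at a fixed top-5
    # cutoff to trace the precision/recall trade-off of document selection.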

    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_05 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    # NOTE: the results are re-appended into the same copied_dev_o_dict before
    # each threshold below; this is redundant and is safe only if the helper
    # de-duplicates entries.
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_02 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th0_1 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.1)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_1, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_01 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th00_1 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.01)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th00_1, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_001 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th000_5 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        score_field_name='prob',
        top_k=5,
        filter_value=0.005)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th000_5, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_0005 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    logging_item = {
        'score_0005': score_0005,
        'score_001': score_001,
        'score_01': score_01,
        'score_02': score_02,
        'score_05': score_05,
    }

    print(json.dumps(logging_item, indent=2))
Example 3
def eval_model_for_downstream_ablation(model_saved_path,
                                       doc_top_k=2,
                                       tag='dev'):
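    # Ablation: vary how many top-ranked documents (doc_top_k) from the
    # paragraph-level retriever are passed to the sentence-level selector.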
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load train eval results list
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )
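    # (Paragraph-level scores from a saved checkpoint; the directory name
    # encodes the checkpoint's iteration, epoch, and recall metrics.)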

    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list, is_training=True,
    #                                      debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k,
                                       dev_list,
                                       cur_dev_eval_results_list,
                                       is_training=False,
                                       debug_mode=debug_mode)

    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list, is_training=False,
    #                                     debug_mode=debug_mode)

    if debug_mode:
        eval_frequency = 2

    #     dev_list = dev_list[:10]
    #     dev_fitems_list = dev_fitems_list[:296]
    #     train_fitems_list = train_fitems_list[:300]
    # print(dev_list[-1]['_id'])
    # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # NOTE: the 'train' and 'test' branches below require the commented-out
    # eval-results loading and get_sentence_pair calls above to be restored;
    # as written, only the 'dev' branch defines its forward items.
    if tag == 'train':
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model,
                                                     train_iter,
                                                     device_num,
                                                     with_probs=True,
                                                     show_progress=True)
        common.save_jsonl(
            cur_train_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
        )
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model,
                                                   dev_iter,
                                                   device_num,
                                                   with_probs=True,
                                                   show_progress=True)
        common.save_jsonl(
            cur_dev_eval_results_list_out,
            f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")

    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model,
                                                    test_iter,
                                                    device_num,
                                                    with_probs=True,
                                                    show_progress=True)
        common.save_jsonl(
            cur_test_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl"
        )

    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out,
        copied_dev_o_dict,
        'qid',
        'fid',
        check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.5,
        result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.2,
        result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                         dev_list,
                                         verbose=False)

    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                         dev_list,
                                         verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    common.save_json(
        logging_item,
        f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
Example 4
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
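    # Sentence-level ablation for FEVER: evaluate the sentence selector while
    # varying the number of upstream documents (top_k_doc) it receives.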
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 128
    # est_datasize = 900_000
    tag = 'dev'

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )

    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl"
    )

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)
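    # Build sentence-level forward pairs from the upstream document results;
    # filter_value=0.0 presumably keeps every candidate, so only top_k_doc
    # limits the documents fed to the sentence sampler.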

    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev',
            dev_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train',
            train_upstream_doc_results,
            is_training=True,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test',
            test_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)

    # (The down_sample_neg calls above, with ratio=None, are just to show the
    # positive/negative statistics.)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)

        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           dev_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(
            cur_eval_results_list,
            f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

        cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
            copied_dev_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.2,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_2, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_dev_d_list, dev_list, max_evidence=5)
        score_02 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print("Top_k doc:", top_k_doc)
        print(score_02)
        common.save_json(
            score_02,
            f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}"
        )

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           test_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)
    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)

        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           train_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list,
            copied_train_o_dict,
            'qid',
            'fid',
            check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_train_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.5,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_train_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_train_d_list, train_list, max_evidence=5)
        score_05 = {
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print(score_05)
Example 5
def eval_trainset_for_train_nli(model_path):
    tag = 'test'
    is_training = False
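    # Run the trained sentence selector over the chosen split; is_training
    # controls whether the items carry gold labels (recorded in the output
    # filename as labeled:{is_training}).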

    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 192
    batch_size = 128

    do_lower_case = True

    debug_mode = False
    # debug_mode = True

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset

    train_fitems_list = get_sentences(tag,
                                      is_training=is_training,
                                      debug=debug_mode)
    est_datasize = len(train_fitems_list)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_path))

    print("Estimated training size", est_datasize)
    print("Estimated forward steps:", est_datasize / forward_size)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_instance = bert_cs_reader.read(train_fitems_list)
    train_iter = biterator(train_instance, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model,
                                       train_iter,
                                       device_num,
                                       with_probs=True,
                                       make_int=True,
                                       show_progress=True)

    if debug_mode:
        train_list = common.load_jsonl(config.FEVER_TRAIN)
        train_list = train_list[:50]
        set_gt_nli_label(train_list)
        train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')

        copied_dev_o_dict = copy.deepcopy(train_o_dict)
        copied_dev_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'oid', 'fid', check=True)

        print("Threshold 0.5:")
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_dev_o_dict, top_k=5, threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(
            copied_dev_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
            copied_dev_d_list, train_list, mode=mode, max_evidence=5)
        print(strict_score, acc_score, pr, rec, f1)

    common.save_jsonl(cur_eval_results_list,
                      f'{tag}_sent_results_labeled:{is_training}.jsonl')
Example 6
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'

    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False

    maxout_model = False

    num_class = 3
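    # Three-way NLI classification: SUPPORTS / REFUTES / NOT ENOUGH INFO.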

    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Data dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)
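    # (Each claim is paired with its top-k retrieved sentences above the
    # filtering probability to form the NLI inputs.)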

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        common.save_json(logging_item,
                         f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.json")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)

        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")