def eval_ensemble():
    """Ensemble several saved NLI prediction files and print FEVER dev metrics.

    Loads sentence-level retrieval results, rebuilds the dev NLI pairs,
    ensembles three prediction files via ``ensemble_nli_results``, attaches
    the ensembled ``predicted_label`` to a copy of the dev list, and scores
    it against the official FEVER dev set.
    """
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    filtering_prob = 0.01
    tag = 'dev'
    sent_top_k = 5

    sent_results = common.load_jsonl(sent_file)

    dev_fitems, dev_list = get_nli_pair(
        tag,
        is_training=False,
        sent_level_results_list=sent_results,
        debug=False,
        sent_top_k=sent_top_k,
        sent_filter_value=filtering_prob)

    # Three independently trained NLI model outputs to be ensembled.
    pred_file_list = [
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]
    pred_d_list = [common.load_jsonl(p) for p in pred_file_list]
    pred_list = ensemble_nli_results(pred_d_list)

    # Map predictions by 'oid' and merge their labels into a deep copy of
    # the dev list (keyed by 'id') so the originals are left untouched.
    results_dict = list_dict_data_tool.list_to_dict(pred_list, 'oid')
    merged_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(
        merged_dev_list, results_dict, 'id', 'predicted_label')

    gold_dev_list = common.load_jsonl(config.FEVER_DEV)
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        merged_dev_list, gold_dev_list, mode={'standard': True},
        max_evidence=5)

    print({
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    })
# --- Example #2 (separator artifact from scraped source) ---
def eval_nli():
    """Score one saved NLI dev prediction file against FEVER dev.

    Prints the standard FEVER metrics (strict score, label accuracy,
    evidence precision/recall/F1) and then a confusion matrix.
    """
    gold_dev_list = common.load_jsonl(config.FEVER_DEV)
    prediction_file = config.PRO_ROOT / "data/p_fever/non_sent_level/ema_i(32000)|e(4)|ss(0.5592059205920592)|ac(0.6104110411041104)|pr(0.2638851385138135)|rec(0.8928142814281428)|f1(0.4073667130110584)|seed(12)_dev_nli_results.json"
    pred_list = common.load_jsonl(prediction_file)

    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        pred_list, gold_dev_list, mode={'standard': True}, max_evidence=5)

    metrics = {
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }
    print(metrics)

    fever_scorer.fever_confusion_matrix(pred_list, gold_dev_list)
def evidence_adjustment(tag, sent_file, label_file, filter_prob=0.2, top_k=5):
    """Re-score saved NLI labels with re-filtered evidence and print metrics.

    Rebuilds the NLI pairs from ``sent_file`` using the given probability
    filter and top-k cut, merges the labels from ``label_file`` into a copy
    of the resulting list, and runs the standard FEVER scorer.

    :param tag: dataset split tag (e.g. 'dev').
    :param sent_file: jsonl file with sentence-level retrieval results.
    :param label_file: jsonl file with predicted labels (keyed by 'oid').
    :param filter_prob: minimum sentence probability to keep as evidence.
    :param top_k: maximum number of evidence sentences per claim.
    """
    sent_results = common.load_jsonl(sent_file)

    # The fitems are not needed here; only the assembled dev list is scored.
    _, dev_list = get_nli_pair(
        tag,
        is_training=False,
        sent_level_results_list=sent_results,
        debug=False,
        sent_top_k=top_k,
        sent_filter_value=filter_prob)

    label_results = common.load_jsonl(label_file)

    results_dict = list_dict_data_tool.list_to_dict(label_results, 'oid')
    merged_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(
        merged_dev_list, results_dict, 'id', 'predicted_label')

    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        merged_dev_list, dev_list, mode={'standard': True}, max_evidence=5)

    print({
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    })
# --- Example #4 (separator artifact from scraped source) ---
def model_go_with_old_data():
    """Train a BERT NLI classifier on FEVER using old-format (NSMN) sentence data.

    Converts old sentence-selection outputs, builds train/dev NLI pairs,
    trains ``BertMultiLayerSeqClassification`` with gradient accumulation,
    and every ``eval_frequency`` updates evaluates on dev with the FEVER
    scorer, saving predictions, a score log, and a model checkpoint.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v1_nli'
    lazy = False
    # lazy = True
    forward_size = 16  # per-step (micro) batch size actually fed to the model
    # batch_size = 64
    # batch_size = 192
    batch_size = 32  # effective batch size via gradient accumulation
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 3
    eval_frequency = 2000  # evaluate every this many optimizer updates
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    debug_mode = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps

    # Sentence-probability thresholds used to filter upstream evidence.
    train_sent_filtering_prob = 0.35
    dev_sent_filtering_prob = 0.1

    # dev_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json"
    # train_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl"
    # Convert old NSMN-format sentence scores into the format expected here.
    from utest.utest_format_converter_for_old_sent.tool import format_convert
    dev_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/4-15-dev_sent_pred_scores_old_format.jsonl"
    )
    train_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/train_sent_scores_old_format.jsonl"
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    # train_fitems_list = get_inference_pair('train', True, train_sent_results_file, debug_mode, train_sent_filtering_prob)
    dev_debug_num = 2481 if debug_mode else None
    dev_fitems_list, dev_list = get_inference_pair('dev', False,
                                                   dev_sent_results_file,
                                                   dev_debug_num,
                                                   dev_sent_filtering_prob)
    # = common.load_jsonl(config.FEVER_DEV)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    # Load the training pairs once up-front just to estimate dataset size;
    # they are re-sampled fresh at the start of every epoch below.
    train_debug_num = 2971 if debug_mode else None
    train_fitems_list, _ = get_inference_pair('train', True,
                                              train_sent_results_file,
                                              train_debug_num,
                                              train_sent_filtering_prob)
    est_datasize = len(train_fitems_list)

    # dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer,
                                        lazy,
                                        is_paired=True,
                                        query_l=64,
                                        example_filter=None,
                                        max_l=364,
                                        pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=False)
    #
    # Standard BERT fine-tuning setup: no weight decay on biases/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0  # counts forward/backward micro-steps
    update_step = 0       # counts actual optimizer updates

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        # Re-sample training pairs each epoch (sampling is stochastic).
        train_fitems_list, _ = get_inference_pair('train', True,
                                                  train_sent_results_file,
                                                  train_debug_num,
                                                  train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances,
                                         num_epochs=1,
                                         shuffle=False)

                    cur_eval_results_list = eval_model(model,
                                                       dev_iter,
                                                       device_num,
                                                       with_probs=True,
                                                       make_int=True)

                    results_dict = list_dict_data_tool.list_to_dict(
                        cur_eval_results_list, 'oid')
                    copied_dev_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_item_from_dict_to_list(
                        copied_dev_list, results_dict, 'id', 'predicted_label')

                    mode = {'standard': True}
                    # NOTE(review): the gold argument here is dev_fitems_list,
                    # while other eval paths in this file score against the
                    # raw dev list — confirm this is intentional.
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                        copied_dev_list,
                        dev_fitems_list,
                        mode=mode,
                        max_evidence=5)
                    logging_item = {
                        'ss': strict_score,
                        'ac': acc_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    # Encode the metrics into the checkpoint file name.
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                        f'|seed({seed})'

                    common.save_jsonl(
                        copied_dev_list,
                        Path(file_path_prefix) /
                        f"{save_file_name}_dev_nli_results.json")

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name,
                                                      logging_item)
                    logging_agent.logging_to_file(
                        Path(file_path_prefix) / "log.json")

                    # Unwrap DataParallel before saving the state dict.
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(),
                               str(output_model_file))
# --- Example #5 (separator artifact from scraped source) ---
def model_go_pure_aug():
    """Train BERT FEVER NLI baselines with QA data augmentation.

    Runs one full training pass per entry in the (repeated) ``some_params``
    list, each time mixing ``aug_size`` augmentation examples into the
    resampled FEVER training data, evaluating on dev every
    ``eval_iter_num`` updates and at the end of each epoch, and saving a
    checkpoint whenever the dev FEVER score improves.
    """
    # for some_params in [0.25, 0.25, 0.25]:
    # Repeated values intentionally rerun the same configuration
    # (presumably for variance across runs — seed is fixed, so confirm).
    for some_params in [0.25, 0.25, 0.25]:
        # bert_model_name = 'bert-large-uncased'
        seed = 6
        bert_model_name = 'bert-base-uncased'
        lazy = False
        forward_size = 16
        batch_size = 32
        gradient_accumulate_step = int(batch_size / forward_size)
        warmup_proportion = 0.1
        learning_rate = 5e-5
        num_train_epochs = 3
        do_ema = False  # exponential moving average of weights (disabled here)
        dev_prob_threshold = 0.1
        train_prob_threshold = 0.35
        debug_mode = False
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata"
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_the_same_gt_mrate({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_on_10p_aug_ratio({some_params})"
        experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_ratio({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_pure_aug"

        data_aug = True
        # data_aug_file = config.FEVER_DATA_ROOT / "qa_aug/squad_train_turker_groundtruth.json"
        # data_aug_size = int(21_015 * some_params)   # 10p
        # data_aug_size = int(208_346 * some_params)

        # training_file = config.FEVER_DATA_ROOT / "fever_1.0/train_10.jsonl"
        training_file = config.FEVER_DATA_ROOT / "fever_1.0/train.jsonl"

        train_sample_top_k = 8

        # est_datasize = 208_346    # full
        # est_datasize = 14_544
        # est_datasize = 21_015 + data_aug_size   # 10p
        # 208_346 is the full FEVER training set size; augmentation adds a
        # some_params fraction on top of it.
        aug_size = int(208_346 * some_params)
        est_datasize = 208_346 + aug_size
        # est_datasize = 208_346 + data_aug_size

        num_class = 3

        # num_train_optimization_steps
        torch.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()

        unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
        vocab = ExVocabulary(unk_token_num=unk_token_num)
        vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
        vocab.add_token_to_namespace('REFUTES', namespace='labels')
        vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
        vocab.add_token_to_namespace("hidden", namespace="labels")
        vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
        # Finished build vocabulary.

        # Load standardized sentence file
        dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
                                                   "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")
        dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_upstream_sent_list,
            prob_threshold=dev_prob_threshold, top_n=5)

        dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl", dev_sent_after_threshold_filter,
            None, tokenized=True)

        # print(dev_data_list[0])
        # exit(0)

        train_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
                                                     "upstream_sentence_selection_Feb16/train_sent_scores.jsonl")
        # Finished loading standardized sentence file.

        bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)

        bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

        dev_instances = bert_fever_reader.read(dev_data_list)

        biterator = BasicIterator(batch_size=forward_size)
        biterator.index_with(vocab)

        # print(list(mnli_dev_instances))

        # Load training model
        # Load training model
        model_clf = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_class)

        ema_tracker = None
        ema_model_copy = None
        if do_ema and ema_tracker is None:
            ema_tracker = EMA(model_clf.named_parameters(), on_cpu=True)
            ema_model_copy = copy.deepcopy(model_clf)

        model_clf.to(device)

        # No weight decay on biases/LayerNorm (standard BERT fine-tuning).
        param_optimizer = list(model_clf.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                       num_train_epochs

        print(num_train_optimization_steps)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)

        # optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate)

        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

        model_clf.train()

        if n_gpu > 1:
            model_clf = nn.DataParallel(model_clf)

        forbackward_step = 0  # micro-steps (forward/backward passes)
        update_step = 0       # optimizer updates
        eval_iter_num = 2_000  # Change this to real evaluation.
        best_fever_score = -1

        for n_epoch in range(num_train_epochs):
            print("Resampling...")
            # Re-sample training evidence each epoch (sampling is stochastic).
            train_sent_after_threshold_filter = \
                fever_ss_sampler.threshold_sampler_insure_unique(training_file,
                                                                 train_upstream_sent_list,
                                                                 train_prob_threshold,
                                                                 top_n=train_sample_top_k)
            #
            train_data_list = fever_nli_sampler.adv_simi_sample_with_prob_v1_1(
                training_file,
                train_sent_after_threshold_filter,
                None,
                tokenized=True)

            # Mix in a shuffled subset of augmentation examples.
            aug_d_list = []
            if data_aug:
                aug_d_list = get_sample_data(-1)
                random.shuffle(aug_d_list)
                aug_d_list = aug_d_list[:aug_size]

            train_data_list = train_data_list + aug_d_list

            random.shuffle(train_data_list)
            # train_data_list = get_sample_data(-1)
            print("Sample data length:", len(train_data_list))
            sampled_train_instances = bert_fever_reader.read(train_data_list)
            #
            train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1)

            for i, batch in enumerate(tqdm(train_iter)):
                paired_sequence = batch['paired_sequence']
                paired_segments_ids = batch['paired_segments_ids']
                labels_ids = batch['label']
                att_mask, _ = torch_util.get_length_and_mask(paired_sequence)

                paired_sequence = paired_sequence.to(device)
                paired_segments_ids = paired_segments_ids.to(device)
                labels_ids = labels_ids.to(device)
                att_mask = att_mask.to(device)

                loss = model_clf(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                                 labels=labels_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if gradient_accumulate_step > 1:
                    loss = loss / gradient_accumulate_step

                loss.backward()
                forbackward_step += 1

                if forbackward_step % gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    update_step += 1
                    if do_ema and ema_tracker is not None:
                        # if model_clf is DataParallel, then we use model_clf.module
                        model_to_track = model_clf.module if hasattr(model_clf,
                                                                     'module') else model_clf
                        ema_tracker(model_to_track.named_parameters())  # Whenever we do update, the do ema update

                    if update_step % eval_iter_num == 0:
                        print("Update steps:", update_step)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                        if do_ema and ema_model_copy is not None and ema_tracker is not None:
                            print("EMA evaluation.")
                            EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                            ema_model_copy.to(device)
                            if n_gpu > 1:
                                ema_model_copy = nn.DataParallel(ema_model_copy)
                            dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
                        else:
                            dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

                        eval_mode = {'check_sent_id_correct': True, 'standard': True}
                        fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list,
                                                                                         common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                                                                                         mode=eval_mode,
                                                                                         verbose=False)
                        print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

                        print(f"Dev:{fever_score}/{label_score}")

                        # Checkpoint only on dev FEVER-score improvement.
                        if best_fever_score < fever_score:
                            print("New Best FScore")
                            best_fever_score = fever_score

                            save_path = os.path.join(
                                file_path_prefix,
                                f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})'
                            )
                            model_to_save = model_clf.module if hasattr(model_clf,
                                                                        'module') else model_clf
                            output_model_file = os.path.join(file_path_prefix, save_path)
                            torch.save(model_to_save.state_dict(), output_model_file)

            # End-of-epoch evaluation (same procedure as the periodic one above).
            print("Update steps:", update_step)
            dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

            if do_ema and ema_model_copy is not None and ema_tracker is not None:
                print("EMA evaluation.")
                EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                ema_model_copy.to(device)
                if n_gpu > 1:
                    ema_model_copy = nn.DataParallel(ema_model_copy)
                dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
            else:
                dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

            eval_mode = {'check_sent_id_correct': True, 'standard': True}
            fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list,
                                                                             common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                                                                             mode=eval_mode,
                                                                             verbose=False)
            print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

            print(f"Dev:{fever_score}/{label_score}")

            if best_fever_score < fever_score:
                print("New Best FScore")
                best_fever_score = fever_score

                save_path = os.path.join(
                    file_path_prefix,
                    f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})'
                )
                model_to_save = model_clf.module if hasattr(model_clf,
                                                            'module') else model_clf
                output_model_file = os.path.join(file_path_prefix, save_path)
                torch.save(model_to_save.state_dict(), output_model_file)
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    """Document retrieval for FEVER combining keyword matching and TF-IDF.

    Builds keyword processors over the wiki title entity set (with a lower
    priority processor for disambiguation titles), matches each claim's
    query terms, then tops up with up to ``term_retrieval_top_k`` TF-IDF
    documents. Saves the per-claim retrieved doc ids to a jsonl file and
    prints retrieval statistics plus the FEVER doc-level score.

    :param term_retrieval_top_k: max TF-IDF documents added per claim.
    :param match_filtering_k: keyword-match filtering parameter passed to
        ``get_kw_matching_results``.
    :param tag: dataset split, one of 'dev', 'train', 'test'.
    """
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 20

    # term_retrieval_top_k = 3
    # match_filtering_k = 2

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Pre-computed TF-IDF retrieval results for this split.
    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )

    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []

    ner_set = get_title_entity_set()

    # Global term-score table used to keep only informative query n-grams.
    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

        # Keep only query n-grams that have a global TF-IDF score.
        query_terms = get_query_ngrams(query)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        # Top up with the best TF-IDF documents; scan a few extra (+3)
        # because some candidates get filtered out below.
        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']
        added_count = 0
        for score, title in sorted(tf_idf_doc_list,
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(
                    title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # We need to modify the existing retrieved document for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modify finished

    # print(r_list[0:10])
    # Report the distribution of retrieved-document counts per claim.
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))

    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list, f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )

    # Score document-level retrieval only (no sentence-level standard score).
    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
# --- Example #7 (separator artifact from scraped source) ---
def model_eval(model_save_path):
    """Evaluate a trained BERT NLI classifier on the FEVER dev set.

    Pipeline: load upstream sentence-selection scores, threshold-filter them
    into claim/evidence pairs, run the NLI model saved at ``model_save_path``
    over those pairs, save the predictions and print FEVER scores.  Before
    the model run, an upper-bound score is printed by copying gold labels
    onto the selected evidence (see the ``zip`` loop below).
    """
    seed = 6
    bert_model_name = 'bert-base-uncased'
    lazy = False
    forward_size = 16
    batch_size = 32  # NOTE(review): unused below; only forward_size is used.
    # dev_prob_threshold = 0.05
    dev_prob_threshold = 0.1  # min sentence-selection prob to keep evidence

    num_class = 3  # SUPPORTS / REFUTES / NOT ENOUGH INFO

    # num_train_optimization_steps
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    # Extra "hidden" label (index -2) marks examples whose gold label is
    # withheld at prediction time.
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')
    # Finished build vocabulary.

    # Load standardized sentence file
    # dev_upstream_sent_list = common.load_jsonl(config.RESULT_PATH /
    #                                            "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json")

    # dev_upstream_sent_list = common.load_jsonl(config.DATA_ROOT /
    # "utest_data/dev_sent_score_2_shared_task_dev.jsonl")
    # "utest_data/dev_sent_score_1_shared_task_dev_docnum(10)_ensembled.jsonl")

    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #                                            "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")

    dev_upstream_sent_list = common.load_jsonl(
        config.FEVER_DATA_ROOT /
        "upstream_sentence_selection_Feb16/4-15-dev_sent_pred_scores.jsonl")
    #
    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #                                            "upstream_sentence_selection_Feb16/4-15-test_sent_pred_scores.jsonl")

    # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
    #                                            "upstream_sentence_selection_Feb16/n_dev_sent_pred_scores.jsonl")

    # Keep at most top_n sentences per claim whose score clears the threshold.
    # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique_new_format(
    dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
        config.FEVER_DEV,
        dev_upstream_sent_list,
        prob_threshold=dev_prob_threshold,
        top_n=5)

    dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
        config.FEVER_DEV,
        dev_sent_after_threshold_filter,
        None,
        tokenized=True)

    # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
    #     config.FEVER_TEST,
    #     dev_upstream_sent_list,
    #     prob_threshold=dev_prob_threshold, top_n=5)
    #
    # dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
    #     config.FEVER_TEST, dev_sent_after_threshold_filter,
    #     None, tokenized=True, pipeline=True)

    # Hide the gold labels so the model must predict them.
    for item in dev_data_list:
        item['label'] = 'hidden'

    dev_list = common.load_jsonl(config.FEVER_DEV)

    # Upper-bound sanity check: copy each claim's gold label onto its
    # selected evidence and score.  NOTE(review): zip() assumes dev_list and
    # dev_data_list are aligned one-to-one in the same order -- confirm.
    for a, b in zip(dev_list, dev_data_list):
        del b['label']
        b['predicted_label'] = a['label']

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
        dev_data_list, dev_list, mode=eval_mode, verbose=False)
    print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score,
          label_score, pr, rec, f1)
    print(f"Dev:{fever_score}/{label_score}")

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=True)

    bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

    dev_instances = bert_fever_reader.read(dev_data_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    # print(list(mnli_dev_instances))

    # Load the trained model weights.
    model_clf = BertForSequenceClassification.from_pretrained(
        bert_model_name, num_labels=num_class)

    model_clf.load_state_dict(torch.load(model_save_path))

    model_clf.to(device)

    model_clf.eval()

    if n_gpu > 1:
        model_clf = nn.DataParallel(model_clf)

    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

    # for item in dev_data_list:

    # Run the model; hidden_eval fills in 'predicted_label' for each item.
    dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

    common.save_jsonl(
        dev_data_list, config.PRO_ROOT /
        "data/fever/upstream_sentence_selection_Feb16/4-15-dev_nli_results.jsonl"
    )

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
        dev_data_list,
        common.load_jsonl(config.FEVER_DEV),
        mode=eval_mode,
        verbose=False)
    print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score,
          label_score, pr, rec, f1)

    print(f"Dev:{fever_score}/{label_score}")
# --- Example #8 ---
def eval_trainset_for_train_nli(model_path):
    """Score candidate evidence sentences with a trained BERT selector.

    Loads the sentence-selection model from ``model_path``, runs it over the
    split selected by the hard-coded ``tag``, and saves per-sentence
    probability results to ``{tag}_sent_results_labeled:{is_training}.jsonl``
    for downstream NLI training/evaluation.  In debug mode it additionally
    sanity-checks the scores against gold evidence with the FEVER scorer.
    """
    tag = 'test'
    is_training = False

    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    lazy = False
    forward_size = 128
    batch_size = 128

    do_lower_case = True

    debug_mode = False

    num_class = 1  # single sigmoid output: P(sentence is evidence)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_fitems_list = get_sentences(tag,
                                      is_training=is_training,
                                      debug=debug_mode)
    est_datasize = len(train_fitems_list)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    # example_filter drops pairs with an empty context (no evidence text).
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_path))

    print("Estimated training size", est_datasize)
    print("Estimated forward steps:", est_datasize / forward_size)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_instance = bert_cs_reader.read(train_fitems_list)
    train_iter = biterator(train_instance, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model,
                                       train_iter,
                                       device_num,
                                       with_probs=True,
                                       make_int=True,
                                       show_progress=True)

    if debug_mode:
        # Sanity check on the first 50 training claims: attach predicted
        # evidence and score it against gold with the FEVER scorer.
        train_list = common.load_jsonl(config.FEVER_TRAIN)
        train_list = train_list[:50]
        set_gt_nli_label(train_list)
        train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')

        copied_dev_o_dict = copy.deepcopy(train_o_dict)
        copied_dev_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'oid', 'fid', check=True)

        # Bug fix: the message used to say "Threshold 0.5:" (and the variable
        # was named ..._th0_5) while the selection below uses threshold=0.1.
        print("Threshold 0.1:")
        cur_results_dict_th0_1 = select_top_k_and_to_results_dict(
            copied_dev_o_dict, top_k=5, threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(
            copied_dev_d_list, cur_results_dict_th0_1, 'id',
            'predicted_evidence')
        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
            copied_dev_d_list, train_list, mode=mode, max_evidence=5)
        print(strict_score, acc_score, pr, rec, f1)

    common.save_jsonl(cur_eval_results_list,
                      f'{tag}_sent_results_labeled:{is_training}.jsonl')
# --- Example #9 ---
def model_go():
    """Train the BERT sentence-level content selector for FEVER.

    Each epoch re-samples negatives at ``pos_ratio`` and trains with gradient
    accumulation.  The model is evaluated on dev both every
    ``eval_frequency`` optimizer updates and at the end of each epoch; every
    evaluation logs FEVER scores at selection thresholds 0.5 and 0.1, saves
    the raw sentence scores, and checkpoints the model under a name encoding
    the current scores.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v0_cs_ratio_001'
    lazy = True
    forward_size = 128
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 20000
    pos_ratio = 0.01  # positive:negative ratio used when down-sampling
    do_lower_case = True

    debug_mode = False

    num_class = 1  # single sigmoid output: P(sentence is evidence)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    dev_list = common.load_jsonl(config.FEVER_DEV)
    set_gt_nli_label(dev_list)

    dev_fitems_list = get_sentences('dev', is_training=False, debug=debug_mode)
    train_fitems_list = get_sentences('train',
                                      is_training=True,
                                      debug=debug_mode)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    # example_filter drops pairs with an empty context (no evidence text).
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    # Standard BERT fine-tuning setup: no weight decay on biases / LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # Save a copy of this script next to the experiment outputs for
        # reproducibility.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()

    def run_dev_eval(results_ext):
        """Evaluate on dev, log FEVER scores, and checkpoint the model.

        Extracted from previously duplicated mid-epoch / end-of-epoch code.
        ``results_ext`` preserves the original file naming: '.json' for
        mid-epoch evals, '.jsonl' at epoch end.  Reads ``update_step`` and
        ``epoch_i`` from the enclosing scope at call time.
        """
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           dev_iter,
                                           device_num,
                                           with_probs=True,
                                           make_int=True)
        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list,
            copied_dev_o_dict,
            'oid',
            'fid',
            check=True)

        # Score evidence selection at both probability thresholds.
        scores = {}
        for score_key, threshold in (('score_05', 0.5), ('score_01', 0.1)):
            print(f"Threshold {threshold}:")
            cur_results_dict = select_top_k_and_to_results_dict(
                copied_dev_o_dict, top_k=5, threshold=threshold)
            list_dict_data_tool.append_item_from_dict_to_list(
                copied_dev_d_list, cur_results_dict, 'id',
                'predicted_evidence')
            mode = {'standard': True, 'check_sent_id_correct': True}
            strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                copied_dev_d_list, dev_list, mode=mode, max_evidence=5)
            scores[score_key] = {
                'ss': strict_score,
                'as': acc_score,
                'pr': pr,
                'rec': rec,
                'f1': f1,
            }

        logging_item = {
            'score_01': scores['score_01'],
            'score_05': scores['score_05'],
        }

        print(logging_item)

        s01_ss_score = scores['score_01']['ss']
        s05_ss_score = scores['score_05']['ss']

        save_file_name = f'i({update_step})|e({epoch_i})' \
            f'|s01({s01_ss_score})|s05({s05_ss_score})' \
            f'|seed({seed})'

        common.save_jsonl(
            cur_eval_results_list,
            Path(file_path_prefix) /
            f"{save_file_name}_dev_sent_results{results_ext}")

        logging_agent.incorporate_results({}, save_file_name, logging_item)
        logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # Re-sample negatives each epoch so the model sees fresh negatives.
        sampled_train_list = down_sample_neg(train_fitems_list,
                                             ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    run_dev_eval(results_ext='.json')

        # End-of-epoch evaluation.
        print("Update steps:", update_step)
        run_dev_eval(results_ext='.jsonl')
# --- Example #10 ---
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    """Ablation evaluation of a trained FEVER NLI model.

    Args:
        model_path: path to the saved NLI model state_dict.
        filter_value: sentence-level probability threshold used to filter
            upstream evidence for the dev set.
        top_k_sent: keep at most this many evidence sentences per claim.

    With the hard-coded ``tag == 'dev'`` branch the function saves the raw
    label results, the merged prediction list, and a JSON summary of FEVER
    scores; the ``'test'`` branch saves predictions without scoring.
    """
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'

    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'  # context-query pair ordering for the reader
    debug_mode = False

    maxout_model = False  # False -> pooled classifier; True -> maxout matcher

    num_class = 3  # SUPPORTS / REFUTES / NOT ENOUGH INFO

    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Data dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    # Build claim/evidence NLI pairs from the thresholded sentence results.
    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2  # NOTE(review): set but unused in this function.

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        # Merge predicted labels back into a copy of the original dev items.
        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        # NOTE(review): this writes a single JSON object despite the
        # '.jsonl' extension in the filename.
        common.save_json(logging_item,
                         f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)

        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        # Test split has no gold labels, so only the merged predictions are
        # saved -- no scoring here.
        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")
# --- Example #11 ---
def model_go(th_filter_prob=0.2, top_k_sent=5):
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    # lazy = True
    forward_size = 32
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    # schedule_type = 'warmup_constant'
    # 'warmup_cosine': warmup_cosine,
    # 'warmup_constant': warmup_constant,
    # 'warmup_linear': warmup_linear,
    schedule_type = 'warmup_linear'
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 4000
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    # debug_mode = True
    debug_mode = False
    do_ema = True

    maxout_model = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps
    top_k = top_k_sent

    train_sent_filtering_prob = th_filter_prob
    dev_sent_filtering_prob = th_filter_prob
    experiment_name = f'fever_v2_nli_th{train_sent_filtering_prob}_tk{top_k}'

    # Data dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    train_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    train_fitems, train_list = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list, debug=debug_mode,
                                            sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        train_list = train_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)
    print("Do EMA:", do_ema)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps,
                         schedule=schedule_type)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list, debug=debug_mode,
                                            sent_top_k=5, sent_filter_value=train_sent_filtering_prob)

        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            if not maxout_model:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                             labels=labels_ids)
            else:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             s1_span=s1_span, s2_span=s2_span,
                             mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                             labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                    #
                    # cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                    #                                    feed_input_span=maxout_model)
                    #
                    # ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    # copied_dev_list = copy.deepcopy(dev_list)
                    # list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                    #                                                   'id', 'predicted_label')
                    #
                    # mode = {'standard': True}
                    # strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                    #                                                                 mode=mode, max_evidence=5)
                    # logging_item = {
                    #     'ss': strict_score, 'ac': acc_score,
                    #     'pr': pr, 'rec': rec, 'f1': f1,
                    # }
                    #
                    # if not debug_mode:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #         f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                    #         f'|seed({seed})'
                    #
                    #     common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                    #                       f"{save_file_name}_dev_nli_results.json")
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        ema_device_num = 0
                        ema_model = ema_model.to(device)
                        ema_model = torch.nn.DataParallel(ema_model)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_ema_eval_results_list = eval_model(ema_model, dev_iter, ema_device_num, with_probs=True,
                                                               make_int=True,
                                                               feed_input_span=maxout_model)

                        ema_results_dict = list_dict_data_tool.list_to_dict(cur_ema_eval_results_list, 'oid')
                        copied_dev_list = copy.deepcopy(dev_list)
                        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                                          'id', 'predicted_label')

                        mode = {'standard': True}
                        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                                        mode=mode, max_evidence=5)
                        ema_logging_item = {
                            'label': 'ema',
                            'ss': strict_score, 'ac': acc_score,
                            'pr': pr, 'rec': rec, 'f1': f1,
                        }

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                                f'|seed({seed})'

                            common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                                              f"{save_file_name}_dev_nli_results.json")

                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name, ema_logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))