def full_eval_model(model, data_iter, criterion, dev_data_list):
    # Label mapping: SUPPORTS -> 0, REFUTES -> 1, NOT ENOUGH INFO -> 2

    id2label = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT ENOUGH INFO"}

    print("Evaluating ...")
    model.eval()
    n_correct = loss = 0
    total_size = 0

    y_pred_list = []
    y_true_list = []
    y_id_list = []

    with torch.no_grad():  # Disable gradient tracking during evaluation.

        for batch_idx, batch in enumerate(data_iter):
            out = model(batch)
            y = batch['label']
            y_id_list.extend(list(batch['pid']))

            n_correct += (torch.max(out,
                                    1)[1].view(y.size()) == y).sum().item()

            y_pred_list.extend(torch.max(out, 1)[1].view(y.size()).tolist())
            y_true_list.extend(y.tolist())

            loss += criterion(out, y).item() * y.size(0)
            total_size += y.size(0)

        assert len(y_id_list) == len(dev_data_list)
        assert len(y_pred_list) == len(dev_data_list)
        assert len(y_true_list) == len(dev_data_list)

        for i in range(len(dev_data_list)):
            assert str(y_id_list[i]) == str(dev_data_list[i]['id'])
            # Matching id

            dev_data_list[i]['predicted_label'] = id2label[y_pred_list[i]]
            # Fall back to NEI when no evidence sentences were retrieved.
            if len(dev_data_list[i]['predicted_sentids']) == 0:
                dev_data_list[i]['predicted_label'] = "NOT ENOUGH INFO"

                # dev_data_list[i]['predicted_evidence'] = convert_evidence2scoring_format(dev_data_list[i]['predicted_sentids'])

        print('n_correct:', n_correct)
        print('total_size:', total_size)

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            dev_data_list, dev_data_list, mode=eval_mode, verbose=False)
        print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score,
              acc_score, pr, rec, f1)

        avg_acc = 100. * n_correct / total_size  # label accuracy (computed for reference; not returned)
        avg_loss = loss / total_size

    return strict_score, avg_loss
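# Micro-demo (illustrative, not part of the original script) of the
# prediction and accuracy bookkeeping used in full_eval_model above.
import torch

out = torch.tensor([[2.0, 0.1, 0.3],    # argmax 0 -> SUPPORTS
                    [0.2, 1.5, 0.1]])   # argmax 1 -> REFUTES
y = torch.tensor([0, 2])
pred = torch.max(out, 1)[1]             # predicted class index per example
print(pred.tolist())                            # [0, 1]
print((pred.view(y.size()) == y).sum().item())  # 1 correct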
def used_func_for_fast_key_word_matching_expanded_kw():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
    #
    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False)

    # exit(-2)

    # Build the flashtext keyword processor from the keyword dict.
    build_flashtext_processor_wit(keyword_processor, id_to_key_dict)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluation.
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
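# Minimal self-contained sketch of the flashtext matching used above (an
# illustration, not part of the original pipeline): each Wikipedia title is
# registered as a keyword whose clean doc id is returned on a surface match.
from flashtext import KeywordProcessor

_kp = KeywordProcessor(case_sensitive=True)
_kp.add_keyword("New York", "New_York")            # surface form -> doc id
_kp.add_keyword("New York City", "New_York_City")

# flashtext prefers the longest match, so this prints ['New_York_City'].
print(_kp.extract_keywords("She moved to New York City in 2015."))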
def conduct_search(r_list, eval_list):
    best_strict_score = -1
    best_acc_score = -1
    max_combo_size = 4  # search over all ensembles of up to 4 systems
    for count in range(1, max_combo_size + 1):
        combo = itertools.combinations(r_list, count)
        for combo_list in combo:
            test_list = [x[1] for x in combo_list]
            test_ind = [x[0] for x in combo_list]
            nli_results = merge_nli_results(test_list)
            eval_mode = {'standard': True}
            delete_unused_evidence(nli_results)
            strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                nli_results, eval_list, mode=eval_mode, verbose=False)

            if best_acc_score < acc_score:
                best_acc_score = acc_score
                print('-' * 50)
                print("Best Acc:", best_acc_score)
                print("Best Acc Ind:", test_ind)
                print('-' * 50)

            if best_strict_score < strict_score:
                best_strict_score = strict_score
                print('-' * 50)
                print("Best sAcc:", strict_score)
                print("Best sAcc Ind:", test_ind)
                print('-' * 50)
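# Self-contained sketch of the combination-search pattern in conduct_search,
# with a toy scorer standing in for merge_nli_results + c_scorer.fever_score.
import itertools

def _demo_search(r_list, score_fn, max_combo=4):
    # Try every ensemble of up to max_combo result sets, keep the best score.
    best_score, best_inds = -1, None
    for count in range(1, max_combo + 1):
        for combo in itertools.combinations(r_list, count):
            score = score_fn([x[1] for x in combo])
            if score > best_score:
                best_score, best_inds = score, [x[0] for x in combo]
    return best_score, best_inds

# Toy per-system scores; the scorer here is just the ensemble mean.
print(_demo_search([(0, 0.55), (1, 0.60), (2, 0.58)],
                   lambda xs: sum(xs) / len(xs)))  # -> (0.6, [1])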
def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Write this in a for loop to keep track of the progress
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))

        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluation.
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
def spectrum_eval_manual_check():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"

    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"
    dev_sent_result_list = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL, dev_sent_result_list, scale_prob=sc_prob,
                                                   delete_prob=False)
        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")
def used_func_for_fast_key_word_matching_prioritized_kw_resample():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    # path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    # drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    # tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    # doc_tokens, doc_lemmas = parse_doc_id('Hourglass_-LRB-James_Taylor_album-RRB-', tok)
    # print(doc_tokens)
    # print(doc_lemmas)
    # print(get_words_inside_parenthese(doc_tokens))
    # print(get_words_inside_parenthese(doc_lemmas))
    # claim_t = ['album']
    # claim_l = ['album']
    # print(check_inside_paretheses_overlap(doc_tokens, doc_lemmas, claim_t, claim_l))
    # exit(-1)

    #
    # keyword_processor = KeywordProcessor(case_sensitive=True)
    #
    # id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl", filtering=True)
    #
    # exact_match_rule_dict = set_priority(id_to_key_dict, priority=5.0)
    # print(len(exact_match_rule_dict))
    #
    # noisy_key_dict = id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=True)
    # noisy_parenthese_rule_dict = set_priority(noisy_key_dict, priority=1.0)
    # print("Noisy_Parenthese_Rule_Dict:", len(noisy_parenthese_rule_dict))

    # exit(-2)

    # Write this in a for loop to keep track of the progress
    # build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, exact_match_rule_dict)
    # build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, noisy_parenthese_rule_dict)

    # Load data for predicting
    # d_list = load_data(config.FEVER_TRAIN_JSONL)
    # d_list = load_data(config.FEVER_DEV_JSONL)
    # sample_answer_with_priority(d_list, tok, keyword_processor, top_k=5)

    # Save the results for evaluation.
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "train.jsonl"
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    out_fname = '/Users/Eason/RA/FunEver/results/sent_retri/2018_07_05_17:17:50_r/dev.jsonl'
    d_list = load_data(out_fname)
    resample_answer_with_priority(d_list, top_k=5)
    # save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
def error_analysis(out_fname):
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(
        fever_score(d_list,
                    d_list,
                    mode=eval_mode,
                    verbose=True,
                    error_analysis_file=Path(out_fname).parent /
                    "analysis.log"))
def prepare_data_only_page_view(tokenized_file, eval_file,
                                doc_retrieval_output_file):
    """
    This method prepares document-retrieval data using only page views.
    :return:
    """
    doc_retrieval_method = 'pageview'
    print("Method:", doc_retrieval_method)

    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    doc_retrieval_result_list = first_doc_retrieval(
        haonan_docretri_object,
        tokenized_file,
        method=doc_retrieval_method,
        top_k=100)
    eval_list = common.load_jsonl(eval_file)

    disamb.item_resorting(doc_retrieval_result_list)

    print("Evaluating 1st Doc Retrieval")
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    print(
        c_scorer.fever_score(doc_retrieval_result_list,
                             eval_list,
                             mode=eval_mode,
                             verbose=False))
    for max_doc in (5, 10, 15, 20):
        print(
            f"Max_doc_num_{max_doc}:",
            c_scorer.fever_doc_only(doc_retrieval_result_list,
                                    eval_list,
                                    max_evidence=max_doc))
    # First Document retrieval End.
    common.save_jsonl(doc_retrieval_result_list, doc_retrieval_output_file)
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-08-19:04:33_mesim_elmo/i(39700)_epoch(6)_dev(0.5251525152515252)_loss(1.5931938096682707)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
def tf_idf_select_sentence():
    db_cursor = fever_db.get_cursor()
    loaded_path = "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/dev.jsonl"
    d_list = load_data(loaded_path)
    # d_list = load_data("/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl")

    for item in tqdm(d_list):
        # print()
        p_docids = item['predicted_docids']
        cleaned_claim = ' '.join(easy_tokenize(item['claim']))
        # print(cleaned_claim)

        current_sent_list = []
        current_id_list = []
        for doc_id in p_docids:
            r_list, id_list = fever_db.get_all_sent_by_doc_id(
                db_cursor, doc_id)
            current_sent_list.extend(r_list)
            current_id_list.extend(id_list)

        Args = namedtuple('Args', 'ngram hash_size num_workers')

        args = Args(ngram=2, hash_size=8192, num_workers=4)

        ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                      current_sent_list)

        selected_index, selected_score = ranker.closest_docs(cleaned_claim,
                                                             k=5)

        selected_sent_id = []
        for ind in selected_index:
            current_selected = current_id_list[ind]
            doc_id, ln = current_selected.split('(-.-)')
            # ln = int(ln)
            # selected_sent_id.append([doc_id, ln])
            selected_sent_id.append(doc_id + c_scorer.SENT_LINE + ln)

        item['predicted_sentids'] = selected_sent_id

    eval_mode = {'check_sent_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))

    out_fname = config.RESULT_PATH / "sent_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list,
                              out_filename=out_fname,
                              last_loaded_path=loaded_path)
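# Illustrative stand-in for the OnlineTfidfDocRanker used above (assumed
# interface): rank candidate sentences against the claim by tf-idf cosine
# similarity, here with scikit-learn instead of the hashed bigram ranker.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def _tfidf_top_k(claim, sentences, k=5):
    vec = TfidfVectorizer(ngram_range=(1, 2))
    mat = vec.fit_transform([claim] + sentences)      # row 0 is the claim
    sims = linear_kernel(mat[0:1], mat[1:]).ravel()   # cosine on l2-normed tf-idf
    top = sims.argsort()[::-1][:k]
    return list(top), sims[top].tolist()

print(_tfidf_top_k("Albert Einstein developed relativity.",
                   ["Einstein developed the theory of relativity.",
                    "The Eiffel Tower is in Paris.",
                    "Relativity was proposed by Albert Einstein."], k=2))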
def hidden_eval_fever_adv_v1():
    batch_size = 64
    lazy = True
    dev_prob_threshold = 0.5

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-20-22:28:24_mesim_wn_450_adv_sample_v1_|t_prob:0.35|top_k:8/i(46000)_epoch(7)_dev(0.6405140514051405)_loss(1.0761665150348825)_seed(12)"

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/2018_07_20_15:17:59_r/dev_sent.jsonl")

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_sent_list,
                                               scale_prob=dev_prob_threshold,
                                               delete_prob=False)

    dev_fever_data_reader = WNReader(token_indexers=token_indexers,
                                     lazy=lazy,
                                     wn_p_dict=p_dict,
                                     max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL,
                                                 upstream_dev_list)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden',
                                               -2,
                                               namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900,
        embedding_dim=300,
        max_l=300)

    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances,
                          shuffle=False,
                          num_epochs=1,
                          cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter,
                                   complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    common.save_jsonl(
        builded_dev_data,
        config.RESULT_PATH / "nli_results" / "pipeline_results_1.jsonl")
    c_scorer.delete_label(builded_dev_data)
    print(
        c_scorer.fever_score(builded_dev_data,
                             common.load_jsonl(config.FEVER_DEV_JSONL),
                             mode=eval_mode))
def analysis_model(model_path):
    batch_size = 32
    lazy = True
    train_prob_threshold = 0.02
    train_sample_top_k = 8
    dev_prob_threshold = 0.1
    dev_sample_top_k = 5

    neg_sample_upper_prob = 0.006
    decay_r = 0.002

    top_k_doc = 5
    dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_doc_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)

    print("Dev size:", len(complete_upstream_dev_data))

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    # Data Reader
    dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers,
                                        lazy=lazy,
                                        max_l=260)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")

    vocab.add_token_to_namespace('true', namespace='labels')
    vocab.add_token_to_namespace('false', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    # Reader and prepare end

    # vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data)
    # vc_ss_training_sampler.show_info()

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300,
                  max_l=300)

    print("Model Max length:", model.max_l)

    model.display()
    model.to(device)

    cloned_empty_model = copy.deepcopy(model)

    load_ema_to_model(cloned_empty_model, model_path)

    vc_ss.data_wrangler.assign_task_label(complete_upstream_dev_data, 'ss')
    dev_ss_instance = dev_fever_data_reader.read(complete_upstream_dev_data)
    eval_ss_iter = biterator(dev_ss_instance, num_epochs=1, shuffle=False)
    scored_dev_sent_data = hidden_eval_ss(cloned_empty_model, eval_ss_iter,
                                          complete_upstream_dev_data)

    common.save_jsonl(scored_dev_sent_data, "dev_scored_sent_data.jsonl")
    # for vc
    filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
        config.T_FEVER_DEV_JSONL, scored_dev_sent_data, dev_prob_threshold,
        dev_sample_top_k)
    common.save_jsonl(filtered_dev_list,
                      "dev_scored_sent_data_after_sample.jsonl")

    dev_selection_dict = paired_selection_score_dict(scored_dev_sent_data)
    ready_dev_list = select_sent_with_prob_for_eval(config.T_FEVER_DEV_JSONL,
                                                    filtered_dev_list,
                                                    dev_selection_dict,
                                                    tokenized=True)

    vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc')
    dev_vc_instance = dev_fever_data_reader.read(ready_dev_list)
    eval_vc_iter = biterator(dev_vc_instance, num_epochs=1, shuffle=False)
    eval_dev_result_list = hidden_eval_vc(cloned_empty_model, eval_vc_iter,
                                          ready_dev_list)

    common.save_jsonl(eval_dev_result_list, "dev_nli_results.jsonl")

    # Scoring
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
        eval_dev_result_list,
        common.load_jsonl(config.T_FEVER_DEV_JSONL),
        mode=eval_mode,
        verbose=False)
    print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score,
          acc_score, pr, rec, f1)

    print(f"Dev:{strict_score}/{acc_score}")
def train_fever_std_ema_v1(resume_model=None, do_analysis=False):
    """
    This method is created on 26 Nov 2018 08:50 with the purpose of training vc and ss all together.
    :param resume_model: optional checkpoint path to resume training from.
    :param do_analysis: if True, dump intermediate sampling/eval data for analysis.
    :return:
    """

    num_epoch = 200
    seed = 12
    batch_size = 32
    lazy = True
    train_prob_threshold = 0.02
    train_sample_top_k = 8
    dev_prob_threshold = 0.1
    dev_sample_top_k = 5
    top_k_doc = 5

    schedule_sample_dict = defaultdict(lambda: 0.1)

    ratio_ss_for_vc = 0.2

    schedule_sample_dict.update({
        0: 0.1,
        1: 0.1,  # 200k + 400K
        2: 0.1,
        3: 0.1,  # 200k + 200k ~ 200k + 100k
        4: 0.1,
        5: 0.1,  # 200k + 100k
        6: 0.1  # 20k + 20k
    })

    # Eval at beginning of the training.
    eval_full_epoch = 1
    eval_nei_epoches = [2, 3, 4, 5, 6, 7]

    neg_only = False
    debug = False

    experiment_name = f"vc_ss_v17_ratio_ss_for_vc:{ratio_ss_for_vc}|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}_scheduled_neg_sampler"
    # resume_model = None

    print("Do EMA:")

    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    # Get upstream sentence document retrieval data
    dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_doc_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)

    complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL,
                                                 train_doc_upstream_file,
                                                 pred=False,
                                                 top_k=top_k_doc)
    if debug:
        complete_upstream_dev_data = complete_upstream_dev_data[:1000]
        complete_upstream_train_data = complete_upstream_train_data[:1000]

    print("Dev size:", len(complete_upstream_dev_data))
    print("Train size:", len(complete_upstream_train_data))

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    # Data Reader
    dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers,
                                        lazy=lazy,
                                        max_l=260)
    train_fever_data_reader = VCSS_Reader(token_indexers=token_indexers,
                                          lazy=lazy,
                                          max_l=260)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")

    vocab.add_token_to_namespace('true', namespace='labels')
    vocab.add_token_to_namespace('false', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    # Reader and prepare end

    vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data)
    vc_ss_training_sampler.show_info()

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300,
                  max_l=300,
                  num_of_class=4)

    print("Model Max length:", model.max_l)
    if resume_model is not None:
        model.load_state_dict(torch.load(resume_model))
    model.display()
    model.to(device)

    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()

    analysis_dir = None
    if do_analysis:
        analysis_dir = Path(file_path_prefix) / "analysis_aux"
        analysis_dir.mkdir()
    # Save source code end.

    # Starting parameter setup
    best_dev = -1
    iteration = 0

    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()
    # parameter setup end

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # For training: sample candidate data for vc from the ss results.
        # This needs to be done after each epoch.
        if i_epoch == eval_full_epoch:  # only eval at 1
            print("We now need to eval the whole training set.")
            print("Be patient and hope good luck!")
            load_ema_to_model(cloned_empty_model, ema)
            eval_sent_for_sampler(cloned_empty_model, token_indexers, vocab,
                                  vc_ss_training_sampler)

        elif i_epoch in eval_nei_epoches:  # at 2, 3, 4 eval for NEI
            print("We now need to eval the NEI training set.")
            print("Be patient and hope good luck!")
            load_ema_to_model(cloned_empty_model, ema)
            eval_sent_for_sampler(cloned_empty_model,
                                  token_indexers,
                                  vocab,
                                  vc_ss_training_sampler,
                                  nei_only=True)

        train_data_with_candidate_sample_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
            config.T_FEVER_TRAIN_JSONL, vc_ss_training_sampler.sent_list,
            train_prob_threshold, train_sample_top_k)
        # We initialize the prob for each sentence so the sampler can work, but we still need to run the model on the dev data.

        train_selection_dict = paired_selection_score_dict(
            vc_ss_training_sampler.sent_list)

        cur_train_vc_data = adv_simi_sample_with_prob_v1_1(
            config.T_FEVER_TRAIN_JSONL,
            train_data_with_candidate_sample_list,
            train_selection_dict,
            tokenized=True)

        if do_analysis:
            # Customized analysis output
            common.save_jsonl(
                vc_ss_training_sampler.sent_list, analysis_dir /
                f"E_{i_epoch}_whole_train_sent_{save_tool.get_cur_time_str()}.jsonl"
            )
            common.save_jsonl(
                train_data_with_candidate_sample_list, analysis_dir /
                f"E_{i_epoch}_sampled_train_sent_{save_tool.get_cur_time_str()}.jsonl"
            )
            common.save_jsonl(
                cur_train_vc_data, analysis_dir /
                f"E_{i_epoch}_train_vc_data_{save_tool.get_cur_time_str()}.jsonl"
            )

        print(f"E{i_epoch} VC_data:", len(cur_train_vc_data))

        # This is for sample negative candidate data for ss
        # After sampling, we decrease the ratio.
        neg_sample_upper_prob = schedule_sample_dict[i_epoch]
        print("Neg Sampler upper rate:", neg_sample_upper_prob)
        # print("Rate decreasing")
        # neg_sample_upper_prob -= decay_r
        neg_sample_upper_prob = max(0.000, neg_sample_upper_prob)

        cur_train_ss_data = vc_ss_training_sampler.sample_for_ss(
            neg_only=neg_only, upper_prob=neg_sample_upper_prob)

        if i_epoch >= 1:  # from epoch 1 onward, balance pos and neg examples for selection
            # new_ss_data = []
            pos_ss_data = []
            neg_ss_data = []
            for item in cur_train_ss_data:
                if item['selection_label'] == 'true':
                    pos_ss_data.append(item)
                elif item['selection_label'] == 'false':
                    neg_ss_data.append(item)

            ss_sample_size = min(len(pos_ss_data), len(neg_ss_data))
            random.shuffle(pos_ss_data)
            random.shuffle(neg_ss_data)
            cur_train_ss_data = pos_ss_data[:int(
                ss_sample_size * 0.5)] + neg_ss_data[:ss_sample_size]
            random.shuffle(cur_train_ss_data)

        vc_ss_training_sampler.show_info(cur_train_ss_data)
        print(f"E{i_epoch} SS_data:", len(cur_train_ss_data))

        vc_ss.data_wrangler.assign_task_label(cur_train_ss_data, 'ss')
        vc_ss.data_wrangler.assign_task_label(cur_train_vc_data, 'vc')

        vs_ss_train_list = cur_train_ss_data + cur_train_vc_data
        random.shuffle(vs_ss_train_list)
        print(f"E{i_epoch} Total ss+vc:", len(vs_ss_train_list))
        vc_ss_instance = train_fever_data_reader.read(vs_ss_train_list)

        train_iter = biterator(vc_ss_instance, shuffle=True, num_epochs=1)

        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)

            if i_epoch >= 1:
                ratio_ss_for_vc = 0.8

            loss = compute_mixing_loss(
                model,
                out,
                batch,
                criterion,
                vc_ss_training_sampler,
                ss_for_vc_prob=ratio_ss_for_vc)  # Important change

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update
            ema(model.named_parameters())

            if i_epoch < 9:
                mod = 10000
                # mod = 100
            else:
                mod = 2000

            if iteration % mod == 0:

                # This is the code for eval:
                load_ema_to_model(cloned_empty_model, ema)

                vc_ss.data_wrangler.assign_task_label(
                    complete_upstream_dev_data, 'ss')
                dev_ss_instance = dev_fever_data_reader.read(
                    complete_upstream_dev_data)
                eval_ss_iter = biterator(dev_ss_instance,
                                         num_epochs=1,
                                         shuffle=False)
                scored_dev_sent_data = hidden_eval_ss(
                    cloned_empty_model, eval_ss_iter,
                    complete_upstream_dev_data)

                # for vc
                filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
                    config.T_FEVER_DEV_JSONL, scored_dev_sent_data,
                    dev_prob_threshold, dev_sample_top_k)

                dev_selection_dict = paired_selection_score_dict(
                    scored_dev_sent_data)
                ready_dev_list = select_sent_with_prob_for_eval(
                    config.T_FEVER_DEV_JSONL,
                    filtered_dev_list,
                    dev_selection_dict,
                    tokenized=True)

                vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc')
                dev_vc_instance = dev_fever_data_reader.read(ready_dev_list)
                eval_vc_iter = biterator(dev_vc_instance,
                                         num_epochs=1,
                                         shuffle=False)
                eval_dev_result_list = hidden_eval_vc(cloned_empty_model,
                                                      eval_vc_iter,
                                                      ready_dev_list)

                # Scoring
                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    eval_dev_result_list,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score(Strict/Acc./Precision/Recall/F1):",
                      strict_score, acc_score, pr, rec, f1)

                print(f"Dev:{strict_score}/{acc_score}")

                if do_analysis:
                    # Customized analysis output
                    common.save_jsonl(
                        scored_dev_sent_data, analysis_dir /
                        f"E_{i_epoch}_scored_dev_sent_{save_tool.get_cur_time_str()}.jsonl"
                    )
                    common.save_jsonl(
                        eval_dev_result_list, analysis_dir /
                        f"E_{i_epoch}_eval_vc_output_data_{save_tool.get_cur_time_str()}.jsonl"
                    )

                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                if need_save or i_epoch < 7:
                    # save_path = os.path.join(
                    #     file_path_prefix,
                    #     f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    # )

                    # torch.save(model.state_dict(), save_path)

                    ema_save_path = os.path.join(
                        file_path_prefix,
                        f'ema_i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_p({pr})_r({rec})_f1({f1})_seed({seed})'
                    )

                    save_ema_to_file(ema, ema_save_path)
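# Minimal sketch (an assumption, not the project's EMA/save_tool classes) of
# what the EMA / load_ema_to_model / save_ema_to_file trio used above
# typically does: keep a decayed shadow copy of every trainable parameter
# and copy the shadows into a model for evaluation.
import torch

class SimpleEMA:
    def __init__(self, named_parameters, decay=0.9999):
        self.decay = decay
        self.shadow = {n: p.detach().clone()
                       for n, p in named_parameters if p.requires_grad}

    def __call__(self, named_parameters):
        # Call once per optimizer step: shadow <- d * shadow + (1 - d) * param.
        with torch.no_grad():
            for n, p in named_parameters:
                if n in self.shadow:
                    self.shadow[n].mul_(self.decay).add_(p, alpha=1 - self.decay)

    def copy_to(self, model):
        # Analogous to load_ema_to_model: overwrite live weights with shadows.
        with torch.no_grad():
            for n, p in model.named_parameters():
                if n in self.shadow:
                    p.copy_(self.shadow[n])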
def eval_and_save_v2(model_path,
                     is_ema,
                     saving_dir,
                     save_train_data=True,
                     prob_thresholds=0.5):
    # This method was modified on 21 Nov 2018 to evaluate the balanced-trained
    # selection model with different threshold values; the results are then
    # used for later verification.

    # Evaluate and save all the sentence-pair results to be used for downstream verification.
    # 03 Oct 2018 03:56:40.
    seed = 12
    batch_size = 128
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 1
    top_k_doc = 5

    # sample_prob_decay = 0.05
    dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers,
                                              lazy=lazy,
                                              max_l=180)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers,
                                            lazy=lazy,
                                            max_l=180)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)

    complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL,
                                                 train_upstream_file,
                                                 pred=False,
                                                 top_k=top_k_doc)

    print("Dev size:", len(complete_upstream_dev_data))
    print("Train size:", len(complete_upstream_train_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
    train_instances = train_fever_data_reader.read(
        complete_upstream_train_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='selection_labels')
    # Label value

    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300,
                  max_l=160,
                  num_of_class=2)

    if not is_ema:
        model.load_state_dict(torch.load(model_path))
    else:
        load_ema_to_model(model, model_path)

    model.display()
    model.to(device)

    dev_actual_list = common.load_jsonl(config.T_FEVER_DEV_JSONL)
    train_actual_list = common.load_jsonl(config.T_FEVER_TRAIN_JSONL)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1)
    train_iter = biterator(train_instances, shuffle=False, num_epochs=1)

    complete_upstream_dev_data = hidden_eval(model, eval_iter,
                                             complete_upstream_dev_data)

    if save_train_data:
        complete_upstream_train_data = hidden_eval(
            model, train_iter, complete_upstream_train_data)
        common.save_jsonl(complete_upstream_train_data,
                          Path(str(saving_dir)) / "train_sent_scores.jsonl")
        common.save_jsonl(complete_upstream_dev_data,
                          Path(str(saving_dir)) / "dev_sent_pred_scores.jsonl")

    if not isinstance(prob_thresholds, list):
        prob_thresholds = [prob_thresholds]

    for scal_prob in prob_thresholds:
        print("Eval Dev Data prob_threshold:", scal_prob)

        dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL,
                                              complete_upstream_dev_data,
                                              sent_retri_top_k=5,
                                              sent_retri_scal_prob=scal_prob)
        # This is only a wrapper for the simi_sampler

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        # Copy gold labels into the predictions so that fever_score measures
        # evidence retrieval only (label accuracy is forced correct).
        for a, b in zip(dev_actual_list, dev_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            dev_results_list, dev_actual_list, mode=eval_mode, verbose=False)
        tracking_score = strict_score
        print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")

    if save_train_data:
        print("Build Train Data")
        train_results_list = score_converter_v1(
            config.T_FEVER_TRAIN_JSONL,
            complete_upstream_train_data,
            sent_retri_top_k=5,
            sent_retri_scal_prob=prob_thresholds[0])  # assumes the first listed threshold for the train data

        # This is only a wrapper for the simi_sampler

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        for a, b in zip(train_actual_list, train_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            train_results_list,
            train_actual_list,
            mode=eval_mode,
            verbose=False)
        tracking_score = strict_score
        print(f"Train(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")
def full_eval_model_hesm(hesm_model, model, dataloader, criterion, dev_data_list):
    id2label = {
        0: "SUPPORTS",
        1: "REFUTES",
        2: "NOT ENOUGH INFO"
    }

    print("Evaluating ...")
    model.eval()
    n_correct = 0
    total_size = 0
    loss = 0

    y_pred_list = []
    y_pred_mult_list = []
    y_true_list = []
    y_id_list = []

    with torch.no_grad():  # Disable gradient tracking during evaluation.

        for batch in dataloader:
            curloss, out, multiout = hesm_model.step(batch)
            
            y = batch['labels'].cuda()
            y_id_list.extend(list(batch['pid']))

            max_index = torch.max(out, 1)[1]
            
            n_correct += (max_index.view(y.size()) == y).sum().item()
            total_size += y.size(0)
            
            y_pred_list.extend(max_index.view(y.size()).tolist())
            y_true_list.extend(y.tolist())

            loss += curloss.mean()
            
            if multiout is not None:
                cur_s_label = []
                for sout in multiout:
                    cur_s_label.append(torch.max(sout, 1)[1].tolist())
                cur_s_label = list(zip(*cur_s_label))
                y_pred_mult_list.extend(cur_s_label)

        assert len(y_id_list) == len(dev_data_list)
        assert len(y_pred_list) == len(dev_data_list)
        assert len(y_true_list) == len(dev_data_list)

        for i in range(len(dev_data_list)):
            assert str(y_id_list[i]) == str(dev_data_list[i]['id'])

            dev_data_list[i]['predicted_label'] = id2label[y_pred_list[i]]
            
            if len(y_pred_mult_list) > 0: 
                dev_data_list[i]['multi_predicted_label'] = [id2label[x] for x in y_pred_mult_list[i]]

            if len(dev_data_list[i]['predicted_sentids']) == 0:
                dev_data_list[i]['predicted_label'] = "NOT ENOUGH INFO"

        print('n_correct:', n_correct)
        print('total_size:', total_size)

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_data_list, dev_data_list, mode=eval_mode,
                                                                    verbose=False)
        print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)

        avg_loss = loss / total_size

    return strict_score, avg_loss
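# Tiny illustration (not from the original code) of the zip(*...) transpose
# used for multiout above: per-head prediction lists become per-example tuples.
heads = [[0, 1, 2],       # head 1's predicted labels for three examples
         [2, 2, 0]]       # head 2's predicted labels
print(list(zip(*heads)))  # [(0, 2), (1, 2), (2, 0)]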
def debug_fever():
    num_epoch = 8
    seed = 12
    batch_size = 128
    experiment_name = "simple_nn"
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 0.6
    sample_prob_decay = 0.1

    dev_upstream_file = config.RESULT_PATH / "doc_retri/cn_util_Jul17_docretri.singularize/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/cn_util_Jul17_docretri.singularize/train.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=300)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=300)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value

    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=280, num_of_class=2)

    model.display()
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0
    i_epoch = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, config.T_FEVER_DEV_JSONL,
                                                                mode=eval_mode, verbose=False)
    total = len(dev_results_list)
    hit = eval_mode['check_sent_id_correct_hits']
    tracking_score = hit / total

    print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
    print("Strict score:", strict_score)
    print(f"Eval Tracking score:", f"{tracking_score}")

    need_save = False
    if tracking_score > best_dev:
        best_dev = tracking_score
        need_save = True

    if need_save:
        save_path = os.path.join(
            file_path_prefix,
            f'i({iteration})_epoch({i_epoch})_'
            f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})'
        )

        torch.save(model.state_dict(), save_path)

    print("Epoch Evaluation...")
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, config.T_FEVER_DEV_JSONL,
                                                                mode=eval_mode, verbose=False)
    total = len(dev_results_list)
    hit = eval_mode['check_sent_id_correct_hits']
    tracking_score = hit / total

    print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
    print("Strict score:", strict_score)
    print(f"Eval Tracking score:", f"{tracking_score}")

    if tracking_score > best_dev:
        best_dev = tracking_score

    save_path = os.path.join(
        file_path_prefix,
        f'i({iteration})_epoch({i_epoch})_'
        f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_epoch'
    )

    torch.save(model.state_dict(), save_path)
Example #18
def utest_score_ground_truth():
    d_list = load_data(config.FEVER_DEV_JSONL)
    utest_for_ground_truth(d_list)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))
def check_acc(in_path):
    d_list = load_data(in_path)
    eval_mode = {'check_sent_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))
Example #20
def train_fever_hesm(model_name="albert-base-v2"):
    seed = 12
    torch.manual_seed(seed)
    
    num_epoch = 4
    batch_size = 64
    
    # parameters for annealed sampling
    keep_neg_sample_prob = 1
    sample_prob_decay = 0.015
    min_keep_neg_sample_prob = 0.02
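    # Annealing schedule: start by keeping every negative sample (prob = 1),
    # shrink the keep probability by `sample_prob_decay` each epoch, and clamp
    # it at `min_keep_neg_sample_prob` (see the loop below).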

    experiment_name = "simple_nn_startkp_{}_de_{}".format(keep_neg_sample_prob, sample_prob_decay)
    resume_model = None

    dev_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc_exec/2019_10_07_10:14:16_r/doc_retr_2_shared_task_dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc/2019_10_27_16:48:33_r/doc_retr_2_train.jsonl"

    complete_upstream_dev_data = get_hyperlink_evidence_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = 2,  
        output_attentions = False,
        output_hidden_states = False,
    )

    if torch.cuda.device_count() > 1:
        print("More than 1 gpu device found...")
        model = nn.DataParallel(model)
    
    model.to(device)
    
    start_lr = 2e-5
    optimizer = AdamW(model.parameters(), lr=start_lr, eps=1e-8)

    if resume_model is not None:
        print("Resume From:", resume_model)
        load_model(resume_model, model, optimizer)

    # Create Log File
    file_path_prefix, _ = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0
    
    criterion = nn.CrossEntropyLoss()
    hesm_model = HESMUtil(model, model_name=model_name)
    display(model)

    for i_epoch in range(num_epoch):
        print("Get first evidence for training...")
        complete_upstream_train_data = get_hyperlink_evidence_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False)
        
        print("Resampling...")
        print("Sample Prob.:", keep_neg_sample_prob)
        filtered_train_data = post_filter_v2(complete_upstream_train_data, keep_prob=keep_neg_sample_prob,
                                          seed=12 + i_epoch)
        
        keep_neg_sample_prob -= sample_prob_decay
        if keep_neg_sample_prob <= min_keep_neg_sample_prob:
            keep_neg_sample_prob = min_keep_neg_sample_prob
        print("Sampled length:", len(filtered_train_data))
        
        sent_list, label_list, pid_list = hesm_model.read(filtered_train_data)

        train_dataset = HESMDataset({'text': sent_list, 'labels': label_list, 'pid': pid_list})    
        train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )
        
        if i_epoch == 0:
            steps_per_epoch = len(train_dataloader)
            total_steps = steps_per_epoch * num_epoch
            scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)        
            accumulation_steps = 2 # accumulate gradients for increasing `batch_size` by a factor of `accumulation_steps`
            save_epoch = 0.5 # evaluate and save every `save_epoch` epochs
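            # With batch_size = 64 and accumulation_steps = 2, the effective
            # batch size per optimizer step is 64 * 2 = 128. Note that
            # total_steps counts every mini-batch while scheduler.step() below
            # fires only once per accumulation_steps batches, so the linear
            # schedule ends before the LR has fully decayed.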

        optimizer.zero_grad()
        for i, batch in tqdm(enumerate(train_dataloader)):
            model.train()
            _step_loss, out = hesm_model.step(batch)  # loss returned by step() is unused; recomputed below
            y = batch['labels'].cuda()

            loss = criterion(out, y)
            loss = loss / accumulation_steps
            
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
                optimizer.step()                            # Now we can do an optimizer step
                scheduler.step()
                optimizer.zero_grad()
            iteration += 1

            mod = max(1, int(steps_per_epoch * save_epoch))  # cast to int so the modulo check below can fire reliably
            if iteration % mod == 0:
                
                sent_list, label_list, pid_list = hesm_model.read(complete_upstream_dev_data)

                eval_dataset = HESMDataset({'text': sent_list, 'labels': label_list, 'pid': pid_list})    
                eval_dataloader = DataLoader(
                    eval_dataset,
                    sampler = SequentialSampler(eval_dataset),
                    batch_size = batch_size
                )
                
                complete_upstream_dev_data = hidden_eval_hesm(hesm_model, model, eval_dataloader, complete_upstream_dev_data)

                dev_results_list = score_converter(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data, dev_upstream_file)
                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, common.load_jsonl(config.T_FEVER_DEV_JSONL),
                                                                            mode=eval_mode, verbose=False)
                total = len(dev_results_list)
                hit = eval_mode['check_sent_id_correct_hits']
                tracking_score = hit / total

                print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
                print("Strict score:", strict_score)
                print(f"Eval Tracking score:", f"{tracking_score}")

                need_save = False
                if tracking_score > best_dev:
                    best_dev = tracking_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})'
                    )

                    save_model(save_path, model, optimizer)
Example #21
    def eval(self, d_list):
        eval_mode = {'check_doc_id_correct': True, 'standard': False}
        return fever_score(d_list, d_list, mode=eval_mode, verbose=False)
Example #22
def train_fever():
    num_epoch = 8
    seed = 12
    batch_size = 128
    experiment_name = "simple_nn"
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 0.5
    sample_prob_decay = 0.1

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/train.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value

    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    model.display()
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False)
        filtered_train_data = post_filter(complete_upstream_train_data, keep_prob=keep_neg_sample_prob,
                                          seed=12 + i_epoch)
        # Change the seed to avoid duplicate sample...
        keep_neg_sample_prob -= sample_prob_decay
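        # No lower bound is applied here, so with a 0.5 start and 0.1 decay the
        # keep probability reaches 0 at epoch 5 and goes negative afterwards,
        # at which point post_filter presumably drops all negative samples.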

        print("Sampled_length:", len(filtered_train_data))
        sampled_train_instances = train_fever_data_reader.read(filtered_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            if i_epoch <= 4:
                mod = 25000
            else:
                mod = 10000

            if iteration % mod == 0:
                eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                dev_score, dev_loss, complete_upstream_dev_data = full_eval_model(model, eval_iter, criterion,
                                                                                  complete_upstream_dev_data)

                dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, config.T_FEVER_DEV_JSONL,
                                                                            mode=eval_mode, verbose=False)
                total = len(dev_results_list)
                hit = eval_mode['check_sent_id_correct_hits']
                tracking_score = hit / total

                print(f"Dev(clf_acc/pr/rec/f1/loss):{dev_score}/{pr}/{rec}/{f1}/{dev_loss}")
                print(f"Tracking score:", f"{tracking_score}")

                need_save = False
                if tracking_score > best_dev:
                    best_dev = tracking_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{tracking_score}|clf_acc:{dev_score}|pr:{pr}|rec:{rec}|f1:{f1}|loss:{dev_loss})'
                    )

                    torch.save(model.state_dict(), save_path)
Example #23
    common.save_jsonl(
        dev_results_list, config.RESULT_PATH /
        "sent_retri_nn/2018_07_20_15-17-59_r/dev_scale(0.1).jsonl")

    # for item in dev_results_list:
    #     print(item['scored_sentids'])

    # common.save_jsonl(dev_results_list, "/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")

    # eval_mode = {'check_doc_id_correct': True, 'check_sent_id_correct': True, 'standard': True}
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    # c_scorer.delete_label(dev_results_list)
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
        dev_results_list,
        common.load_jsonl(config.FEVER_DEV_UNLABELED_JSONL),
        mode=eval_mode,
        verbose=False)
    print(strict_score, acc_score, pr, rec, f1)

    # total = len(dev_results_list)
    # hit = eval_mode['check_sent_id_correct_hits']
    # tracking_score = hit / total
    #
    # print(f"Dev(fever_score/pr/rec/f1):{strict_score}/{pr}/{rec}/{f1}")
    # print(f"Tracking score:", f"{tracking_score}")
    # eval_mode = {'check_sent_id_correct': True, 'standard': True}
    # delete_gold_label(dev_results_list)
    # strict_score, acc_score, pr, rec, f1, error_list = c_scorer.fever_score_analysis(dev_results_list,
    #                                                                                  common.load_jsonl(config.T_FEVER_DEV_JSONL),
    #                                                                                  mode=eval_mode, verbose=False)
def train_fever_ema_v1(resume_model=None):
    """
    This method is the training script for the BERT + NSMN model.
    :param resume_model:
    :return:
    """
    num_epoch = 200
    seed = 12
    batch_size = 32
    lazy = True
    dev_prob_threshold = 0.02
    train_prob_threshold = 0.02
    train_sample_top_k = 8
    experiment_name = f"bert_nsmn_ema_lr1|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}"

    bert_type_name = "bert-large-uncased"
    bert_servant = BertServant(bert_type_name=bert_type_name)

    # print("Do EMA:")
    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl"
    )

    train_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl"
    )
    # Prepare Data
    # 22 Nov 2018 03:16
    # Remove this because everything can be handled by Bert Servant.

    print("Building Prob Dicts...")
    train_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl"
    )

    dev_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl"
    )

    selection_dict = paired_selection_score_dict(train_sent_list)
    selection_dict = paired_selection_score_dict(dev_sent_list, selection_dict)
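    # `paired_selection_score_dict` presumably maps (claim_id, sentence_id)
    # pairs to upstream selection probabilities; passing the first dict into
    # the second call extends it with the dev-set scores.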

    upstream_dev_list = threshold_sampler_insure_unique(
        config.T_FEVER_DEV_JSONL,
        dev_upstream_sent_list,
        prob_threshold=dev_prob_threshold,
        top_n=5)

    dev_fever_data_reader = BertReader(bert_servant, lazy=lazy, max_l=60)
    train_fever_data_reader = BertReader(bert_servant, lazy=lazy, max_l=60)

    complete_upstream_dev_data = select_sent_with_prob_for_eval(
        config.T_FEVER_DEV_JSONL,
        upstream_dev_list,
        selection_dict,
        tokenized=True)

    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary, if we are using bert, we don't need anything here.
    biterator = BasicIterator(batch_size=batch_size)

    unk_token_num = {'tokens': 2600}  # workaround for initializing the vocabulary
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    print(vocab)

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0
    bert_servant.bert_model.to(device)

    # Init model here
    model = Model(
        bert_servant,
        bert_batch_size=1,
        rnn_size_in=(1024 + 2, 1024 + 2 + 300),  # probs + task indicator.
        rnn_size_out=(300, 300),
        max_l=250,
        mlp_d=300,
        num_of_class=3,
        drop_r=0.5,
        activation_type='gelu')
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0
    #
    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        train_data_with_candidate_sample_list = \
            threshold_sampler_insure_unique(config.T_FEVER_TRAIN_JSONL, train_upstream_sent_list,
                                            train_prob_threshold,
                                            top_n=train_sample_top_k)

        complete_upstream_train_data = adv_simi_sample_with_prob_v1_1(
            config.T_FEVER_TRAIN_JSONL,
            train_data_with_candidate_sample_list,
            selection_dict,
            tokenized=True)
        random.shuffle(complete_upstream_train_data)
        print("Sample data length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(
            complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances,
                               shuffle=True,
                               num_epochs=1)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)

            y = batch['label'].to(next(model.parameters()).device)

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update
            # ema(model.named_parameters())

            if i_epoch < 15:
                mod = 20000
                # mod = 500
            else:
                mod = 2000

            if iteration % mod == 0:
                eval_iter = biterator(dev_instances,
                                      shuffle=False,
                                      num_epochs=1)
                complete_upstream_dev_data = hidden_eval(
                    model, eval_iter, complete_upstream_dev_data)

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    complete_upstream_dev_data,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score(Strict/Acc./Precision/Recall/F1):",
                      strict_score, acc_score, pr, rec, f1)

                print(f"Dev:{strict_score}/{acc_score}")

                # EMA saving
                # eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                # load_ema_to_model(cloned_empty_model, ema)
                # complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter, complete_upstream_dev_data)
                #
                # eval_mode = {'check_sent_id_correct': True, 'standard': True}
                # strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(complete_upstream_dev_data,
                #                                                             common.load_jsonl(config.T_FEVER_DEV_JSONL),
                #                                                             mode=eval_mode,
                #                                                             verbose=False)
                # print("Fever Score EMA(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
                #
                # print(f"Dev EMA:{strict_score}/{acc_score}")

                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    )

                    torch.save(model.state_dict(), save_path)
def train_fever_v2():
    # train_fever_v1 is the old training script.
    # train_fever_v2 is the new training script created on 02 Oct 2018 11:40:24.
    # Here we keep the negative and positive portion to be consistent.
    num_epoch = 10
    seed = 12
    batch_size = 128
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 1
    top_k_doc = 5

    experiment_name = f"simple_nn_remain_{keep_neg_sample_prob}"
    # sample_prob_decay = 0.05

    dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers,
                                              lazy=lazy,
                                              max_l=180)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers,
                                            lazy=lazy,
                                            max_l=180)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='selection_labels')
    # Label value

    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300,
                  max_l=160,
                  num_of_class=2)

    model.display()
    model.to(device)

    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())
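    # Assumed EMA semantics (project helper, not a torch built-in): shadow
    # copies of the named parameters are updated as
    #     shadow = decay * shadow + (1 - decay) * param
    # on every call; load_ema_to_model later copies the shadow weights into
    # `cloned_empty_model` so evaluation runs on the averaged model.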

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    dev_actual_list = common.load_jsonl(config.T_FEVER_DEV_JSONL)

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        complete_upstream_train_data = get_full_list(
            config.T_FEVER_TRAIN_JSONL,
            train_upstream_file,
            pred=False,
            top_k=top_k_doc)

        print("Sample Prob.:", keep_neg_sample_prob)
        filtered_train_data = post_filter(complete_upstream_train_data,
                                          keep_prob=keep_neg_sample_prob,
                                          seed=12 + i_epoch)

        # Change the seed to avoid duplicate sample...
        # keep_neg_sample_prob -= sample_prob_decay
        # if keep_neg_sample_prob <= 0:
        #     keep_neg_sample_prob = 0.005
        print("Sampled_length:", len(filtered_train_data))

        sampled_train_instances = train_fever_data_reader.read(
            filtered_train_data)

        train_iter = biterator(sampled_train_instances,
                               shuffle=True,
                               num_epochs=1,
                               cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update EMA
            ema(model.named_parameters())
            iteration += 1

            mod = 8000  # evaluation interval (in iterations)

            if iteration % mod == 0:
                eval_iter = dev_biterator(dev_instances,
                                          shuffle=False,
                                          num_epochs=1,
                                          cuda_device=device_num)

                load_ema_to_model(cloned_empty_model, ema)

                # complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

                # Only eval EMA
                complete_upstream_dev_data = hidden_eval(
                    cloned_empty_model, eval_iter, complete_upstream_dev_data)

                dev_results_list = score_converter_v1(
                    config.T_FEVER_DEV_JSONL,
                    complete_upstream_dev_data,
                    sent_retri_top_k=5,
                    sent_retri_scal_prob=0.5)
                # This is only a wrapper for the simi_sampler

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                for a, b in zip(dev_actual_list, dev_results_list):
                    b['predicted_label'] = a['label']
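                # Gold labels are copied into the predictions so that the
                # strict score below isolates sentence-retrieval quality from
                # the (not yet trained) label classifier.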
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    dev_results_list,
                    dev_actual_list,
                    mode=eval_mode,
                    verbose=False)
                tracking_score = strict_score
                print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}")
                print("Strict score:", strict_score)
                print(f"Eval Tracking score:", f"{tracking_score}")

                # need_save = False
                # if tracking_score > best_dev:
                #     best_dev = tracking_score
                need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix, f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_ema'
                    )

                    save_ema_to_file(ema, save_path)
                    # torch.save(model.state_dict(), save_path)

        print("Epoch Evaluation...")
        eval_iter = dev_biterator(dev_instances,
                                  shuffle=False,
                                  num_epochs=1,
                                  cuda_device=device_num)

        load_ema_to_model(cloned_empty_model, ema)
        # complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)
        complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter,
                                                 complete_upstream_dev_data)

        dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL,
                                              complete_upstream_dev_data,
                                              sent_retri_top_k=5,
                                              sent_retri_scal_prob=0.5)

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        for a, b in zip(dev_actual_list, dev_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            dev_results_list, dev_actual_list, mode=eval_mode, verbose=False)
        tracking_score = strict_score
        print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")

        if tracking_score > best_dev:
            best_dev = tracking_score

        save_path = os.path.join(
            file_path_prefix, f'i({iteration})_epoch({i_epoch})_'
            f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_epoch_ema'
        )

        save_ema_to_file(ema, save_path)
def train_fever_std_ema_v1(resume_model=None, wn_feature=False):
    """
    This method is the new training script for FEVER with span and probability scores.
    :param resume_model:
    :param wn_feature:
    :return:
    """
    num_epoch = 200
    seed = 12
    batch_size = 32
    lazy = True
    dev_prob_threshold = 0.1
    train_prob_threshold = 0.1
    train_sample_top_k = 8
    experiment_name = f"nsmn_sent_wise_std_ema_lr1|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}"
    # resume_model = None

    print("Do EMA:")

    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl"
    )

    train_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl"
    )

    # Prepare Data
    token_indexers = {
        'tokens':
        SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(
            namespace='elmo_characters')  # This is the elmo_characters
    }

    print("Building Prob Dicts...")
    train_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl"
    )

    dev_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl"
    )

    selection_dict = paired_selection_score_dict(train_sent_list)
    selection_dict = paired_selection_score_dict(dev_sent_list, selection_dict)

    upstream_dev_list = threshold_sampler_insure_unique(
        config.T_FEVER_DEV_JSONL,
        dev_upstream_sent_list,
        prob_threshold=dev_prob_threshold,
        top_n=5)

    # Specify `ablation` to remove the WordNet and number embeddings.
    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers,
                                         lazy=lazy,
                                         wn_p_dict=p_dict,
                                         max_l=320,
                                         ablation=None)
    train_fever_data_reader = WNSIMIReader(token_indexers=token_indexers,
                                           lazy=lazy,
                                           wn_p_dict=p_dict,
                                           max_l=320,
                                           shuffle_sentences=False,
                                           ablation=None)

    complete_upstream_dev_data = select_sent_with_prob_for_eval(
        config.T_FEVER_DEV_JSONL,
        upstream_dev_list,
        selection_dict,
        tokenized=True)

    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT /
                                               "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden',
                                               -2,
                                               namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu",
                          index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450 + dev_fever_data_reader.wn_feature_size),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900,
        embedding_dim=300,
        max_l=300,
        use_extra_lex_feature=False,
        max_span_l=100)

    print("Model Max length:", model.max_l)
    if resume_model is not None:
        model.load_state_dict(torch.load(resume_model))
    model.display()
    model.to(device)

    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())
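    # `cloned_empty_model` will receive the EMA shadow weights via
    # load_ema_to_model at evaluation time, leaving the live `model` untouched.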

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        train_data_with_candidate_sample_list = \
            threshold_sampler_insure_unique(config.T_FEVER_TRAIN_JSONL, train_upstream_sent_list,
                                            train_prob_threshold,
                                            top_n=train_sample_top_k)

        complete_upstream_train_data = adv_simi_sample_with_prob_v1_1(
            config.T_FEVER_TRAIN_JSONL,
            train_data_with_candidate_sample_list,
            selection_dict,
            tokenized=True)

        print("Sample data length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(
            complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances,
                               shuffle=True,
                               num_epochs=1,
                               cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update
            ema(model.named_parameters())

            if i_epoch < 15:
                mod = 10000
                # mod = 10
            else:
                mod = 2000

            if iteration % mod == 0:
                # eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                # complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)
                #
                # eval_mode = {'check_sent_id_correct': True, 'standard': True}
                # strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(complete_upstream_dev_data,
                #                                                             common.load_jsonl(config.T_FEVER_DEV_JSONL),
                #                                                             mode=eval_mode,
                #                                                             verbose=False)
                # print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
                #
                # print(f"Dev:{strict_score}/{acc_score}")

                # EMA saving
                eval_iter = biterator(dev_instances,
                                      shuffle=False,
                                      num_epochs=1,
                                      cuda_device=device_num)
                load_ema_to_model(cloned_empty_model, ema)
                complete_upstream_dev_data = hidden_eval(
                    cloned_empty_model, eval_iter, complete_upstream_dev_data)

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    complete_upstream_dev_data,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score EMA(Strict/Acc./Precision/Recall/F1):",
                      strict_score, acc_score, pr, rec, f1)

                print(f"Dev EMA:{strict_score}/{acc_score}")

                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                if need_save:
                    # save_path = os.path.join(
                    #     file_path_prefix,
                    #     f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    # )

                    # torch.save(model.state_dict(), save_path)

                    ema_save_path = os.path.join(
                        file_path_prefix,
                        f'ema_i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    )

                    save_ema_to_file(ema, ema_save_path)
def pipeline(in_file,
             eval_file=None,
             model_path_dict=default_model_path_dict,
             steps=default_steps):
    """
    :param in_file: The raw input file.
    :param eval_file: Optional gold-label file; when provided, evaluation is run after each stage.
    :return:
    """
    sentence_retri_1_scale_prob = 0.5
    sentence_retri_2_scale_prob = 0.9
    sent_retri_1_top_k = 5
    sent_retri_2_top_k = 1

    sent_prob_for_2doc = 0.1
    sent_topk_for_2doc = 5
    enhance_retri_1_scale_prob = -1
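    # A scale prob of -1 effectively disables probability filtering, so the
    # final "high tolerance" step below appends every first-round sentence.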

    build_submission = True

    doc_retrieval_method = 'word_freq'

    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    if not PIPELINE_DIR.exists():
        PIPELINE_DIR.mkdir()

    if steps['s1.tokenizing']['do']:
        time_stamp = utils.get_current_time_str()
        current_pipeline_dir = PIPELINE_DIR / f"{time_stamp}_r"
    else:
        current_pipeline_dir = steps['s1.tokenizing']['out_file'].parent

    print("Current Result Root:", current_pipeline_dir)

    if not current_pipeline_dir.exists():
        current_pipeline_dir.mkdir()

    eval_list = common.load_jsonl(eval_file) if eval_file is not None else None

    in_file_stem = in_file.stem
    tokenized_file = current_pipeline_dir / f"t_{in_file_stem}.jsonl"

    # Save code into directory
    script_name = os.path.basename(__file__)
    with open(os.path.join(str(current_pipeline_dir), script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()

    # Tokenizing.
    print("Step 1. Tokenizing.")
    if steps['s1.tokenizing']['do']:
        tokenized_claim(in_file, tokenized_file)  # Auto Saved
        print("Tokenized file saved to:", tokenized_file)
    else:
        tokenized_file = steps['s1.tokenizing']['out_file']
        print("Use preprocessed file:", tokenized_file)
    # Tokenizing End.

    # First Document retrieval.
    print("Step 2. First Document Retrieval")

    if steps['s2.1doc_retri']['do']:
        doc_retrieval_result_list = first_doc_retrieval(
            haonan_docretri_object,
            tokenized_file,
            method=doc_retrieval_method)
        doc_retrieval_file_1 = current_pipeline_dir / f"doc_retr_1_{in_file_stem}.jsonl"
        common.save_jsonl(doc_retrieval_result_list, doc_retrieval_file_1)
        print("First Document Retrieval file saved to:", doc_retrieval_file_1)
    else:
        doc_retrieval_file_1 = steps['s2.1doc_retri']['out_file']
        doc_retrieval_result_list = common.load_jsonl(doc_retrieval_file_1)
        print("Use preprocessed file:", doc_retrieval_file_1)

    if eval_list is not None:
        print("Evaluating 1st Doc Retrieval")
        eval_mode = {'check_doc_id_correct': True, 'standard': False}
        print(
            c_scorer.fever_score(doc_retrieval_result_list,
                                 eval_list,
                                 mode=eval_mode,
                                 verbose=False))
    # First Document retrieval End.

    # First Sentence Selection.
    print("Step 3. First Sentence Selection")
    if steps['s3.1sen_select']['do']:
        dev_sent_list_1_e0 = simple_nnmodel.pipeline_first_sent_selection(
            tokenized_file, doc_retrieval_file_1, model_path_dict['sselector'])
        dev_sent_file_1_e0 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}.jsonl"
        common.save_jsonl(dev_sent_list_1_e0, dev_sent_file_1_e0)

        # Manual setting, delete it later
        # dev_sent_file_1_e0 = None
        # dev_sent_list_1_e0 = common.load_jsonl("/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_selector_em/dev_sent_score_1_shared_task_test.jsonl")
        # End

        if steps['s3.1sen_select']['ensemble']:
            print("Ensemble!")
            dev_sent_list_1_e1 = simple_nnmodel.pipeline_first_sent_selection(
                tokenized_file, doc_retrieval_file_1,
                model_path_dict['sselector_1'])
            dev_sent_file_1_e1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e1.jsonl"
            common.save_jsonl(dev_sent_list_1_e1, dev_sent_file_1_e1)
            # exit(0)
            # dev_sent_list_1_e1 = common.load_jsonl(dev_sent_file_1_e1)

            dev_sent_list_1_e2 = simple_nnmodel.pipeline_first_sent_selection(
                tokenized_file, doc_retrieval_file_1,
                model_path_dict['sselector_2'])
            dev_sent_file_1_e2 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e2.jsonl"
            common.save_jsonl(dev_sent_list_1_e2, dev_sent_file_1_e2)
            # exit(0)
            # dev_sent_list_1_e2 = common.load_jsonl(dev_sent_file_1_e2)

            dev_sent_list_1 = merge_sent_results(
                [dev_sent_list_1_e0, dev_sent_list_1_e1, dev_sent_list_1_e2])
            dev_sent_file_1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_ensembled.jsonl"
            common.save_jsonl(dev_sent_list_1, dev_sent_file_1)
            # exit(0)
        else:
            dev_sent_list_1 = dev_sent_list_1_e0
            dev_sent_file_1 = dev_sent_file_1_e0
        # Merging two results

        print("First Sentence Selection file saved to:", dev_sent_file_1)

    else:
        dev_sent_file_1 = steps['s3.1sen_select']['out_file']
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        print("Use preprocessed file:", dev_sent_file_1)

    # exit(0)

    if eval_list is not None:
        print("Evaluating 1st Sentence Selection")
        # sent_select_results_list_1 = simi_sampler.threshold_sampler(tokenized_file, dev_sent_full_list,
        #                                                             sentence_retri_scale_prob, top_n=5)
        # additional_dev_sent_list = common.load_jsonl("/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_20_15-17-59_r/dev_sent_2r.jsonl")
        # dev_sent_full_list = dev_sent_full_list + additional_dev_sent_list
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(
            tokenized_file,
            dev_sent_list_1,
            sentence_retri_1_scale_prob,
            top_n=sent_retri_1_top_k)
        # sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1,
        #                                                                                 additional_dev_sent_list,
        #                                                                                 sentence_retri_2_scale_prob,
        #                                                                                 top_n=5, add_n=1)

        eval_mode = {'check_sent_id_correct': True, 'standard': False}
        # for a, b in zip(eval_list, sent_select_results_list_1):
        #     b['predicted_label'] = a['label']
        print(
            c_scorer.fever_score(sent_select_results_list_1,
                                 eval_list,
                                 mode=eval_mode,
                                 verbose=False))

    print("Step 4. Second Document Retrieval")
    if steps['s4.2doc_retri']['do']:
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        filtered_dev_instance_1_for_doc2 = simi_sampler.threshold_sampler_insure_unique(
            tokenized_file,
            dev_sent_list_1,
            sent_prob_for_2doc,
            top_n=sent_topk_for_2doc)
        filtered_dev_instance_1_for_doc2_file = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_scaled_for_doc2.jsonl"
        common.save_jsonl(filtered_dev_instance_1_for_doc2,
                          filtered_dev_instance_1_for_doc2_file)

        dev_sent_1_result = simi_sampler.threshold_sampler_insure_unique(
            doc_retrieval_file_1,  # Remember this name
            dev_sent_list_1,
            sentence_retri_1_scale_prob,
            top_n=sent_topk_for_2doc)

        dev_doc2_list = second_doc_retrieval(
            haonan_docretri_object, filtered_dev_instance_1_for_doc2_file,
            dev_sent_1_result)

        dev_doc2_file = current_pipeline_dir / f"doc_retr_2_{in_file_stem}.jsonl"
        common.save_jsonl(dev_doc2_list, dev_doc2_file)
        print("Second Document Retrieval File saved to:", dev_doc2_file)
    else:
        dev_doc2_file = steps['s4.2doc_retri']['out_file']
        # dev_doc2_list = common.load_jsonl(dev_doc2_file)
        print("Use preprocessed file:", dev_doc2_file)

    print("Step 5. Second Sentence Selection")
    if steps['s5.2sen_select']['do']:
        dev_sent_2_list = get_score_multihop(
            tokenized_file,
            dev_doc2_file,
            model_path=model_path_dict['sselector'])

        dev_sent_file_2 = current_pipeline_dir / f"dev_sent_score_2_{in_file_stem}.jsonl"
        common.save_jsonl(dev_sent_2_list, dev_sent_file_2)
        print("First Sentence Selection file saved to:", dev_sent_file_2)
    else:
        dev_sent_file_2 = steps['s5.2sen_select']['out_file']

    if eval_list is not None:
        print("Evaluating 1st Sentence Selection")
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(
            tokenized_file,
            dev_sent_list_1,
            sentence_retri_1_scale_prob,
            top_n=5)
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_merge(
            sent_select_results_list_1,
            dev_sent_list_2,
            sentence_retri_2_scale_prob,
            top_n=5,
            add_n=sent_retri_2_top_k)
        eval_mode = {'check_sent_id_correct': True, 'standard': False}
        # for a, b in zip(eval_list, sent_select_results_list_1):
        #     b['predicted_label'] = a['label']
        print(
            c_scorer.fever_score(sent_select_results_list_1,
                                 eval_list,
                                 mode=eval_mode,
                                 verbose=False))

    # print("Step 6. NLI")
    # if steps['s6.nli']['do']:
    #     dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    #     dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)
    #     sentence_retri_1_scale_prob = 0.05
    #     print("Threshold:", sentence_retri_1_scale_prob)
    #     sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1,
    #                                                                               sentence_retri_1_scale_prob, top_n=5)
    #     # sent_select_results_list_2 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1,
    #     #                                                                                 dev_sent_list_2,
    #     #                                                                                 sentence_retri_2_scale_prob,
    #     #                                                                                 top_n=5,
    #     #                                                                                 add_n=sent_retri_2_top_k)
    #     nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run(tokenized_file,
    #                                                           sent_select_results_list_1,
    #                                                           [dev_sent_file_1, dev_sent_file_2],
    #                                                           model_path_dict['nli'],
    #                                                           with_logits=True,
    #                                                           with_probs=True)
    #
    #     nli_results_file = current_pipeline_dir / f"nli_r_{in_file_stem}.jsonl"
    #     common.save_jsonl(nli_results, nli_results_file)
    # else:
    #     nli_results_file = steps['s6.nli']['out_file']
    #     nli_results = common.load_jsonl(nli_results_file)

    # Ensemble code
    # dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    # dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)
    # sentence_retri_1_scale_prob = 0.05
    # print("NLI sentence threshold:", sentence_retri_1_scale_prob)
    # sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1,
    #                                                                           sentence_retri_1_scale_prob, top_n=5)
    #
    # # sent_select_results_list_2 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1,
    # #                                                                                 dev_sent_list_2,
    # #                                                                                 sentence_retri_2_scale_prob,
    # #                                                                                 top_n=5,
    # #                                                                                 add_n=sent_retri_2_top_k)
    # # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run(tokenized_file,
    # #                                                       sent_select_results_list_1,
    # #                                                       [dev_sent_file_1, dev_sent_file_2],
    # #                                                       model_path_dict['nli'], with_probs=True, with_logits=True)
    #
    # # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run_bigger(tokenized_file,
    # #                                                       sent_select_results_list_1,
    # #                                                       [dev_sent_file_1, dev_sent_file_2],
    # #                                                       model_path_dict['nli_2'],
    # #                                                              with_probs=True,
    # #                                                              with_logits=True)
    #
    # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run_bigger(tokenized_file,
    #                                                       sent_select_results_list_1,
    #                                                       [dev_sent_file_1, dev_sent_file_2],
    #                                                       model_path_dict['nli_4'],
    #                                                       with_probs=True,
    #                                                       with_logits=True)
    #
    # nli_results_file = current_pipeline_dir / f"nli_r_{in_file_stem}_withlb_e4.jsonl"
    # common.save_jsonl(nli_results, nli_results_file)
    # Ensemble code end
    # exit(0)

    nli_r_e0 = common.load_jsonl(current_pipeline_dir /
                                 "nli_r_shared_task_test_withlb_e0.jsonl")
    nli_r_e1 = common.load_jsonl(current_pipeline_dir /
                                 "nli_r_shared_task_test_withlb_e1.jsonl")
    nli_r_e2 = common.load_jsonl(current_pipeline_dir /
                                 "nli_r_shared_task_test_withlb_e2.jsonl")
    nli_r_e3 = common.load_jsonl(current_pipeline_dir /
                                 "nli_r_shared_task_test_withlb_e3.jsonl")
    nli_r_e4 = common.load_jsonl(current_pipeline_dir /
                                 "nli_r_shared_task_test_withlb_e4.jsonl")

    nli_results = merge_nli_results(
        [nli_r_e0, nli_r_e1, nli_r_e2, nli_r_e3, nli_r_e4])

    print("Post Processing enhancement")
    delete_unused_evidence(nli_results)
    print("Deleting Useless Evidence")

    dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)

    print("Appending 1 of second Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
        nli_results,
        dev_sent_list_2,
        sentence_retri_2_scale_prob,
        top_n=5,
        add_n=sent_retri_2_top_k)
    delete_unused_evidence(nli_results)

    # High tolerance enhancement!
    print("Final High Tolerance Enhancement")
    print("Appending all of first Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
        nli_results,
        dev_sent_list_1,
        enhance_retri_1_scale_prob,
        top_n=100,
        add_n=100)
    delete_unused_evidence(nli_results)

    if build_submission:
        output_file = current_pipeline_dir / "predictions.jsonl"
        build_submission_file(nli_results, output_file)
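
# A minimal usage sketch (the input/eval paths are illustrative, not from the
# original source):
# pipeline(config.FEVER_DEV_JSONL, eval_file=config.FEVER_DEV_JSONL,
#          model_path_dict=default_model_path_dict, steps=default_steps)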