Example no. 1
def run_reweight():
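    # Rank perspectives for dev-split claims with a reweighted BM25 query
    # (k1=0.5) and print the evaluation result.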
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    param = {'k1': 0.5}
    pred = predict_by_reweighter(get_bm25_module(), claims, top_k, param)
    print(evaluate(pred))
Example no. 2
def run_bm25_rm():
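    # BM25 prediction combined with the RM info loaded from the
    # "perspective_dev_claim_rm" pickle; prints the evaluation result.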
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    top_k = 7
    pred = predict_by_bm25_rm(get_bm25_module(), rm_info, claims, top_k)
    print(evaluate(pred))
Example no. 3
def run_bm25_ex():
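    # BM25 over candidates retrieved with expanded query text for the train
    # split; reports recall at top_k=100.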
    claims, val = train_split()
    top_k = 100
    candidate_dict = get_eval_candidates_l(
        get_expanded_query_text(claims, "train"))
    pred = predict_by_bm25_from_candidate(get_bm25_module(), claims,
                                          candidate_dict, top_k)
    print(evaluate_recall(pred, True))
Example no. 4
def run_reweight():
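    # Reweighted BM25 (k1=1) on the first 50 train-split claims; prints the
    # parameter dict and the evaluation result.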
    top_k = 7
    claims, val = train_split()
    param = {'k1': 1}
    target = claims[:50]
    pred = predict_by_reweighter(get_bm25_module(), target, top_k, param)
    print(param)
    print(evaluate(pred))
Example no. 5
def run_bm25_2():
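    # BM25 over candidates retrieved with the raw claim text as query
    # (top 1000 per claim); reports recall.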
    claims, val = train_split()
    top_k = 1000
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(claims), top_k)
    pred = predict_by_bm25_from_candidate(get_bm25_module(), claims,
                                          candidate_dict, top_k)
    print(evaluate_recall(pred, True))
Example no. 6
def save_random_walk_pred():
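    # Vector-query prediction from random-walk scores on the dev split,
    # saved to a pickle as a per-claim score dictionary.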
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    score_d = prediction_to_dict(pred)
    save_to_pickle(score_d, "pc_random_walk_based_score_d")
Example no. 7
def run_lm():
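    # Language-model based prediction using random-walk scores as the query
    # term distribution and the collection term frequencies (presumably as
    # the background model).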
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))
Example no. 8
def run_random_walk_score_with_weight():
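    # Vector-query prediction from random-walk scores combined with query
    # reweighting (k1=0.5); prints the evaluation result.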
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    bm25 = get_bm25_module()
    pred = pc_predict_vector_query_and_reweight(bm25, q_tf_replace, claims,
                                                top_k, {'k1': 0.5})
    print(evaluate(pred))
Example no. 9
def run_random_walk_score():
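    # Vector-query prediction from random-walk scores on the dev split; the
    # commented-out loads below are alternative score sources.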
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    #q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    #q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    print(evaluate(pred))
Example no. 10
def save_bm25_as_trec_format():
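    # BM25 ranking over train-split claims, written out in TREC ranked-list
    # format to output_path/ranked_list/bm25.txt.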
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 200
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(claims), top_k)
    pred = predict_by_bm25_from_candidate(get_bm25_module(), claims,
                                          candidate_dict, top_k)
    entries = prediction_to_trec_format(pred, "bm25")
    write_trec_ranked_list_entry(
        entries, os.path.join(output_path, "ranked_list", "bm25.txt"))
Example no. 11
def main():
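    # Compare the random-walk vector-query ranking (pred2) against the BERT
    # baseline scores loaded from a pickle (pred1).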
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred2 = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    pc_score_d = load_from_pickle("pc_bert_baseline_score_d")
    pred1 = predict_from_dict(pc_score_d, claims, top_k)

    compare_two_runs(pred1, pred2)
Example no. 12
def run_random_walk_score():
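    # Inspect predictions using two random-walk score sources
    # ("random_walk_score_100" vs. "dev_claim_random_walk_debug2").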
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    bm25 = get_bm25_module()
    #df, N = get_idf()
    #bm25.df = df
    #bm25.N = N
    q_tf_replace_0 = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict(load_from_pickle("dev_claim_random_walk_debug2"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    pc_predict_to_inspect(bm25, q_tf_replace, q_tf_replace_0, claims, top_k)
Example no. 13
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
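    # Build an extended candidate set per claim: start from perspective-pool
    # search results for the claim text, add gold perspectives that were
    # missed, then add perspectives retrieved with the top tf-idf terms of
    # the vocabulary the missed perspectives do not share with the claim.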
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)

        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)

                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1

        candidate_list.extend(hard_candidate)
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)

        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
Example no. 14
def run_lm2():
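    # Language-model prediction where each claim's query term frequencies
    # come from nltk word tokenization of the claim text.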
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    tokenizer = PCTokenizer()  # note: unused here; nltk word_tokenize is applied below
    tf_d = {
        c['cId']: Counter(nltk.tokenize.word_tokenize(c['text']))
        for c in claims
    }
    bm25 = get_bm25_module()
    ctf = get_perspective_tf()
    pred = predict_by_lm(tf_d, ctf, bm25, claims, top_k)
    print(evaluate(pred))
Example no. 15
def run_bm25_ex_pers():
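    # Prediction via predict_see_candidate on dev-split claims (top_k=7);
    # prints the evaluation result.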
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    pred = predict_see_candidate(get_bm25_module(), claims, top_k)
    print(evaluate(pred))
Example no. 16
def run_bm25():
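    # Plain BM25 prediction on train-split claims, passed to inspect() for
    # manual inspection rather than scoring.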
    claims, val = train_split()
    top_k = 20
    pred = predict_by_bm25(get_bm25_module(), claims, top_k)

    inspect(pred)
Example no. 17
def run_bm25_map():
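    # Plain BM25 prediction on dev-split claims; prints MAP.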
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    pred = predict_by_bm25(get_bm25_module(), claims, top_k)
    print(evaluate_map(pred))
Example no. 18
def run_bm25():
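    # Plain BM25 prediction on train-split claims; prints the evaluation result.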
    claims, val = train_split()
    top_k = 20
    pred = predict_by_bm25(get_bm25_module(), claims, top_k)
    print(evaluate(pred))
Example no. 19
def run_next_sent():
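    # Ranking based on BERT next-sentence prediction, on the first 10 dev
    # claims; prints the evaluation result.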
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)[:10]
    top_k = 7
    pred = pc_predict_by_bert_next_sent(get_bm25_module(), claims, top_k)
    print(evaluate(pred))