def run_reweight():
    """Run the reweighter predictor (BM25, k1=0.5) on the dev split and print its evaluation.

    NOTE(review): a second ``run_reweight`` is defined later in this module and
    shadows this one at import time.
    """
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 7
    hyper_param = {'k1': 0.5}
    predictions = predict_by_reweighter(get_bm25_module(), dev_claims, k, hyper_param)
    print(evaluate(predictions))
def run_bm25_rm():
    """Evaluate BM25 + relevance-model predictions on the dev split."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    # Pre-computed relevance-model info for the dev claims.
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    k = 7
    predictions = predict_by_bm25_rm(get_bm25_module(), rm_info, dev_claims, k)
    print(evaluate(predictions))
def run_bm25_ex():
    """Recall-evaluate BM25 over candidates built from expanded train-split queries."""
    train_claims, _val = train_split()
    k = 100
    expanded_queries = get_expanded_query_text(train_claims, "train")
    cand_dict = get_eval_candidates_l(expanded_queries)
    predictions = predict_by_bm25_from_candidate(
        get_bm25_module(), train_claims, cand_dict, k)
    print(evaluate_recall(predictions, True))
def run_reweight():
    """Evaluate the reweighter (k1=1) on the first 50 train-split claims.

    NOTE(review): this redefines the earlier ``run_reweight`` in this module;
    only this version is reachable after import.
    """
    k = 7
    train_claims, _val = train_split()
    hyper_param = {'k1': 1}
    subset = train_claims[:50]
    predictions = predict_by_reweighter(get_bm25_module(), subset, k, hyper_param)
    print(hyper_param)
    print(evaluate(predictions))
def run_bm25_2():
    """Recall-evaluate BM25 on train-split claims using claim text as the query."""
    train_claims, _val = train_split()
    k = 1000
    cand_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(train_claims), k)
    predictions = predict_by_bm25_from_candidate(
        get_bm25_module(), train_claims, cand_dict, k)
    print(evaluate_recall(predictions, True))
def save_random_walk_pred():
    """Score dev claims with random-walk vector queries and pickle the score dict."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 50
    # Replacement query term frequencies from a precomputed random walk.
    rw_scores = dict(load_from_pickle("random_walk_score_100"))
    bm25_module = get_bm25_module()
    predictions = pc_predict_from_vector_query(bm25_module, rw_scores, dev_claims, k)
    save_to_pickle(prediction_to_dict(predictions), "pc_random_walk_based_score_d")
def run_lm():
    """Evaluate language-model predictions on the dev split.

    Uses random-walk score vectors as replacement query term frequencies.

    Fix: the original loaded "pc_dev_par_tf" into ``q_tf_replace`` and then
    immediately overwrote it with "random_walk_score_100"; the dead first
    load has been removed (it only wasted a pickle read).
    """
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))
def run_random_walk_score_with_weight():
    """Evaluate vector-query + reweight (k1=0.5) predictions on the dev split."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 7
    rw_scores = dict(load_from_pickle("random_walk_score_100"))
    # Pickled keys may be strings; normalize them to int claim ids.
    rw_scores = dict_key_map(int, rw_scores)
    bm25_module = get_bm25_module()
    predictions = pc_predict_vector_query_and_reweight(
        bm25_module, rw_scores, dev_claims, k, {'k1': 0.5})
    print(evaluate(predictions))
def run_random_walk_score():
    """Evaluate vector-query predictions (random-walk TFs) on the dev split.

    NOTE(review): a second ``run_random_walk_score`` is defined later in this
    module and shadows this one. Earlier experiments also tried the
    "pc_dev_par_tf" and "bias_random_walk_dev_plus_all" pickles here.
    """
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 7
    rw_scores = dict(load_from_pickle("random_walk_score_100"))
    # Normalize pickled keys to int claim ids.
    rw_scores = dict_key_map(int, rw_scores)
    bm25_module = get_bm25_module()
    predictions = pc_predict_from_vector_query(bm25_module, rw_scores, dev_claims, k)
    print(evaluate(predictions))
def save_bm25_as_trec_format():
    """Rank train-split claims with BM25 and write the run in TREC format."""
    train_ids: List[int] = list(load_train_claim_ids())
    train_claims = get_claims_from_ids(train_ids)
    k = 200
    cand_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(train_claims), k)
    predictions = predict_by_bm25_from_candidate(
        get_bm25_module(), train_claims, cand_dict, k)
    entries = prediction_to_trec_format(predictions, "bm25")
    out_file = os.path.join(output_path, "ranked_list", "bm25.txt")
    write_trec_ranked_list_entry(entries, out_file)
def main():
    """Compare BERT-baseline predictions against random-walk vector-query predictions."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 50
    rw_scores = dict(load_from_pickle("random_walk_score_100"))
    bm25_module = get_bm25_module()
    run_vector_query = pc_predict_from_vector_query(bm25_module, rw_scores, dev_claims, k)
    baseline_scores = load_from_pickle("pc_bert_baseline_score_d")
    run_baseline = predict_from_dict(baseline_scores, dev_claims, k)
    compare_two_runs(run_baseline, run_vector_query)
def run_random_walk_score():
    """Inspect random-walk-based predictions against a baseline score table.

    NOTE(review): this redefines the earlier ``run_random_walk_score`` in this
    module; only this version is reachable after import. A get_idf()-based
    df/N override was previously tried here and left disabled.
    """
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 20
    bm25_module = get_bm25_module()
    baseline_scores = dict(load_from_pickle("random_walk_score_100"))
    debug_scores = dict(load_from_pickle("dev_claim_random_walk_debug2"))
    # Normalize pickled keys to int claim ids.
    debug_scores = dict_key_map(int, debug_scores)
    pc_predict_to_inspect(bm25_module, debug_scores, baseline_scores, dev_claims, k)
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    """Build an extended candidate pool of perspective ids for each claim.

    For every claim in ``split``, candidates come from three sources, in order:
      1. top-50 lucene results for the raw claim text;
      2. gold perspectives missing from (1) ("hard" candidates);
      3. lucene results for an expansion query made of the 30 highest
         tf-idf terms that occur in the hard candidates but not the claim.

    Returns a mapping of claim id -> ordered, de-duplicated candidate pid list.

    Fixes vs. original: ``claim_tokens`` is now a set (the original probed a
    list inside a nested token loop — accidental O(n*m)); ``enumerate`` wrappers
    whose ``rank`` was never used are dropped; ``type(cid) == int`` replaced
    with ``isinstance``.
    """
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter) -> Counter:
        # Scale each term's count by its idf factor.
        return Counter({t: bm25.term_idf_factor(t) * cnt for t, cnt in c.items()})

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert isinstance(cid, int)
        claim_text = c["text"]
        # Set for O(1) membership tests in the mismatch-vocabulary loop below.
        claim_tokens = set(tokenizer.tokenize_stem(claim_text))
        top_k = 50

        # Source 1: lucene results for the claim text itself.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = [_pid for _text, _pid, _score in lucene_results]

        # Source 2: gold perspectives that lucene missed ("hard" candidates),
        # while collecting their tokens absent from the claim.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                for t in tokenizer.tokenize_stem(p_text):
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1
        candidate_list.extend(hard_candidate)

        # Source 3: query expansion with the top-30 tf-idf mismatch terms.
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)
        for _text, _pid, _score in lucene_results:
            if _pid not in candidate_list:
                candidate_list.append(_pid)
        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
def run_lm2():
    """Evaluate LM predictions on the dev split using raw claim-text term counts."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 5
    tokenizer = PCTokenizer()
    # Term frequencies straight from each claim's text (nltk word tokens).
    tf_by_claim = {}
    for claim in dev_claims:
        tf_by_claim[claim['cId']] = Counter(nltk.tokenize.word_tokenize(claim['text']))
    bm25_module = get_bm25_module()
    collection_tf = get_perspective_tf()
    predictions = predict_by_lm(tf_by_claim, collection_tf, bm25_module, dev_claims, k)
    print(evaluate(predictions))
def run_bm25_ex_pers():
    """Evaluate the see-candidate BM25 predictor on the dev split."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 7
    predictions = predict_see_candidate(get_bm25_module(), dev_claims, k)
    print(evaluate(predictions))
def run_bm25():
    """Inspect BM25 predictions on the train split.

    NOTE(review): a second ``run_bm25`` is defined later in this module and
    shadows this one at import time.
    """
    train_claims, _val = train_split()
    k = 20
    predictions = predict_by_bm25(get_bm25_module(), train_claims, k)
    inspect(predictions)
def run_bm25_map():
    """Evaluate BM25 predictions on the dev split with the MAP metric."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    k = 50
    predictions = predict_by_bm25(get_bm25_module(), dev_claims, k)
    print(evaluate_map(predictions))
def run_bm25():
    """Evaluate BM25 predictions on the train split.

    NOTE(review): this redefines the earlier ``run_bm25`` in this module;
    only this version is reachable after import.
    """
    train_claims, _val = train_split()
    k = 20
    predictions = predict_by_bm25(get_bm25_module(), train_claims, k)
    print(evaluate(predictions))
def run_next_sent():
    """Evaluate BERT next-sentence predictions on the first 10 dev claims."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    # Only a small sample — BERT next-sentence scoring is slow.
    sample = get_claims_from_ids(dev_ids)[:10]
    k = 7
    predictions = pc_predict_by_bert_next_sent(get_bm25_module(), sample, k)
    print(evaluate(predictions))