def main(config):
    # select claims
    # load relevant documents
    # remove duplicate
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id for e in entries]))
        claim = claim_d[int(query_id)]
        s = "{} : {}".format(query_id, claim)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])
    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)
def predict_by_bm25_from_candidate(bm25_module,
                                   claims,
                                   candidate_dict: List[Tuple[int, List[int]]],
                                   top_k) -> List[Tuple[int, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(c_text, p_text) -> NamedNumber:
        score = bm25_module.score(c_text, p_text)
        return score

    all_prediction_list: List[Tuple[int, List[Dict]]] = []
    for cid, candidates in candidate_dict:
        prediction_list: List[Dict] = []
        claim_text = cid_to_text[cid]
        for pid in candidates:
            p_text = perspective_getter(pid)
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': p_text,
                'rationale': "",
                'score': scorer(claim_text, p_text),
            }
            prediction_list.append(p_entry)
        prediction_list.sort(key=lambda x: x['score'], reverse=True)
        prediction_list = prediction_list[:top_k]
        all_prediction_list.append((cid, prediction_list))
    return all_prediction_list
def pc_predict_by_bert_next_sent(bm25_module: BM25,
                                 claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # Example usage :
    proxy = xmlrpc.client.ServerProxy('http://ingham.cs.umass.edu:{}'.format(port))
    voca_path = pjoin(data_path, "bert_voca.txt")
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        r = proxy.predict(payload)
        ns_score = -float(r[0])
        # ns_score = 0
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
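# The scorers above return NamedNumber: a score that also carries a human-readable
# explanation in `.name`, which later shows up as the "rationale" field and in debug
# prints. The real class is defined elsewhere in this codebase; the sketch below only
# illustrates the behavior these functions rely on (acts like a float, has `.name`).
class NamedNumberSketch(float):
    def __new__(cls, value, name=""):
        obj = super().__new__(cls, value)
        obj.name = name
        return obj

# NamedNumberSketch(1.23, "bm25: 1.23") sorts and formats like 1.23 but keeps its name.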
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()
    base_p = max(bias_plus_word.values())
    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)
        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob
        for t, score in bias_plus_word.items():
            score_for_cid[t] += score
        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
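# pc_new_init_prob and several functions below call normalize_counter_to_sum1, which is
# defined elsewhere. A minimal sketch of the assumed behavior (rescale a Counter's
# values so they sum to 1) would be:
from collections import Counter

def normalize_counter_to_sum1_sketch(counter: Counter) -> Counter:
    total = sum(counter.values())
    if total == 0:
        return Counter(counter)
    return Counter({k: v / total for k, v in counter.items()})

# Example: Counter({'tax': 3, 'cut': 1}) -> Counter({'tax': 0.75, 'cut': 0.25})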
def pc_predict_from_vector_query(bm25_module: BM25,
                                 q_tf_replace: Dict[int, Counter],
                                 claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
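# pc_predict_from_vector_query builds the query as Counter addition of the (normalized,
# top-50) expansion terms and the claim's own term frequencies, e.g.
# Counter({'tax': 0.4}) + Counter({'tax': 1, 'cut': 1}) == Counter({'tax': 1.4, 'cut': 1}).
# dict_value_map is a project helper; a sketch of the assumed behavior (apply a function
# to every value of a dict, keeping the keys) would be:
from typing import Callable, Dict, TypeVar

K = TypeVar("K")
A = TypeVar("A")
B = TypeVar("B")

def dict_value_map_sketch(fn: Callable[[A], B], d: Dict[K, A]) -> Dict[K, B]:
    return {k: fn(v) for k, v in d.items()}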
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v
    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v
    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)
    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_v = v - acc_counter_prob_init[k]
            new_counter[k] = new_v
    return new_counter
def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()
    word_list_d: Dict = json.load(open(word_list_path, "r"))
    tokenizer = PCTokenizer()
    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = list([t for t in base_query_terms if t not in stopwords])
        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)
        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
def show_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    for claim_id, qtf in q_tf_replace.items():
        print(claim_d[claim_id])
        print(qtf.most_common(100))
        print("")
def show_num_mention():
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claims = get_claims_from_ids(lmap(lambda x: x['cid'], train))
    claim_d = claims_to_dict(claims)
    grouped = group_by(train, lambda x: x['cid'])
    for cid in grouped:
        print("Claim:", claim_d[cid])
        for dp in grouped[cid]:
            p_text = p_dict[dp['pid']]
            print(dp['label'], get_num_mention(dp), p_text)
def predict_see_candidate(bm25_module: BM25, claims, top_k):
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    output = []
    for claim in claims:
        cid = claim['cId']
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)
        p_text = lmap(perspective_getter, candidate_pids)
        p_tokens = lmap(bm25_module.tokenizer.tokenize_stem, p_text)
        acc_counter = Counter()
        for tokens in p_tokens[:30]:
            for t in tokens:
                acc_counter[t] += 1 / len(tokens)
        c = normalize_counter(acc_counter)
        c_tokens = bm25_module.tokenizer.tokenize_stem(claim_text)
        qtf = Counter(c_tokens)
        qtf = c + qtf
        ranked_list = []
        for pid in candidate_pids:
            p_tokens = bm25_module.tokenizer.tokenize_stem(perspective_getter(pid))
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            ranked_list.append((pid, score))
        ranked_list.sort(key=lambda x: x[1], reverse=True)
        prediction_list = []
        for pid, score in ranked_list[:top_k]:
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': perspective_getter(pid),
                'rationale': score.name,
                'score': score,
            }
            prediction_list.append(p_entry)
        output.append((cid, prediction_list))
    return output
def predict_by_bm25(bm25_module, claims, top_k) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score = bm25_module.score(c_text, p_text)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
def predict_by_bm25_rm(bm25_module: BM25,
                       rm_info: Dict[str, List[Tuple[str, str]]],
                       claims,
                       top_k) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(parse_float, rm_info)
    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(normalize_scores, rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)
        # rm_info is keyed by the claim id as a string, so look it up with the
        # string id rather than its int form.
        if claim_id in rm_info:
            ex_qtf = rm_info_c[claim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
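# predict_by_bm25_rm expects rm_info to hold RM expansion terms as (term, score-string)
# pairs per claim id. parse_float and normalize_scores are project helpers; hypothetical
# sketches of the assumed behavior (cast the score strings to float, then rescale each
# list's scores to sum to 1) would be:
from typing import List, Tuple

def parse_float_sketch(pairs: List[Tuple[str, str]]) -> List[Tuple[str, float]]:
    return [(term, float(score)) for term, score in pairs]

def normalize_scores_sketch(pairs: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
    total = sum(score for _, score in pairs)
    if total == 0:
        return pairs
    return [(term, score / total) for term, score in pairs]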
def predict_by_reweighter(bm25_module: BM25, claims, top_k, param) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(claims, param)
    nlp = spacy.load("en_core_web_sm")

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(apply_stem, claim_term_weight)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]
        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                w = weight[k]
                new_qtf[k] = w * v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)
        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()
    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))
        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word = word.strip()
                word_list.append(word)
        all_d[query_id] = word_list
    with open(save_path, "w") as f:
        json.dump(all_d, f)
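# The word selection in main() keeps a word when its smoothed log-odds (pos vs. neg)
# exceed the configured threshold. A toy illustration of the arithmetic, with made-up
# probabilities:
import math

pos_prob, neg_prob = 0.02, 0.005
diff_example = math.log(pos_prob + 1e-10) - math.log(neg_prob + 1e-10)  # ~1.39 (= log 4)
# With threshold = 1.0 this word is kept; with threshold = 2.0 it is dropped.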
def write_csv(config):
    # select claims
    # load relevant documents
    # remove duplicate
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier="
    rows = []
    header = ["claim"] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)]
    rows.append(header)
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id for e in entries]))
        claim = claim_d[int(query_id)]
        urls = []
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            urls.append(url)
        assert len(urls) == num_doc_per_query
        row = [claim] + urls
        rows.append(row)
    save_path = os.path.join(output_path, "claim10_train.csv")
    with open(save_path, "w", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(rows)
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))
        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(word, diff, pos, neg))
def main(input_path):
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    grouped_ranked_list = load_ranked_list_grouped(input_path)

    def is_correct(qid: str, doc_id: str):
        return any([int(doc_id) in cluster for cluster in gold[int(qid)]])

    top_k = 5
    for qid, entries in grouped_ranked_list.items():
        n_gold = sum(map(len, gold[int(qid)]))
        cut_n = min(n_gold, top_k)
        correctness = list([is_correct(qid, e.doc_id) for e in entries[:cut_n]])
        num_correct = sum(lmap(int, correctness))
        p_at_k = num_correct / cut_n
        pid_to_rank: Dict[str, int] = {e.doc_id: e.rank for e in entries}

        def get_rank(pid: int):
            if str(pid) in pid_to_rank:
                return pid_to_rank[str(pid)]
            else:
                return "X"

        if p_at_k < 0.3:
            print(n_gold)
            print(p_at_k)
            print("Claim {} {}".format(qid, claim_d[int(qid)]))
            for cluster in gold[int(qid)]:
                print("-")
                for pid in cluster:
                    print("[{}]".format(get_rank(pid)), perspective_getter(int(pid)))
            for e in entries[:50]:
                correct_str = "Y" if is_correct(qid, e.doc_id) else "N"
                print("{} {} {}".format(correct_str, e.score, perspective_getter(int(e.doc_id))))
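# main(input_path) above prints the claims whose precision in the top ranks falls below
# 0.3. The cutoff is min(number of gold perspectives, top_k), so a claim with 2 gold
# perspectives is judged only on its top-2 ranked perspectives. Toy numbers:
n_gold_example, top_k_example = 2, 5
cut_n_example = min(n_gold_example, top_k_example)        # 2
correctness_example = [True, False]                       # judgements for the top 2
p_at_k_example = sum(map(int, correctness_example)) / cut_n_example  # 0.5 -> not printed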
def work():
    split = "train"
    assert split in ["train", "dev", "test"]
    tokenizer = PCTokenizer()
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(claims, do_balance)
    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            tf_rel = {k: v / dl for k, v in tf.items()}
            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n
            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str, Tuple[Counter, Counter]] = dict_value_map(get_frequency_per_class, grouped)
    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}
    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())
        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))
        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)

    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
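# work() splits each claim's word scores into a positive and a negative list and then
# renormalizes the score column via left/right, which are project helpers. Sketches of
# the assumed behavior (first / second element of every pair) would be:
from typing import List, Tuple, TypeVar

A_ = TypeVar("A_")
B_ = TypeVar("B_")

def left_sketch(pairs: List[Tuple[A_, B_]]) -> List[A_]:
    return [a for a, _ in pairs]

def right_sketch(pairs: List[Tuple[A_, B_]]) -> List[B_]:
    return [b for _, b in pairs]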
def pc_predict_vector_query_and_reweight(bm25_module: BM25,
                                         q_tf_replace: Dict[int, Counter],
                                         claims,
                                         top_k,
                                         param) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(claims, param)
    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(apply_stem, claim_term_weight)
    nlp = spacy.load("en_core_web_sm")

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def get_qtf(claim_id):
        weight = claim_term_weight[claim_id]
        new_qtf = Counter()
        c_text = cid_to_text[int(claim_id)]
        qtf = Counter(stem_tokenize(c_text))
        print(weight)
        for k, v in qtf.items():
            try:
                if k in weight:
                    w = weight[k]
                    new_qtf[k] = w * v
                else:
                    new_qtf[k] = v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)
        return new_qtf

    c_qtf_d = {k: get_qtf(k) for k in cid_to_text.keys()}
    # for cid, c_text in cid_to_text.items():
    #     c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
    #     c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + c_qtf_d[i_claim_id]
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
def pc_predict_to_inspect(bm25_module: BM25,
                          q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter],
                          claims,
                          top_k):
    gold = get_claim_perspective_id_dict()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)
        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]
        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""
            e = score, pid, p_text, debug_str
            ranked_list.append(e)
        gold_pids = gold[cid]

        def is_correct(pid):
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)
        qtf_idf_applied = {k: v * bm25_module.term_idf_factor(k) for k, v in qtf.items()}
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf applied:", counter_to_str(qtf_idf_applied))
        for score, pid, p_text, debug_str in ranked_list[:top_k]:
            # Fall back to the primary score when no replacement query is available,
            # so score2 is always defined for the print below.
            score2 = score
            if i_claim_id in q_tf_replace_0_norm:
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
            correct_str = "Y" if is_correct(pid) else "N"
            print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                correct_str, score, score2, p_text, score.name, score2.name))