def predict_see_candidate(bm25_module: BM25, claims, top_k):
    # For each claim, pull candidate perspectives from the Lucene pool, expand the
    # claim query with length-normalized terms from the top-30 candidates, and
    # re-rank all candidates with BM25 against the expanded query.
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    output = []
    for claim in claims:
        cid = claim['cId']
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        # Accumulate expansion terms from the top-30 candidates, weighted by 1/len.
        p_text = lmap(perspective_getter, candidate_pids)
        p_tokens = lmap(bm25_module.tokenizer.tokenize_stem, p_text)
        acc_counter = Counter()
        for tokens in p_tokens[:30]:
            for t in tokens:
                acc_counter[t] += 1 / len(tokens)
        c = normalize_counter(acc_counter)

        c_tokens = bm25_module.tokenizer.tokenize_stem(claim_text)
        qtf = Counter(c_tokens)
        qtf = c + qtf

        ranked_list = []
        for pid in candidate_pids:
            p_tokens = bm25_module.tokenizer.tokenize_stem(perspective_getter(pid))
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            ranked_list.append((pid, score))
        ranked_list.sort(key=lambda x: x[1], reverse=True)

        prediction_list = []
        for pid, score in ranked_list[:top_k]:
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': perspective_getter(pid),
                'rationale': score.name,
                'score': score,
            }
            prediction_list.append(p_entry)
        output.append((cid, prediction_list))
    return output
def predict_by_bm25_from_candidate(bm25_module,
                                   claims,
                                   candidate_dict: List[Tuple[int, List[int]]],
                                   top_k) -> List[Tuple[int, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(c_text, p_text) -> NamedNumber:
        score = bm25_module.score(c_text, p_text)
        return score

    all_prediction_list: List[Tuple[int, List[Dict]]] = []
    for cid, candidates in candidate_dict:
        prediction_list: List[Dict] = []
        claim_text = cid_to_text[cid]
        for pid in candidates:
            p_text = perspective_getter(pid)
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': p_text,
                'rationale': "",
                'score': scorer(claim_text, p_text),
            }
            prediction_list.append(p_entry)
        prediction_list.sort(key=lambda x: x['score'], reverse=True)
        prediction_list = prediction_list[:top_k]
        all_prediction_list.append((cid, prediction_list))
    return all_prediction_list
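# A minimal usage sketch, not part of the original file: it assumes `get_all_claims`
# and `es_helper` behave as in the surrounding functions and that a BM25 instance has
# already been constructed elsewhere. `demo_predict_by_bm25` and `n_claims` are
# illustrative names only.
def demo_predict_by_bm25(bm25_module: BM25, n_claims: int = 10):
    claims = get_all_claims()[:n_claims]
    candidate_dict = []
    for claim in claims:
        # Reuse the same Lucene/Elasticsearch candidate pool the other functions query.
        results = es_helper.get_perspective_from_pool(claim['text'], 50)
        pids = [pid for _text, pid, _score in results]
        candidate_dict.append((claim['cId'], pids))
    return predict_by_bm25_from_candidate(bm25_module, claims, candidate_dict, top_k=50)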
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()
    ca_cid = 1
    out_j = []
    for e in pc_data[:100]:
        cid = e['cId']
        if not gold_d[cid]:
            continue
        c_text = e['text']
        for pc in gold_d[cid]:
            if random.random() < 0.3:
                first_pid = pc.perspective_ids[0]
                p_text = perspective_getter(first_pid)
                j_entry = {
                    'cid': cid,
                    'claim_text': c_text,
                    'ca_cid': ca_cid,
                    'perspective': {
                        'stance': pc.stance_label_3,
                        'pid': first_pid,
                        'p_text': p_text
                    }
                }
                ca_cid += 1
                out_j.append(j_entry)

    print("total of {}".format(len(out_j)))
    out_f = open(at_output_dir("ca_building", "claims.step1.txt"), "w", encoding="utf-8")
    json.dump(out_j, out_f, indent=True)
def generate_instances(self, claim: Dict, data_id_manager) -> List[Payload]:
    cid = claim['cId']
    claim = claim['text']
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    output = []
    for pid in perspectives:
        info = {
            'cid': cid,
            'pid': pid,
        }
        is_correct = any([pid in cluster for cluster in self.gold[cid]])
        perspective = perspective_getter(pid)
        passage_list = left(passages)
        payload = Payload(
            passage_list,
            claim,
            perspective,
            data_id_manager.assign(info),
            is_correct,
        )
        output.append(payload)
    return output
def generate_instances(self, claim: Dict, data_id_manager) -> List[Payload]:
    cid = claim['cId']
    claim = claim['text']
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    if self.filter_good:
        filter_condition = score_over_zero
    else:
        def filter_condition(dummy):
            return True

    good_passages: List[List[str]] = left(lfilter(filter_condition, passages))
    output = []
    for pid in perspectives:
        is_correct = any([pid in cluster for cluster in self.gold[cid]])
        for passage_idx, passage in enumerate(good_passages):
            perspective = perspective_getter(pid)
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            p = Payload(passage, claim, perspective, data_id_manager.assign(info), is_correct)
            output.append(p)
    return output
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()
    out_f = open(at_data_dir("perspective", "claims_and_perspective.txt"), "w", encoding="utf-8")
    for e in pc_data:
        cid = e['cId']
        if not gold_d[cid]:
            continue
        text = e['text']
        rows = []
        row = [str(cid), text]
        rows.append(row)
        for pc in gold_d[cid]:
            rows.append([pc.stance_label_3, pc.stance_label_5])
            for pid in pc.perspective_ids:
                row = [perspective_getter(pid)]
                rows.append(row)
            rows.append([])

        for row in rows:
            out_f.write("\t".join(row) + "\n")
        out_f.write("\n\n\n")
def scorer(lucene_score, query_id) -> NamedNumber: claim_id, p_id = query_id.split("_") p_text = perspective_getter(int(p_id)) tokens = tokenizer.tokenize_stem(p_text) c_lm = claim_log_odds_dict[claim_id] reason = " ".join(["{0} ({1:.2f})".format(t, c_lm[t]) for t in tokens]) score = sum([c_lm[t] for t in tokens]) return NamedNumber(score, reason)
def main():
    # Inspect evidence-ranking quality: for each "<claim_id>_<perspective_id>" query,
    # compare the ranked evidence against the gold evidence ids and print the gold
    # hits and the high-ranked false positives for low-precision queries.
    claim_text_d: Dict[int, str] = get_all_claim_d()
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))
        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
def encode(e: Tuple[int, int, List[Dict]]):
    cid, pid, passages = e
    text1 = tokenize(cid_to_text[cid])
    text2 = tokenize(perspective_getter(pid))
    for passage_idx, passage in enumerate(passages):
        info = {
            'cid': cid,
            'pid': pid,
            'passage_idx': passage_idx,
            'passage': passage['passage'],
            'c_text': cid_to_text[cid],
            'p_text': perspective_getter(pid)
        }
        yield PayloadAsTokens(passage=passage['passage'],
                              text1=text1,
                              text2=text2,
                              data_id=data_id_man.assign(info),
                              is_correct=0)
def scorer(lucene_score, query_id) -> NamedNumber: claim_id, p_id = query_id.split("_") i_claim_id = int(claim_id) payload = [] p_text = perspective_getter(int(p_id)) c_text = cid_to_text[i_claim_id] payload.append(encoder.encode_pair(c_text, p_text)) r = proxy.predict(payload) ns_score = -float(r[0]) #ns_score = 0 score = bm25_module.score(c_text, p_text) new_score = score + ns_score * 10 score = NamedNumber(new_score, score.name + " {}".format(ns_score)) return score
def get_qck_queries_all() -> List[QCKQuery]:
    pc_itr = enum_perspective_clusters()
    claim_text_d: Dict[int, str] = get_all_claim_d()
    query_list = []
    for pc in pc_itr:
        c_text = claim_text_d[pc.claim_id]
        pid = min(pc.perspective_ids)
        p_text = perspective_getter(pid)
        text = c_text + " " + p_text
        query = QCKQuery(get_pc_cluster_query_id(pc), text)
        query_list.append(query)
    return query_list
def scorer(lucene_score, query_id) -> NamedNumber:
    nonlocal found_claim
    claim_id, p_id = query_id.split("_")
    i_claim_id = int(claim_id)
    if i_claim_id in q_tf_replace_norm:
        ex_qtf = q_tf_replace_norm[i_claim_id]
        ex_qtf = Counter(dict(ex_qtf.most_common(50)))
        qtf = ex_qtf + c_qtf_d[i_claim_id]
        found_claim.add(i_claim_id)
    else:
        qtf = c_qtf_d[i_claim_id]
    p_text = perspective_getter(int(p_id))
    p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
    score = bm25_module.score_inner(qtf, Counter(p_tokens))
    return score
def main():
    claim_text_d: Dict[int, str] = get_all_claim_d()
    evidence_d = load_evidence_dict()
    evidence_gold = evidence_gold_dict()
    while True:
        s = input()
        cid, pid = s.split("_")
        cid = int(cid)
        pid = int(pid)
        print("Claim: ", claim_text_d[cid])
        print("Perspective: ", perspective_getter(pid))
        key = cid, pid
        e_ids = evidence_gold[key]
        for eid in e_ids:
            print("Evidence: ", evidence_d[eid])
def main(input_path):
    # Evaluate a perspective ranked list: compute precision at min(#gold, 5) per claim
    # and print the gold clusters and top entries for claims that score below 0.3.
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    grouped_ranked_list = load_ranked_list_grouped(input_path)

    def is_correct(qid: str, doc_id: str):
        return any([int(doc_id) in cluster for cluster in gold[int(qid)]])

    top_k = 5
    for qid, entries in grouped_ranked_list.items():
        n_gold = sum(map(len, gold[int(qid)]))
        cut_n = min(n_gold, top_k)
        correctness = list([is_correct(qid, e.doc_id) for e in entries[:cut_n]])
        num_correct = sum(lmap(int, correctness))
        p_at_k = num_correct / cut_n
        pid_to_rank: Dict[str, int] = {e.doc_id: e.rank for e in entries}

        def get_rank(pid: int):
            if str(pid) in pid_to_rank:
                return pid_to_rank[str(pid)]
            else:
                return "X"

        if p_at_k < 0.3:
            print(n_gold)
            print(p_at_k)
            print("Claim {} {}".format(qid, claim_d[int(qid)]))
            for cluster in gold[int(qid)]:
                print("-")
                for pid in cluster:
                    print("[{}]".format(get_rank(pid)), perspective_getter(int(pid)))
            for e in entries[:50]:
                correct_str = "Y" if is_correct(qid, e.doc_id) else "N"
                print("{} {} {}".format(correct_str, e.score, perspective_getter(int(e.doc_id))))
def get_qck_queries(split) -> List[QCKQuery]:
    claim_ids = set(load_claim_ids_for_split(split))
    pc_itr = enum_perspective_clusters_for_split(split)
    claim_text_d: Dict[int, str] = get_all_claim_d()
    query_list = []
    for pc in pc_itr:
        if pc.claim_id in claim_ids:
            c_text = claim_text_d[pc.claim_id]
            pid = min(pc.perspective_ids)
            p_text = perspective_getter(pid)
            text = c_text + " " + p_text
            query = QCKQuery(get_pc_cluster_query_id(pc), text)
            query_list.append(query)
    return query_list
def scorer(lucene_score, query_id) -> NamedNumber: claim_id, p_id = query_id.split("_") c_text = cid_to_text[int(claim_id)] p_text = perspective_getter(int(p_id)) score: NamedNumber = bm25_module.score(c_text, p_text) nclaim_id = int(claim_id) if nclaim_id in rm_info: ex_qtf = rm_info_c[nclaim_id] p_tokens = tokenizer.tokenize_stem(p_text) ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens)) new_info = score.name + "({})".format(ex_score.name) score = NamedNumber(score + ex_score, new_info) else: not_found.add(claim_id) return score
def scorer(lucene_score, query_id) -> NamedNumber: claim_id, p_id = query_id.split("_") c_text = cid_to_text[int(claim_id)] p_text = perspective_getter(int(p_id)) qtf = Counter(stem_tokenize(c_text)) weight = claim_term_weight[int(claim_id)] new_qtf = Counter() for k, v in qtf.items(): try: w = weight[k] new_qtf[k] = w * v except Exception as e: print("Exception") print(e) print(k) tf = Counter(stem_tokenize(p_text)) score = bm25_module.score_inner(new_qtf, tf) return score
def generate_instances(self, claim: Dict, data_id_manager) -> List[PayloadAsTokens]:
    cid = claim['cId']
    claim_tokens = self.tokenizer.tokenize(claim['text'])
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    output = []
    for pid in perspectives:
        is_correct = any([pid in cluster for cluster in self.gold[cid]])
        perspective = perspective_getter(pid)
        perspective_tokens = self.tokenizer.tokenize(perspective)
        for passage_idx, passage in enumerate(left(passages)):
            passage_subtokens = tokenize_from_tokens(self.tokenizer, passage)
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            p = PayloadAsTokens(passage_subtokens,
                                perspective_tokens,
                                claim_tokens,
                                data_id_manager.assign(info),
                                is_correct)
            output.append(p)
    return output
def get_candidates(c: Dict) -> Tuple[int, List[int]]:
    # Candidate pool for one claim: top-50 Lucene results, plus any gold perspectives
    # missing from that pool ("hard" candidates), plus results of a second query built
    # from the vocabulary of those missed perspectives that the claim does not cover.
    cid = c["cId"]
    assert type(cid) == int
    claim_text = c["text"]
    claim_tokens = tokenizer.tokenize_stem(claim_text)
    top_k = 50
    lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
    candidate_list: List[int] = []
    for rank, (_text, _pid, _score) in enumerate(lucene_results):
        candidate_list.append(_pid)

    gold_pids = cid_to_pids[int(cid)]

    hard_candidate = []
    mismatch_voca = Counter()
    for pid in gold_pids:
        if pid not in candidate_list:
            hard_candidate.append(pid)
            p_text = perspective_getter(pid)
            p_tokens = tokenizer.tokenize_stem(p_text)
            for t in p_tokens:
                if t not in claim_tokens:
                    mismatch_voca[t] += 1

    candidate_list.extend(hard_candidate)
    mismatch_tf_idf = get_tf_idf(mismatch_voca)
    new_qterms = left(mismatch_tf_idf.most_common(30))
    lucene_results = es_helper.get_perspective_from_pool(" ".join(new_qterms), top_k)
    for rank, (_text, _pid, _score) in enumerate(lucene_results):
        if _pid not in candidate_list:
            candidate_list.append(_pid)
    return cid, candidate_list
def doc_id_to_candidate(doc_id: str) -> QCKCandidate:
    return QCKCandidate(doc_id, perspective_getter(int(doc_id)))
def get_qck_candidate_from_candidate_id(candidate_id: str):
    text = perspective_getter(int(candidate_id))
    return QCKCandidate(candidate_id, text)
def cid_pid_format_to_qck(candidate_pers):
    candidate_dict: Dict[str, List[QCKCandidate]] = dict()
    for cid, candidate_pids in candidate_pers:
        candidate_dict[str(cid)] = lmap(
            lambda pid: QCKCandidate(str(pid), perspective_getter(pid)),
            candidate_pids)
    return candidate_dict
def get_p_tokens(self, pid: int):
    if pid not in self.p_tokens_d:
        text = perspective_getter(pid)
        self.p_tokens_d[pid] = self.tokenizer.tokenize(text)
    return self.p_tokens_d[pid]
def main():
    while True:
        s = input()
        pid = int(s)
        print(perspective_getter(pid))
def pc_predict_to_inspect(bm25_module: BM25,
                          q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter],
                          claims,
                          top_k):
    # Debugging variant of the BM25 predictor: prints gold clusters, the (expanded)
    # query term weights, and per-candidate scores so rankings can be inspected manually.
    gold = get_claim_perspective_id_dict()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]

        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""
            e = score, pid, p_text, debug_str
            ranked_list.append(e)

        gold_pids = gold[cid]

        def is_correct(pid):
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)

        qtf_idf_applied = {k: v * bm25_module.term_idf_factor(k) for k, v in qtf.items()}
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf applied:", counter_to_str(qtf_idf_applied))

        for score, pid, p_text, debug_str in ranked_list[:top_k]:
            if i_claim_id in q_tf_replace_0_norm:
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
                correct_str = "Y" if is_correct(pid) else "N"
                print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                    correct_str, score, score2, p_text, score.name, score2.name))
def get_tokens(pid):
    if pid not in tokens_d:
        text = perspective_getter(pid)
        tokens_d[pid] = tokenizer.tokenize(text)
    return tokens_d[pid]
def scorer(lucene_score, query_id) -> NamedNumber: claim_id, p_id = query_id.split("_") c_text = cid_to_text[int(claim_id)] p_text = perspective_getter(int(p_id)) score = bm25_module.score(c_text, p_text) return score