def qck_gen(job_name, qk_candidate_name, candidate_ranked_list_path, kdp_ranked_list_path, split):
    """Generate score-carrying QCK instances for one split and launch the jobs.

    Loads the pickled QK candidates, keeps only those whose query belongs to
    this split, and hands them to a JobRunnerS with a QCKInstGenWScore-based
    worker per job.
    """
    cids: List[str] = lmap(str, load_claim_ids_for_split(split))
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    kdp_ranked_list: Dict[str, List[TrecRankedListEntry]] = \
        load_ranked_list_grouped(kdp_ranked_list_path)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstGenWScore(
        get_qck_candidate_from_ranked_list_path(candidate_ranked_list_path),
        is_correct_factory(),
        kdp_ranked_list)
    # Keep only the QK units whose query id falls inside this split.
    selected_qk: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in cids]

    def make_worker(out_dir):
        # One worker per job directory.
        return QCKWorker(selected_qk, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split, make_worker)
    runner.start()
def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    """Retrieve top-k perspective candidates from the ES pool for each claim.

    Returns a list of (claim id, candidate entry list) pairs; each entry
    carries the claim/perspective texts, stemmed perspective tokens, and a
    rationale string recording the ES rank and score.
    """
    claim_id_list = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(claim_id_list)
    tokenizer = PCTokenizer()

    def candidates_for(claim: Dict) -> Tuple[int, List[Dict]]:
        cid = claim["cId"]
        assert type(cid) == int
        claim_text = claim["text"]
        hits = es_helper.get_perspective_from_pool(claim_text, top_k)
        entries = []
        for rank, (p_text, p_id, p_score) in enumerate(hits):
            entries.append({
                'cid': cid,
                'pid': p_id,
                'claim_text': claim_text,
                'perspective_text': p_text,
                'p_tokens': tokenizer.tokenize_stem(p_text),
                'rationale': "es_rank={} , es_score={}".format(rank, p_score),
            })
        return cid, entries

    return lmap(candidates_for, claims)
def main():
    """Print every claim of the split named by argv[1] as 'Claim <id> :\\t<text>'."""
    split = sys.argv[1]
    claim_ids = load_claim_ids_for_split(split)
    for claim in get_claims_from_ids(claim_ids):
        print("Claim {} :\t{}".format(claim['cId'], claim['text']))
def write_simple_claim_queries():
    """Write simple claim queries for every split to perspective_query/simple_query_<split>.json."""
    for split in splits:
        ids = load_claim_ids_for_split(split)
        queries = get_simple_claim_query(get_claims_from_ids(ids), True)
        out_path = os.path.join(output_path, "perspective_query",
                                "simple_query_{}.json".format(split))
        save_queries_to_file(queries, out_path)
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    """Build an extended perspective-candidate pool per claim.

    Three sources, appended in order so earlier entries keep their rank:
      1. top-50 ES retrieval on the claim text,
      2. gold perspectives missed by step 1 ("hard" candidates),
      3. a second ES retrieval using the highest tf-idf terms that appear in
         the missed gold perspectives but not in the claim.

    Returns a dict mapping claim id -> ordered list of perspective ids.
    """
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        # Score each term as count * idf (tf-idf style weighting).
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        # Round 1: retrieve by the raw claim text.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)
        # Gold perspectives that round 1 missed become hard candidates; their
        # tokens absent from the claim feed the expansion vocabulary.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)
                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1
        candidate_list.extend(hard_candidate)
        # Round 2: re-query with the 30 highest-weighted mismatch terms and
        # append only previously unseen perspective ids.
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)
        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
def generate_pair_insts(split) -> Iterable[Instance]:
    """Build labeled perspective-pair instances for a split.

    Positives are pairs from the same cluster; negatives come from different
    clusters of the same claim (sampled at 3x the positive count) and from
    different claims (sampled at 6x). The combined list is shuffled.
    """
    pos_rate = 1  # documents the intended 1:3:6 ratio; not read below
    neg1_rate = 3
    neg2_rate = 6
    claim_id_list: List[int] = list(load_claim_ids_for_split(split))
    cluster_d: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def same_cluster_example() -> Iterator[Tuple[int, int]]:
        # Positive pairs: both perspectives belong to one cluster.
        for cid in claim_id_list:
            for cluster in cluster_d[cid]:
                yield from combinations(cluster, 2)

    def same_claim_different_cluster() -> Iterator[Tuple[int, int]]:
        # Negative type 1: same claim, different clusters.
        for cid in claim_id_list:
            for cluster_a, cluster_b in combinations(cluster_d[cid], 2):
                for pa in cluster_a:
                    for pb in cluster_b:
                        yield pa, pb

    def different_claim() -> Iterator[Tuple[int, int]]:
        # Negative type 2: perspectives drawn from two different claims.
        # NOTE(review): this enumerates all claim pairs, so the materialized
        # list below can be very large for big splits.
        for cid_a, cid_b in combinations(claim_id_list, 2):
            for pa in flatten(cluster_d[cid_a]):
                for pb in flatten(cluster_d[cid_b]):
                    yield pa, pb

    pos: List[Tuple[int, int]] = list(same_cluster_example())
    neg1: List[Tuple[int, int]] = list(same_claim_different_cluster())
    neg2: List[Tuple[int, int]] = list(different_claim())
    pos_len = len(pos)
    neg1_len = pos_len * neg1_rate
    neg2_len = pos_len * neg2_rate
    print("pos/neg1/neg2 = {}/{}/{}".format(pos_len, neg1_len, neg2_len))
    # Shuffle before truncation so the kept negatives are a random sample.
    random.shuffle(neg1)
    random.shuffle(neg2)
    neg1 = neg1[:neg1_len]
    neg2 = neg2[:neg2_len]
    pos_data = [Instance(p1, p2, 1) for p1, p2 in pos]
    neg_data = [Instance(p1, p2, 0) for p1, p2 in neg1 + neg2]
    all_data = pos_data + neg_data
    random.shuffle(all_data)
    return all_data
def load_data_point_50_train_val(split):
    """Load candidate data points after a 7:3 split of the 'train' claim ids.

    split == "train" uses the 70% portion, "val" the 30% portion; any other
    value falls back to all 'train' claims.
    """
    all_ids = list(load_claim_ids_for_split("train"))
    train_ids, val_ids = split_7_3(all_ids)
    print(len(all_ids), len(train_ids), len(val_ids))
    if split == "train":
        sel_ids = train_ids
    elif split == "val":
        sel_ids = val_ids
    else:
        sel_ids = all_ids
        print("Taking all claims in {}".format(split))
    claims = get_claims_from_ids(sel_ids)
    return get_candidates(claims, False)
def get_qck_queries(split) -> List[QCKQuery]:
    """One QCKQuery per perspective cluster of the split.

    The query text is the claim text concatenated with the text of the
    cluster's representative perspective (the minimum perspective id).
    """
    split_claim_ids = set(load_claim_ids_for_split(split))
    claim_text_d: Dict[int, str] = get_all_claim_d()
    queries = []
    for pc in enum_perspective_clusters_for_split(split):
        if pc.claim_id not in split_claim_ids:
            continue
        rep_pid = min(pc.perspective_ids)
        text = claim_text_d[pc.claim_id] + " " + perspective_getter(rep_pid)
        queries.append(QCKQuery(get_pc_cluster_query_id(pc), text))
    return queries
def multi_qck_gen(job_name, qk_candidate_name, ranked_list_path, split, k_group_size):
    """Generate grouped QCK instances (k_group_size KDPs per group) and launch jobs."""
    cids: List[str] = lmap(str, load_claim_ids_for_split(split))
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKGeneratorGrouped(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory(),
        False,
        k_group_size)
    # Keep only the QK units whose query id falls inside this split.
    selected_qk: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in cids]

    def make_worker(out_dir):
        return QCKWorker(selected_qk, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split, make_worker)
    runner.start()
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name, ranked_list_path, split, n_qk_per_job):
    """Generate QCK instances, batching n_qk_per_job QK units into each job."""
    cids: List[str] = lmap(str, load_claim_ids_for_split(split))
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    # Keep only the QK units whose query id falls inside this split.
    selected_qk: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in cids]

    def make_worker(out_dir):
        return QCKWorkerMultiple(selected_qk, generator, n_qk_per_job, out_dir)

    # NOTE(review): job count is derived from the per-split claim count, not
    # from len(selected_qk) — confirm these are meant to coincide.
    num_qks = d_n_claims_per_split2[split]
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split, make_worker)
    runner.start()
def build_gold_lms_for_split(split) -> List[ClaimLM]:
    """Build gold claim language models for every claim in the given split."""
    claim_ids: Iterable[int] = load_claim_ids_for_split(split)
    return build_gold_lms(get_claims_from_ids(claim_ids))
def get_qck_queries(split) -> List[QCKQuery]:
    """Build QCKQueries directly from the claim ids of the split.

    NOTE(review): another function with this exact name appears earlier in
    this file; if both live in the same module, this later definition shadows
    the earlier one — confirm which is intended.
    """
    claim_ids: List[int] = list(load_claim_ids_for_split(split))
    return get_qck_queries_from_cids(claim_ids)
def load_data_point_50(split):
    """Load candidate data points for the split (never using train-mode candidates)."""
    claims = get_claims_from_ids(list(load_claim_ids_for_split(split)))
    return get_candidates(claims, False)
def load_data_point(split):
    """Load candidate data points; candidate generation runs in train mode only for 'train'."""
    claims = get_claims_from_ids(list(load_claim_ids_for_split(split)))
    return get_candidates(claims, split == "train")