import json
import math
import os
from typing import Dict, List, Tuple

import numpy as np

# Project-internal helpers (QCKWorker, JobRunner, load_claims_for_sub_split,
# load_from_pickle, etc.) are assumed to be importable from the surrounding package.


def get_qck_gen_dynamic_kdp():
    split = "train"
    candidate_d: Dict[str, List[QCKCandidate]] = get_extended_eval_candidate_as_qck(split)
    # Restrict the train queries to claims that belong to the "val" sub-split.
    train2_claims = load_claims_for_sub_split("val")
    target_qids = {str(c['cId']) for c in train2_claims}
    queries = get_qck_queries(split)
    queries = [q for q in queries if q.query_id in target_qids]
    return QCKGenDynamicKDP(queries, candidate_d, get_is_correct_fn())
def do_all_jobs(generator: InstanceGenerator,
                qk_candidate_name,
                name_prefix,
                sub_split):
    print("do all jobs")
    num_jobs = d_n_claims_per_subsplit[sub_split]
    claims = load_claims_for_sub_split(sub_split)
    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    # Keep only QK units whose query belongs to this sub-split.
    qk_candidate = [qk for qk in qk_candidate if qk[0].query_id in cids]
    job_name = "{}_{}".format(name_prefix, sub_split)
    out_dir = os.path.join(job_man_dir, job_name)
    exist_or_mkdir(out_dir)
    worker = QCKWorker(qk_candidate, generator, out_dir)
    # Run every job in-process; job indices 0..num_jobs inclusive.
    for i in range(num_jobs + 1):
        worker.work(i)
def start_generate_jobs_for_sub_split(generator: InstanceGenerator,
                                      qk_candidate_name,
                                      name_prefix,
                                      sub_split):
    # Claim ids are split into train/val sub-splits.
    print("Loading data ....")
    claims = load_claims_for_sub_split(sub_split)
    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("Generate instances : ", sub_split)
    # Keep only QK units whose query belongs to this sub-split.
    qk_candidate = [qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    num_jobs = d_n_claims_per_subsplit[sub_split]
    runner = JobRunner(job_man_dir, num_jobs,
                       name_prefix + "_" + sub_split, worker_factory)
    runner.auto_runner()
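# A minimal driver sketch for the function above (names are illustrative
# assumptions, not repo values): assuming `generator` is a ready-made
# InstanceGenerator and the QK candidates were pickled under "qk_candidate_train":
#
#   start_generate_jobs_for_sub_split(generator, "qk_candidate_train", "qck_gen", "train")
#   start_generate_jobs_for_sub_split(generator, "qk_candidate_val", "qck_gen", "val")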
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()
    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            # Log-odds style score; the epsilon guards against log(0).
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))
        # Sort by diff (the second tuple field), descending.
        entry.sort(key=get_second, reverse=True)
        # Keep at most the top 100 words whose diff exceeds the threshold.
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word_list.append(word.strip())
        all_d[query_id] = word_list
    # Use a context manager so the output file is properly closed.
    with open(save_path, "w") as f:
        json.dump(all_d, f)
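# A minimal invocation sketch for the word-selection main() above. The paths and
# the threshold value are illustrative assumptions; only the config keys themselves
# come from the code.
def demo_word_selection():
    config = {
        "split": "val",                         # sub-split to load claims for
        "word_prob_path": "word_probs.pickle",  # hypothetical input pickle
        "save_path": "selected_words.json",     # hypothetical output path
        "threshold": 0.0,                       # log-odds cutoff
    }
    main(config)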
def run_jobs_with_qk_candidate(generator: InstanceGenerator,
                               sub_split,
                               qk_candidate_name,
                               name_prefix):
    n_claims_per_split = d_n_claims_per_subsplit[sub_split]
    print("Loading data ....")
    claims = load_claims_for_sub_split(sub_split)
    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("Generate instances : {}".format(sub_split))
    qk_candidate_val: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunner(job_man_dir, n_claims_per_split,
                       name_prefix + "_" + sub_split, worker_factory)
    runner.start()
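# Note: the three job drivers above differ only in how jobs are executed:
#   do_all_jobs                       -- runs every job index in-process, sequentially
#   start_generate_jobs_for_sub_split -- delegates to JobRunner.auto_runner()
#   run_jobs_with_qk_candidate        -- delegates to JobRunner.start()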
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue
            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))
        # Print the claim text, then its top-100 words sorted by diff, descending.
        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(word, diff, pos, neg))
def main():
    run_name = "es"
    for split in ["dev", "test"]:
        claims = load_claims_for_sub_split(split)
        candidates_data: List[Tuple[Dict, List[Dict]]] = get_all_candidate(claims)
        flat_entries = []
        for c, candidates in candidates_data:
            assert len(candidates) <= 50
            print(len(candidates))
            query_id = str(c["cId"])
            # Note: rank is 0-based here, following enumerate().
            for rank, e in enumerate(candidates):
                doc_id = str(e['pid'])
                score = e['score']
                entry = TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
                flat_entries.append(entry)
        save_path = os.path.join(output_path, "ranked_list",
                                 "pc_es_{}.txt".format(split))
        write_trec_ranked_list_entry(flat_entries, save_path)
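# For reference, a hedged sketch of what write_trec_ranked_list_entry likely emits,
# assuming the standard six-column TREC run format and that TrecRankedListEntry
# exposes fields mirroring its constructor arguments (both are assumptions about
# the helpers, not their actual implementation):
def write_trec_run_sketch(entries, save_path):
    with open(save_path, "w") as f:
        for e in entries:
            # query_id  Q0  doc_id  rank  score  run_name
            f.write("{} Q0 {} {} {} {}\n".format(
                e.query_id, e.doc_id, e.rank, e.score, e.run_name))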
def build_gold_lms_for_sub_split(sub_split) -> List[ClaimLM]:
    claims = load_claims_for_sub_split(sub_split)
    return build_gold_lms(claims)
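# Usage sketch: build gold claim LMs for one sub-split (the "val" name is an
# assumption consistent with the sub-split names used above):
#
#   claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split("val")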