import json
import os
from collections import Counter
from typing import Dict, Iterable, List

# NOTE: project-specific helpers used below (get_qck_queries, QCInstanceGenerator,
# DataIDManager, QCKQuery, QCKCandidate, lmap, lflatten, etc.) are assumed to be
# imported elsewhere in this module.


def do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split):
    # Build (query, candidate) instances for the split, serialize them as records,
    # and keep the data-id -> info mapping alongside the output.
    queries = get_qck_queries(split)
    generator = QCInstanceGenerator(candidate_dict, is_correct_fn)
    data_id_manager = DataIDManager()
    insts = generator.generate(queries, data_id_manager)
    save_path = os.path.join(save_dir, split)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    with open(save_path + ".info", "w") as info_f:
        json.dump(data_id_manager.id_to_info, info_f)
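# Hedged sketch (not part of the original module): do_generate_jobs expects an
# is_correct_fn, which is not defined in this section. One plausible implementation
# labels a (query, candidate) pair as positive when the candidate's evidence id is in
# the gold evidence for that query; the (query, candidate) call signature is an
# assumption about QCInstanceGenerator's contract.
def make_gold_is_correct_fn():
    gold: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def is_correct_fn(query: QCKQuery, candidate: QCKCandidate) -> bool:
        return int(candidate.get_id()) in gold.get(query.query_id, [])

    return is_correct_fn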
def get_candidate_full_text(split) -> Dict[str, List[QCKCandidate]]:
    queries = get_qck_queries(split)

    def get_candidate_for_query(query: QCKQuery):
        res = get_evidence_from_pool(query.text, 60)
        output = []
        for text, e_id, score in res:
            c = QCKCandidate(str(e_id), text)
            output.append(c)
        return output

    qid_list = lmap(lambda q: q.query_id, queries)
    candidate_list_list = lmap(get_candidate_for_query, queries)
    return dict(zip(qid_list, candidate_list_list))
def get_query_lms(split) -> Dict[str, Counter]:
    # Build a unigram language model per query from its gold evidence texts.
    evi_dict: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def get_evidence_texts(query: QCKQuery) -> List[str]:
        query_id = query.query_id
        e_ids: List[int] = evi_gold_dict[query_id]
        return [evi_dict[eid] for eid in e_ids]

    def get_query_lm(query: QCKQuery) -> Counter:
        return text_list_to_lm(tokenizer, get_evidence_texts(query))

    lms = lmap(get_query_lm, queries)
    qids = lmap(QCKQuery.get_id, queries)
    query_lms: Dict[str, Counter] = dict(zip(qids, lms))
    return query_lms
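# Hedged illustration (not part of the original module): one way a per-query language
# model returned by get_query_lms could be used to score a candidate text. The
# whitespace tokenization and the additive smoothing constant are assumptions for the
# sketch; the real pipeline builds the LMs with PCTokenizer and text_list_to_lm above.
import math


def score_text_against_lm_sketch(lm: Counter, text: str, alpha: float = 0.1) -> float:
    total = sum(lm.values())
    vocab_size = max(len(lm), 1)
    score = 0.0
    for token in text.lower().split():
        # Laplace-style smoothing so unseen tokens do not drive the log-probability to -inf.
        prob = (lm[token] + alpha) / (total + alpha * vocab_size)
        score += math.log(prob)
    return score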
def get_candidate(split) -> Dict[str, List[QCKCandidateI]]:
    tokenizer = get_tokenizer()
    queries = get_qck_queries(split)
    max_seq_length = 512

    def get_candidate_for_query(query: QCKQuery):
        res = get_evidence_from_pool(query.text, 60)
        query_len = len(tokenizer.tokenize(query.text))
        # Reserve room for the query tokens and the 3 special tokens ([CLS], [SEP], [SEP])
        # of a query-candidate pair, then split long evidence into passages that fit.
        candidate_max_len = max_seq_length - 3 - query_len
        output = []
        for text, e_id, score in res:
            tokens = tokenizer.tokenize(text)
            for passage in enum_passage(tokens, candidate_max_len):
                c = QCKCandidateWToken(str(e_id), "", passage)
                output.append(c)
        return output

    qid_list = lmap(lambda q: q.query_id, queries)
    candidate_list_list = lmap(get_candidate_for_query, queries)
    return dict(zip(qid_list, candidate_list_list))
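# Hedged sketch (assumption, not the project's implementation): enum_passage is used
# above to split a long token sequence into passages that fit the per-candidate length
# budget. A minimal non-overlapping chunking of that kind is shown below; the real
# helper may use a different stride or boundary handling.
from typing import Iterator


def enum_passage_sketch(tokens: List[str], window_size: int) -> Iterator[List[str]]:
    if window_size <= 0:
        raise ValueError("window_size must be positive")
    for start in range(0, len(tokens), window_size):
        yield tokens[start:start + window_size]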
def get_ex_candidate_for_training(split,
                                  balanced=True,
                                  cached=False) -> Dict[str, List[QCKCandidateI]]:
    if cached:
        bow_ranked = load_top_rank_candidate(split)
    else:
        bow_ranked = get_candidate(split)
    tokenizer = get_tokenizer()
    evi_dict: Dict[int, str] = load_evidence_dict()
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    queries = get_qck_queries(split)
    max_seq_length = 512
    out_d = {}
    for query in queries:
        qid = query.query_id
        c_list = bow_ranked[qid]
        gold_e_ids: List[int] = evi_gold_dict[qid]
        top_ranked: List[int] = lmap(int, map(QCKCandidate.get_id, c_list))
        query_len = len(tokenizer.tokenize(query.text))
        candidate_max_len = max_seq_length - 3 - query_len

        # Take non-gold evidence from the BoW ranking as negatives; when balanced,
        # stop once the negatives match the number of gold evidence ids.
        neg_e_ids = []
        for e_id in set(top_ranked):
            if e_id not in gold_e_ids:
                neg_e_ids.append(e_id)
                if balanced and len(neg_e_ids) == len(gold_e_ids):
                    break

        def make_candidate(e_id: int) -> Iterable[QCKCandidate]:
            text = evi_dict[e_id]
            tokens = tokenizer.tokenize(text)
            for passage in enum_passage(tokens, candidate_max_len):
                yield QCKCandidateWToken(str(e_id), "", passage)

        new_list = lflatten(map(make_candidate, gold_e_ids + neg_e_ids))
        out_d[qid] = new_list
    return out_d
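# Hedged wiring sketch (not in the original module): generating balanced training
# records per split by combining the helpers above with the gold-based is_correct_fn
# sketched earlier. The split names and the output directory are placeholders for
# illustration.
def generate_training_jobs_sketch(save_dir: str = "qck_training_jobs"):
    is_correct_fn = make_gold_is_correct_fn()
    for split in ["train", "dev"]:
        candidate_dict = get_ex_candidate_for_training(split, balanced=True)
        do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split)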