def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        # Sort this query's candidate documents by score (descending) and keep the top_n ids.
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        # Prefetch the tokenized documents before iterating over them.
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        # Slice each document into passages using the configured window and step sizes.
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
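
A minimal usage sketch for qk_candidate_gen above. The paths, the config values, and the use of save_to_pickle are illustrative assumptions, not code from the original project.

def run_qk_candidate_gen():
    # Hypothetical config: 'top_n', 'window_size', and 'step_size' are the keys
    # read by qk_candidate_gen above; the values here are placeholders only.
    config = {'top_n': 10, 'window_size': 300, 'step_size': 300}
    for split in splits:
        # Hypothetical locations for the ranked list and the document scores.
        q_res_path = os.path.join(output_path, "q_res_{}.txt".format(split))
        doc_score_path = os.path.join(output_path, "doc_scores_{}".format(split))
        qk_units = qk_candidate_gen(q_res_path, doc_score_path, split, config)
        save_to_pickle(qk_units, "qk_candidate_{}".format(split))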
Example #2
def main():
    save_dir = os.path.join(output_path, "pc_qc")
    exist_or_mkdir(save_dir)
    for split in splits:
        queries = get_qck_queries(split)
        eval_candidate = get_eval_candidates_as_qck(split)
        save_path = os.path.join(save_dir, split)
        make_pc_qc(queries, eval_candidate, is_correct_factory(), save_path)
Example #3
def main():
    for split in splits:
        q_res_path = os.path.join(output_path, "perspective_experiments",
                                  "clueweb_qres", "{}.txt".format(split))
        qck_queries = get_qck_queries(split)
        candidate = get_qk_candidate(config1(), q_res_path, qck_queries)
        print("Num candidate : {}", len(candidate))
        save_to_pickle(candidate, "pc_qk2_{}".format(split))
Example #4
def get_qck_gen_dynamic_kdp():
    split = "train"
    candidate_d: Dict[str, List[QCKCandidate]] = get_extended_eval_candidate_as_qck(split)

    train2_claims = load_claims_for_sub_split("val")

    target_qids = [str(c['cId']) for c in train2_claims]
    queries = get_qck_queries(split)
    queries = [q for q in queries if q.query_id in target_qids]
    return QCKGenDynamicKDP(queries, candidate_d, get_is_correct_fn())
Example #5
def main():
    save_dir = os.path.join(output_path, "pc_qc2")
    exist_or_mkdir(save_dir)
    for split in splits:
        queries = get_qck_queries(split)
        q_res_path = os.path.join("output",
                                  "perspective_experiments",
                                  "q_res_{}.txt".format(split))
        eval_candidate = get_qck_candidate_from_ranked_list_path(q_res_path)
        save_path = os.path.join(save_dir, split)
        make_pc_qc(queries, eval_candidate, is_correct_factory(), save_path)
Example #6
def make_qcknc_problem(
    passage_score_path: FilePath,
    info_path: FilePath,
    config_path: FilePath,
    split: str,
    save_name: str,
) -> None:
    candidate_dict: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle(split))
    queries: List[QCKQuery] = get_qck_queries(split)

    config = json.load(open(config_path, "r"))

    def get_pids(l: List[Dict]) -> List[str]:
        return lmap(lambda x: x['pid'], l)

    candidate_id_dict_1: Dict[int, List[str]] = dict_value_map(
        get_pids, candidate_dict)
    candidate_id_dict: Dict[str,
                            List[str]] = dict_key_map(str, candidate_id_dict_1)

    all_candidate_ids = set(flatten(candidate_id_dict.values()))
    candidate_dict: Dict[str, QCKCandidate] = {
        cid: get_qck_candidate_from_candidate_id(cid)
        for cid in all_candidate_ids
    }

    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    print("number of dat info ", len(data_id_to_info))
    qk_result: List[Tuple[str, List[QKOutEntry]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)

    query_dict = {q.query_id: q for q in queries}
    payloads = qck_from_qk_results(qk_result, candidate_id_dict, query_dict,
                                   candidate_dict)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    data_id_man = write_qck_as_tfrecord(save_path, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example #7
def get_candidates(q_res_path, split, config) -> List[QKUnit]:
    queries = get_qck_queries(split)
    top_n = config['top_n']
    candidate: List[Tuple[QCKQuery, List[KDP]]] = qk_candidate_gen(
        q_res_path, queries, top_n, config)
    return candidate
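
A hedged usage sketch for get_candidates, mirroring the main() examples above. The ranked-list path is an assumption, and the config dict only shows 'top_n'; the qk_candidate_gen call inside may expect further keys.

def main():
    for split in splits:
        # Hypothetical ranked-list location, following the path layout used above.
        q_res_path = os.path.join(output_path, "perspective_experiments",
                                  "clueweb_qres", "{}.txt".format(split))
        config = {'top_n': 10}  # placeholder; downstream code may need more keys
        candidates = get_candidates(q_res_path, split, config)
        save_to_pickle(candidates, "candidate_{}".format(split))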