Ejemplo n.º 1
0
def do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split):
    """Generate instances for ``split`` and persist them under ``save_dir``.

    The instances produced by ``QCInstanceGenerator.generate`` are handed to
    ``write_records_w_encode_fn`` with the path ``<save_dir>/<split>``, and
    ``data_id_manager.id_to_info`` is dumped as JSON to
    ``<save_dir>/<split>.info``.

    Args:
        candidate_dict: Forwarded to ``QCInstanceGenerator``.
        is_correct_fn: Forwarded to ``QCInstanceGenerator``.
        save_dir: Directory component of both output paths.
        split: Selects the queries (via ``get_qck_queries``) and names the
            output files.
    """
    queries = get_qck_queries(split)
    generator = QCInstanceGenerator(candidate_dict, is_correct_fn)
    data_id_manager = DataIDManager()
    insts = generator.generate(queries, data_id_manager)
    save_path = os.path.join(save_dir, split)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    # Open the info file in a context manager so it is flushed and closed
    # deterministically, even if JSON serialization raises.
    with open(save_path + ".info", "w") as info_file:
        json.dump(data_id_manager.id_to_info, info_file)
Ejemplo n.º 2
0
def get_candidate_full_text(split) -> Dict[str, List[QCKCandidate]]:
    """Map each query id of ``split`` to its full-text evidence candidates.

    For every query returned by ``get_qck_queries(split)``,
    ``get_evidence_from_pool(query.text, 60)`` is called and each returned
    ``(text, e_id, score)`` triple becomes ``QCKCandidate(str(e_id), text)``.
    The score is not used.
    """
    queries = get_qck_queries(split)

    def query_id_of(query: QCKQuery):
        return query.query_id

    def candidates_for(query: QCKQuery):
        retrieved = get_evidence_from_pool(query.text, 60)
        return [
            QCKCandidate(str(e_id), text)
            for text, e_id, _score in retrieved
        ]

    query_ids = lmap(query_id_of, queries)
    per_query_candidates = lmap(candidates_for, queries)
    return dict(zip(query_ids, per_query_candidates))
Ejemplo n.º 3
0
def get_query_lms(split) -> Dict[str, Counter]:
    """Build a ``Counter`` for each query of ``split`` from its gold evidence.

    For every query, the gold evidence ids are looked up in
    ``evidence_gold_dict_str_qid()``, resolved to texts through
    ``load_evidence_dict()``, and passed to ``text_list_to_lm`` together
    with a ``PCTokenizer``. The result maps ``QCKQuery.get_id(query)`` to
    that Counter.

    Raises:
        KeyError: If a query id has no gold entry, or a gold evidence id is
            missing from the evidence dict.
    """
    evi_dict: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def build_lm(query: QCKQuery) -> Counter:
        gold_ids: List[int] = evi_gold_dict[query.query_id]
        gold_texts: List[str] = [evi_dict[e_id] for e_id in gold_ids]
        return text_list_to_lm(tokenizer, gold_texts)

    lms = lmap(build_lm, queries)
    qids = lmap(QCKQuery.get_id, queries)
    return dict(zip(qids, lms))
Ejemplo n.º 4
0
def get_candidate(split) -> Dict[str, List[QCKCandidateI]]:
    """Map each query id of ``split`` to tokenized passage candidates.

    For every query, ``get_evidence_from_pool(query.text, 60)`` is called;
    each returned text is tokenized and split by ``enum_passage`` into
    passages of at most ``512 - 3 - len(query tokens)`` tokens. Every
    passage becomes ``QCKCandidateWToken(str(e_id), "", passage)``.
    """
    tokenizer = get_tokenizer()
    queries = get_qck_queries(split)
    max_seq_length = 512

    def passages_for(query: QCKQuery):
        retrieved = get_evidence_from_pool(query.text, 60)
        # The 3 reserved positions are presumably for special tokens
        # (e.g. [CLS]/[SEP]) -- TODO confirm against the encoder.
        n_query_tokens = len(tokenizer.tokenize(query.text))
        passage_budget = max_seq_length - 3 - n_query_tokens
        return [
            QCKCandidateWToken(str(e_id), "", passage)
            for text, e_id, _score in retrieved
            for passage in enum_passage(tokenizer.tokenize(text),
                                        passage_budget)
        ]

    def query_id_of(query: QCKQuery):
        return query.query_id

    query_ids = lmap(query_id_of, queries)
    per_query_candidates = lmap(passages_for, queries)
    return dict(zip(query_ids, per_query_candidates))
Ejemplo n.º 5
0
def get_ex_candidate_for_training(split,
                                  balanced=True,
                                  cached=False
                                  ) -> Dict[str, List[QCKCandidateI]]:
    """Build per-query training candidates from gold and non-gold evidence.

    For each query of ``split`` the gold evidence ids come from
    ``evidence_gold_dict_str_qid()`` and the negative ids are the ranked
    candidate ids that are not gold. Each selected evidence text is
    tokenized and split by ``enum_passage`` into passages of at most
    ``512 - 3 - len(query tokens)`` tokens, each wrapped as
    ``QCKCandidateWToken(str(e_id), "", passage)``. Gold passages come
    first, negatives after.

    Args:
        split: Split name passed to the query and candidate loaders.
        balanced: When true, stop collecting negatives once their count
            equals the number of gold ids. If a query has no gold ids the
            counts never become equal, so every ranked id is kept.
        cached: When true, ranked candidates come from
            ``load_top_rank_candidate``; otherwise from ``get_candidate``.

    Raises:
        KeyError: If a query id is missing from the ranked candidates or
            the gold dict, or a selected evidence id is missing from the
            evidence dict.
    """
    bow_ranked = load_top_rank_candidate(split) if cached else get_candidate(split)
    tokenizer = get_tokenizer()
    evi_dict: Dict[int, str] = load_evidence_dict()
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    queries = get_qck_queries(split)
    max_seq_length = 512

    def passages_of(e_id: int, max_len: int) -> Iterable[QCKCandidate]:
        # Lazily yield one candidate per passage of the evidence text.
        tokens = tokenizer.tokenize(evi_dict[e_id])
        for passage in enum_passage(tokens, max_len):
            yield QCKCandidateWToken(str(e_id), "", passage)

    out_d = {}
    for query in queries:
        qid = query.query_id
        ranked_candidates = bow_ranked[qid]
        gold_e_ids: List[int] = evi_gold_dict[qid]
        top_ranked: List[int] = lmap(
            int, map(QCKCandidate.get_id, ranked_candidates))
        n_query_tokens = len(tokenizer.tokenize(query.text))
        candidate_max_len = max_seq_length - 3 - n_query_tokens

        # NOTE(review): iterating a set removes duplicate ids but also
        # discards the order of the ranked list, so with balanced=True the
        # negatives kept are not necessarily the highest ranked -- confirm
        # this is intended.
        n_gold = len(gold_e_ids)
        neg_e_ids = []
        for e_id in set(top_ranked):
            if e_id in gold_e_ids:
                continue
            neg_e_ids.append(e_id)
            if balanced and len(neg_e_ids) == n_gold:
                break

        selected_e_ids = gold_e_ids + neg_e_ids
        out_d[qid] = lflatten(
            map(lambda e_id: passages_of(e_id, candidate_max_len),
                selected_e_ids))
    return out_d