コード例 #1
0
def get_qck_gen_dynamic_kdp():
    """Build a QCKGenDynamicKDP restricted to the claims of the "val" sub-split.

    Candidates and queries are loaded for the "train" split. The queries are
    then narrowed to those whose ``query_id`` equals the stringified ``cId``
    of a claim returned by ``load_claims_for_sub_split("val")``.

    Returns:
        A ``QCKGenDynamicKDP`` constructed from the filtered queries, the
        candidate dictionary, and the result of ``get_is_correct_fn()``.
    """
    split = "train"
    candidate_d: Dict[str, List[QCKCandidate]] = get_extended_eval_candidate_as_qck(split)

    train2_claims = load_claims_for_sub_split("val")

    # Use a set so that each membership test below is O(1) instead of
    # scanning a list once per query.
    target_qids = {str(c['cId']) for c in train2_claims}
    queries = [q for q in get_qck_queries(split) if q.query_id in target_qids]
    return QCKGenDynamicKDP(queries, candidate_d, get_is_correct_fn())
コード例 #2
0
ファイル: qcknc_common.py プロジェクト: clover3/Chair
def do_all_jobs(generator: InstanceGenerator, qk_candidate_name, name_prefix,
                sub_split):
    """Run every job of a sub-split one after another in the current process.

    Args:
        generator: instance generator handed to ``QCKWorker``.
        qk_candidate_name: pickle name passed to ``load_from_pickle``.
        name_prefix: prefix of the job name; "_<sub_split>" is appended.
        sub_split: key into ``d_n_claims_per_subsplit`` and the argument of
            ``load_claims_for_sub_split``.
    """
    print("do all jobs")
    last_job_id = d_n_claims_per_subsplit[sub_split]

    # Collect the stringified claim ids of this sub-split.
    claim_ids = set()
    for claim in load_claims_for_sub_split(sub_split):
        claim_ids.add(str(claim['cId']))

    # Keep only the QK units whose query belongs to the sub-split.
    loaded_units: List[QKUnit] = load_from_pickle(qk_candidate_name)
    selected_units: List[QKUnit] = [
        unit for unit in loaded_units if unit[0].query_id in claim_ids
    ]

    suffix = "_{}".format(sub_split)
    out_dir = os.path.join(job_man_dir, name_prefix + suffix)
    exist_or_mkdir(out_dir)

    worker = QCKWorker(selected_units, generator, out_dir)
    # Job ids run from 0 through last_job_id inclusive.
    for job_id in range(last_job_id + 1):
        worker.work(job_id)
コード例 #3
0
ファイル: qcknc_common.py プロジェクト: clover3/Chair
def start_generate_jobs_for_sub_split(generator: InstanceGenerator,
                                      qk_candidate_name, name_prefix,
                                      sub_split):
    """Filter QK candidates to one sub-split and launch a ``JobRunner`` on them.

    Args:
        generator: instance generator handed to each ``QCKWorker``.
        qk_candidate_name: pickle name passed to ``load_from_pickle``.
        name_prefix: prefix of the job name; "_" + sub_split is appended.
        sub_split: key into ``d_n_claims_per_subsplit`` and the argument of
            ``load_claims_for_sub_split``.
    """
    print("Loading data ....")
    # Stringified ids of the claims that belong to this sub-split.
    claim_ids = set()
    for claim in load_claims_for_sub_split(sub_split):
        claim_ids.add(str(claim['cId']))

    all_units: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("Generate instances : ", sub_split)
    selected_units: List[QKUnit] = [
        unit for unit in all_units if unit[0].query_id in claim_ids
    ]

    def worker_factory(out_dir):
        # Each worker shares the same filtered candidates and generator.
        return QCKWorker(selected_units, generator, out_dir)

    job_count = d_n_claims_per_subsplit[sub_split]
    job_name = name_prefix + "_" + sub_split
    JobRunner(job_man_dir, job_count, job_name, worker_factory).auto_runner()
コード例 #4
0
ファイル: save_key_terms.py プロジェクト: clover3/Chair
def main(config):
    """Select key terms per query by log-probability difference and save as JSON.

    For every query in the pickled word-probability table, each non-stopword
    entry is scored with ``log(pos + 1e-10) - log(neg + 1e-10)``. The entries
    are sorted with ``get_second`` as key (descending); among the first 100,
    the words whose score exceeds the threshold are kept.

    Args:
        config: mapping providing 'split', 'word_prob_path', 'save_path'
            and 'threshold'.
    """
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    # NOTE(review): claim_d is not read below; the loads are kept so that any
    # failure they raise for an invalid split still surfaces as before.
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        # Only single-token words are compared against the stopword list.
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()

    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key, probs in d.items():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = probs
            # The small epsilon keeps log() defined when a probability is 0.
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        entry.sort(key=get_second, reverse=True)
        # The threshold is applied only within the first 100 sorted entries.
        all_d[query_id] = [
            word.strip()
            for word, diff, _pos, _neg in entry[:100]
            if diff > threshold
        ]

    # Use a context manager so the file is flushed and closed deterministically
    # rather than whenever the handle happens to be garbage collected.
    with open(save_path, "w") as f:
        json.dump(all_d, f)
コード例 #5
0
ファイル: qcknc_pred_datagen.py プロジェクト: clover3/Chair
def run_jobs_with_qk_candidate(generator: InstanceGenerator,
                               sub_split,
                               qk_candidate_name,
                               name_prefix):
    """Filter QK candidates to one sub-split and start a ``JobRunner`` on them.

    Args:
        generator: instance generator handed to each ``QCKWorker``.
        sub_split: key into ``d_n_claims_per_subsplit`` and the argument of
            ``load_claims_for_sub_split``.
        qk_candidate_name: pickle name passed to ``load_from_pickle``.
        name_prefix: prefix of the job name; "_" + sub_split is appended.
    """
    job_count = d_n_claims_per_subsplit[sub_split]
    print("Loading data ....")

    # Stringified ids of the claims that belong to this sub-split.
    claim_ids = set()
    for claim in load_claims_for_sub_split(sub_split):
        claim_ids.add(str(claim['cId']))

    all_units: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("Generate instances : {}".format(sub_split))
    selected_units: List[QKUnit] = [
        unit for unit in all_units if unit[0].query_id in claim_ids
    ]

    def worker_factory(out_dir):
        # Each worker shares the same filtered candidates and generator.
        return QCKWorker(selected_units, generator, out_dir)

    job_name = name_prefix + "_" + sub_split
    JobRunner(job_man_dir, job_count, job_name, worker_factory).start()
コード例 #6
0
ファイル: print_token_scoring.py プロジェクト: clover3/Chair
def main(config):
    """Print, per query, the top-scoring words by log-probability difference.

    Each non-stopword entry of the pickled word-probability table is scored
    with ``log(pos + 1e-10) - log(neg + 1e-10)``. For every query the claim
    entry is printed first, followed by up to 100 rows (word, diff, pos log,
    neg log) in the order produced by sorting with ``get_second`` descending.

    Args:
        config: mapping providing 'split' and 'word_prob_path'.
    """
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claim_d = claims_to_dict(load_claims_for_sub_split(split))
    stopwords = load_stopwords_for_query()
    tokenizer = get_tokenizer()

    row_format = "{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}"

    for query_id, word_probs in per_query_infos.items():
        scored = []
        for word_id in word_probs:
            tokens: List[str] = decode_word_as_id(tokenizer, word_id)
            # Skip words that consist of exactly one stopword token.
            if len(tokens) == 1 and tokens[0] in stopwords:
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = word_probs[word_id]
            # The small epsilon keeps log() defined when a probability is 0.
            log_pos = math.log(pos + 1e-10)
            log_neg = math.log(neg + 1e-10)
            scored.append((plain_word, log_pos - log_neg, log_pos, log_neg))

        print(query_id, claim_d[int(query_id)])
        top_rows = sorted(scored, key=get_second, reverse=True)[:100]
        for row in top_rows:
            word, diff, pos, neg = row
            print(row_format.format(word.strip(), diff, pos, neg))
コード例 #7
0
ファイル: save_es_ranked_list.py プロジェクト: clover3/Chair
def main():
    """Write the "es" candidate rankings of the dev and test splits as TREC lists.

    For each split, the candidates returned by ``get_all_candidate`` are
    flattened into ``TrecRankedListEntry`` items (rank is the 0-based position
    in the candidate list) and written to
    ``<output_path>/ranked_list/pc_es_<split>.txt``.
    """
    run_name = "es"
    for split in ["dev", "test"]:
        candidates_data: List[Tuple[Dict,
                                    List[Dict]]] = get_all_candidate(
                                        load_claims_for_sub_split(split))

        flat_entries = []
        for claim, candidates in candidates_data:
            n_candidates = len(candidates)
            assert n_candidates <= 50
            print(n_candidates)
            query_id = str(claim["cId"])

            flat_entries.extend(
                TrecRankedListEntry(query_id, str(cand['pid']), rank,
                                    cand['score'], run_name)
                for rank, cand in enumerate(candidates))

        file_name = "pc_es_{}.txt".format(split)
        save_path = os.path.join(output_path, "ranked_list", file_name)
        write_trec_ranked_list_entry(flat_entries, save_path)
コード例 #8
0
def build_gold_lms_for_sub_split(sub_split) -> List[ClaimLM]:
    """Return ``build_gold_lms`` applied to the claims loaded for ``sub_split``."""
    return build_gold_lms(load_claims_for_sub_split(sub_split))