Example 1
def qck_gen(job_name, qk_candidate_name, candidate_ranked_list_path,
            kdp_ranked_list_path, split):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    kdp_ranked_list: Dict[
        str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
            kdp_ranked_list_path)

    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstGenWScore(
        get_qck_candidate_from_ranked_list_path(candidate_ranked_list_path),
        is_correct_factory(), kdp_ranked_list)
    # Keep only the QK units whose query belongs to this split.
    qk_candidate_train: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids
    ]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
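The lmap helper is used throughout these examples but never defined; its call sites (e.g. lmap(str, claim_ids) producing List[str]) imply a list-returning map. A minimal sketch under that assumption:

from typing import Callable, Iterable, List, TypeVar

A = TypeVar("A")
B = TypeVar("B")

def lmap(f: Callable[[A], B], xs: Iterable[A]) -> List[B]:
    # Presumed list-returning map, matching its use in these examples.
    return list(map(f, xs))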
Example 2
def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    # split -> claims
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    tokenizer = PCTokenizer()

    def get_candidates(c: Dict) -> Tuple[int, List[Dict]]:
        cid = c["cId"]
        assert isinstance(cid, int)
        claim_text = c["text"]
        # Retrieve the top-k perspectives for this claim and build one
        # candidate record per hit.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            rationale = "es_rank={}, es_score={}".format(rank, _score)
            p_entry = {
                'cid': cid,
                'pid': _pid,
                'claim_text': claim_text,
                'perspective_text': _text,
                'p_tokens': tokenizer.tokenize_stem(_text),
                'rationale': rationale,
            }
            candidate_list.append(p_entry)
        return cid, candidate_list

    candidates: List[Tuple[int, List[Dict]]] = lmap(get_candidates, claims)
    return candidates
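A hypothetical usage sketch for get_eval_candidates; the split name and the printed field are illustrative, not taken from the repository:

# Fetch the top-50 candidates per claim and peek at the first entry.
candidates = get_eval_candidates("train", top_k=50)
cid, candidate_list = candidates[0]
print(cid, candidate_list[0]["perspective_text"])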
Example 3
def main():
    # Usage: python <script> <split>
    split = sys.argv[1]

    ids = load_claim_ids_for_split(split)
    claims = get_claims_from_ids(ids)

    for c in claims:
        print("Claim {} :\t{}".format(c['cId'], c['text']))
Example 4
def write_simple_claim_queries():
    for split in splits:
        claim_ids = load_claim_ids_for_split(split)
        claims = get_claims_from_ids(claim_ids)
        queries = get_simple_claim_query(claims, True)
        out_path = os.path.join(output_path, "perspective_query",
                                "simple_query_{}.json".format(split))
        save_queries_to_file(queries, out_path)
Example 5
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        # Weight each term by its count (tf) times the BM25 idf factor.
        r = Counter()
        for t, cnt in c.items():
            r[t] = bm25.term_idf_factor(t) * cnt
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert isinstance(cid, int)
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []

        for _text, _pid, _score in lucene_results:
            candidate_list.append(_pid)

        # Gold perspectives missing from the first retrieval round are kept
        # as "hard" candidates; record the terms by which they mismatch the claim.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)

                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1

        candidate_list.extend(hard_candidate)
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)

        for _text, _pid, _score in lucene_results:
            if _pid not in candidate_list:
                candidate_list.append(_pid)

        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
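The left helper is not shown in these examples; since Counter.most_common() yields (term, weight) pairs and the result is joined into a query string, it presumably keeps the first element of each pair. A minimal sketch under that assumption:

from typing import Iterable, List, Tuple, TypeVar

A = TypeVar("A")
B = TypeVar("B")

def left(pairs: Iterable[Tuple[A, B]]) -> List[A]:
    # Presumed behavior: keep the first element of each (term, weight) pair.
    return [a for a, _b in pairs]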
Example 6
def generate_pair_insts(split) -> Iterable[Instance]:
    # Sampling ratios relative to the positives: 1 pos : 3 neg1 : 6 neg2.
    pos_rate = 1
    neg1_rate = 3
    neg2_rate = 6
    ids: List[int] = list(load_claim_ids_for_split(split))
    id_dict: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def same_cluster_example() -> Iterator[Tuple[int, int]]:
        # Positive pairs: two perspectives drawn from the same cluster.
        for claim_id in ids:
            clusters = id_dict[claim_id]
            for cluster in clusters:
                for p1, p2 in combinations(cluster, 2):
                    yield p1, p2

    def same_claim_different_cluster() -> Iterator[Tuple[int, int]]:
        # Hard negatives: perspectives on the same claim, different clusters.
        for claim_id in ids:
            clusters = id_dict[claim_id]
            for cluster1, cluster2 in combinations(clusters, 2):
                for p1 in cluster1:
                    for p2 in cluster2:
                        yield p1, p2

    def different_claim() -> Iterator[Tuple[int, int]]:
        # Easy negatives: perspectives taken from two different claims.
        for cid1, cid2 in combinations(ids, 2):
            clusters1 = id_dict[cid1]
            clusters2 = id_dict[cid2]
            for p1 in flatten(clusters1):
                for p2 in flatten(clusters2):
                    yield p1, p2

    pos: List[Tuple[int, int]] = list(same_cluster_example())
    neg1: List[Tuple[int, int]] = list(same_claim_different_cluster())
    neg2: List[Tuple[int, int]] = list(different_claim())

    pos_len = len(pos)
    # Target negative counts, at fixed ratios to the number of positives.
    neg1_len = pos_len * neg1_rate
    neg2_len = pos_len * neg2_rate

    print("pos/neg1/neg2 = {}/{}/{}".format(pos_len, neg1_len, neg2_len))

    random.shuffle(neg1)
    random.shuffle(neg2)

    neg1 = neg1[:neg1_len]
    neg2 = neg2[:neg2_len]

    pos_data = [Instance(pid1, pid2, 1) for pid1, pid2 in pos]
    neg_data = [Instance(pid1, pid2, 0) for pid1, pid2 in neg1 + neg2]

    all_data = pos_data + neg_data
    random.shuffle(all_data)
    return all_data
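Instance itself is not defined in these snippets; from the calls above it appears to be a (pid1, pid2, label) triple. A hypothetical shape, which may differ from the actual class in the repository:

from typing import NamedTuple

class Instance(NamedTuple):
    # Hypothetical definition inferred from the calls above.
    pid1: int
    pid2: int
    label: int  # 1 = same cluster, 0 = negative pair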
Example 7
def load_data_point_50_train_val(split):
    # The train claim ids are always partitioned 70/30 into train/val subsets.
    d_ids = list(load_claim_ids_for_split("train"))
    train_ids, val_ids = split_7_3(d_ids)
    print(len(d_ids), len(train_ids), len(val_ids))
    if split == "train":
        sel_ids = train_ids
    elif split == "val":
        sel_ids = val_ids
    else:
        sel_ids = d_ids
        print("Taking all train claims for split '{}'".format(split))

    claims = get_claims_from_ids(sel_ids)
    all_data_points = get_candidates(claims, False)
    return all_data_points
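split_7_3 is another unshown helper; the name and usage suggest a 70/30 partition of the id list. A minimal sketch under that assumption (the real helper may shuffle or seed differently):

from typing import List, Tuple

def split_7_3(ids: List[int]) -> Tuple[List[int], List[int]]:
    # Presumed deterministic 70/30 split of the input ids.
    cut = int(len(ids) * 0.7)
    return ids[:cut], ids[cut:]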
Example 8
def get_qck_queries(split) -> List[QCKQuery]:
    claim_ids = set(load_claim_ids_for_split(split))
    pc_itr = enum_perspective_clusters_for_split(split)
    claim_text_d: Dict[int, str] = get_all_claim_d()

    query_list = []
    for pc in pc_itr:
        if pc.claim_id in claim_ids:
            c_text = claim_text_d[pc.claim_id]
            # Represent each cluster by its lowest perspective id and build
            # the query text as claim text followed by that perspective text.
            pid = min(pc.perspective_ids)
            p_text = perspective_getter(pid)
            text = c_text + " " + p_text
            query = QCKQuery(get_pc_cluster_query_id(pc), text)
            query_list.append(query)

    return query_list
Example 9
def multi_qck_gen(job_name, qk_candidate_name, ranked_list_path, split,
                  k_group_size):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKGeneratorGrouped(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory(), False, k_group_size)
    # Keep only the QK units whose query belongs to this split.
    qk_candidate_train: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids
    ]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
Example 10
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name,
                                   ranked_list_path, split, n_qk_per_job):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    # Keep only the QK units whose query belongs to this split.
    qk_candidate_train: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids
    ]

    def worker_factory(out_dir):
        return QCKWorkerMultiple(qk_candidate_train, generator, n_qk_per_job,
                                 out_dir)

    num_qks = d_n_claims_per_split2[split]
    # Each job handles n_qk_per_job QK units, so round the job count up.
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
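ceil_divide is likewise an unshown helper; the job-count computation above implies ceiling division. A one-line sketch under that assumption:

def ceil_divide(numerator: int, denominator: int) -> int:
    # Presumed ceiling division, e.g. ceil_divide(10, 3) == 4.
    return -(-numerator // denominator)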
Example 11
def build_gold_lms_for_split(split) -> List[ClaimLM]:
    d_ids: Iterable[int] = load_claim_ids_for_split(split)
    claims = get_claims_from_ids(d_ids)
    return build_gold_lms(claims)
Example 12
def get_qck_queries(split) -> List[QCKQuery]:
    d_ids: List[int] = list(load_claim_ids_for_split(split))
    return get_qck_queries_from_cids(d_ids)
Example 13
def load_data_point_50(split):
    d_ids = list(load_claim_ids_for_split(split))
    claims = get_claims_from_ids(d_ids)
    all_data_points = get_candidates(claims, False)
    return all_data_points
Example 14
def load_data_point(split):
    d_ids = list(load_claim_ids_for_split(split))
    claims = get_claims_from_ids(d_ids)
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    return all_data_points