Esempio n. 1
0
def main():
    """Build one Galago query per perspective cluster and save them to disk.

    For every cluster, the claim's term counts are combined (through
    ``sum_counters``) with the result of ``average_counters`` over the term
    counts of each perspective in the cluster. The resulting queries are
    written to ``<output_path>/perspective_query/pc_query_for_evidence.json``
    and the number of queries is printed.
    """
    clusters: Iterable[PerspectiveCluster] = enum_perspective_clusters()
    galago_tokenizer = TokenizerForGalago()

    def term_frequencies(text: str) -> Counter:
        # Term counts over the tokenizer's output for this text.
        return Counter(galago_tokenizer.tokenize(text))

    # Query = [claim :: avg(perspective)]
    claims_by_id: Dict[int, str] = get_all_claim_d()
    perspectives_by_id: Dict[int, str] = get_perspective_dict()

    def build_query(cluster: PerspectiveCluster) -> DocQuery:
        # Resolve the texts first so a missing id fails before any
        # tokenization work is done.
        claim_text = claims_by_id[cluster.claim_id]
        perspective_texts = [
            perspectives_by_id[pid] for pid in cluster.perspective_ids
        ]
        qid = get_pc_cluster_query_id(cluster)
        claim_counts: Counter = term_frequencies(claim_text)
        perspective_counts: Counter = average_counters(
            lmap(term_frequencies, perspective_texts))
        combined = sum_counters([claim_counts, perspective_counts])
        return counter_to_galago_query(qid, combined)

    queries: List[DocQuery] = lmap(build_query, clusters)
    print(len(queries))
    destination = os.path.join(
        output_path, "perspective_query", "pc_query_for_evidence.json")
    save_queries_to_file(queries, destination)
Esempio n. 2
0
def main():
    """Print an error analysis of a ranked evidence list, one query at a time.

    The ranked list file is taken from ``sys.argv[1]``. Query ids are expected
    to have the form ``"<claim_id>_<perspective_id>"``.

    For each query:
      * queries with no gold evidence ids are reported and skipped;
      * queries whose gold evidence never appears in the ranked list are
        reported and skipped;
      * otherwise precision is computed over the top ``R`` entries, where
        ``R`` is the number of gold documents found in the ranked list.
        Queries with precision above 0.99 print "Good"; for the rest the
        claim, the perspective, the gold entries, and the non-gold entries
        with ``rank < 3`` are printed.
    """
    # Both dictionaries are re-keyed by str because the ids parsed from the
    # query string and the ranked-list doc ids are used as str keys below.
    claim_text_d: Dict[str, str] = dict_key_map(str, get_all_claim_d())
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        claim_id, perspective_id = query.split("_")
        # A set gives constant-time membership; it is tested once per ranked
        # entry below, and again for the top-R slice.
        gold_ids = {str(gold_id) for gold_id in evi_gold_dict[query]}
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            if entry.doc_id in gold_ids:
                pos_entries.append(entry)
            elif entry.rank < 3:
                # Only the highest-ranked non-gold entries are worth showing.
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        # R = number of gold documents that were actually retrieved.
        num_rel = len(pos_entries)
        correctness = [
            int(entry.doc_id in gold_ids) for entry in ranked_list[:num_rel]
        ]

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
Esempio n. 3
0
def main(config):
    """Run the QKGenFromDB jobs for the split named in ``config``.

    Reads the ``'q_res_path'``, ``'split'`` and ``'job_name'`` keys from
    ``config``. The number of jobs is looked up in ``d_n_claims_per_split2``
    by split, and each job's worker is a ``QKGenFromDB`` built for the
    output directory the runner hands to the factory.
    """
    q_res_path = config['q_res_path']
    split = config['split']
    claim_texts: Dict[int, str] = get_all_claim_d()

    def make_worker(out_dir):
        # Factory invoked by the job runner with an output directory.
        return QKGenFromDB(q_res_path, claim_texts, out_dir)

    job_count = d_n_claims_per_split2[split]
    JobRunner(job_man_dir, job_count, config['job_name'],
              make_worker).auto_runner()
Esempio n. 4
0
def get_qck_queries_all() -> List[QCKQuery]:
    """Return one QCKQuery per perspective cluster.

    The query text is the claim text, a single space, and the text of the
    cluster's smallest perspective id (via ``perspective_getter``). The
    query id comes from ``get_pc_cluster_query_id``.
    """
    clusters = enum_perspective_clusters()
    claims_by_id: Dict[int, str] = get_all_claim_d()

    def to_query(cluster):
        claim_text = claims_by_id[cluster.claim_id]
        # The smallest perspective id stands in for the whole cluster.
        representative_pid = min(cluster.perspective_ids)
        perspective_text = perspective_getter(representative_pid)
        combined_text = claim_text + " " + perspective_text
        return QCKQuery(get_pc_cluster_query_id(cluster), combined_text)

    return [to_query(cluster) for cluster in clusters]
def main():
    """Interactively print the claim, perspective and gold evidence for an id.

    Reads lines of the form ``<claim_id>_<perspective_id>`` (both integers)
    from standard input until end of input. For each line the claim text,
    the perspective text and every gold evidence text for that
    (claim id, perspective id) pair are printed.

    Malformed lines are reported and skipped so that a typo does not end the
    session. Lookup failures for ids that are not present in the loaded
    dictionaries are not handled here and propagate to the caller.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    evidence_d = load_evidence_dict()
    evidence_gold = evidence_gold_dict()
    while True:
        try:
            s = input()
        except EOFError:
            # End of stdin (Ctrl-D or exhausted pipe): finish normally
            # instead of surfacing a traceback.
            break

        try:
            cid_text, pid_text = s.split("_")
            cid = int(cid_text)
            pid = int(pid_text)
        except ValueError:
            # Covers a wrong number of "_"-separated fields as well as
            # fields that are not integers (including an empty line).
            print("Expected <claim_id>_<perspective_id>, got: {!r}".format(s))
            continue

        print("Claim: ", claim_text_d[cid])
        print("Perspective: ", perspective_getter(pid))
        # Gold evidence is keyed by the (claim id, perspective id) tuple.
        key = cid, pid
        for eid in evidence_gold[key]:
            print("Evidence: ", evidence_d[eid])
Esempio n. 6
0
def get_qck_queries(split) -> List[QCKQuery]:
    """Return one QCKQuery per perspective cluster belonging to ``split``.

    Clusters are enumerated with ``enum_perspective_clusters_for_split`` and
    kept only when their claim id is among ``load_claim_ids_for_split(split)``.
    The query text is the claim text, a single space, and the text of the
    cluster's smallest perspective id.
    """
    split_claim_ids = set(load_claim_ids_for_split(split))
    clusters = enum_perspective_clusters_for_split(split)
    claims_by_id: Dict[int, str] = get_all_claim_d()

    queries = []
    for cluster in clusters:
        # Skip clusters whose claim is outside the requested split.
        if cluster.claim_id not in split_claim_ids:
            continue
        claim_text = claims_by_id[cluster.claim_id]
        perspective_text = perspective_getter(min(cluster.perspective_ids))
        combined_text = claim_text + " " + perspective_text
        queries.append(
            QCKQuery(get_pc_cluster_query_id(cluster), combined_text))

    return queries