Example #1
def main():
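    # Build one Galago query per perspective cluster: the query's term frequencies are
    # the claim's term counts summed with the averaged term counts of the cluster's perspectives.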
    pc_clusters: Iterable[PerspectiveCluster] = enum_perspective_clusters()
    tokenizer = TokenizerForGalago()

    def get_terms(text: str) -> Counter:
        terms = tokenizer.tokenize(text)
        return Counter(terms)

    # Query = [claim :: avg(perspective)]
    claim_text_d: Dict[int, str] = get_all_claim_d()
    perspective_text_d: Dict[int, str] = get_perspective_dict()

    def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
        claim_text = claim_text_d[cluster.claim_id]
        perspective_text_list = [perspective_text_d[pid]
                                 for pid in cluster.perspective_ids]
        query_id = get_pc_cluster_query_id(cluster)
        claim_tf: Counter = get_terms(claim_text)
        pers_tf: Counter = average_counters(
            lmap(get_terms, perspective_text_list))
        tf = sum_counters([claim_tf, pers_tf])
        query: DocQuery = counter_to_galago_query(query_id, tf)
        return query

    query_list: List[DocQuery] = lmap(cluster_to_query, pc_clusters)
    print(len(query_list))
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_query_for_evidence.json")
    save_queries_to_file(query_list, out_path)
Example #2
def get_valid_terms():
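    # Collect the stemmed vocabulary over all perspective texts.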
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
Example #3
    def __init__(self):
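        # Precompute a sublinear TF-IDF vector for every claim and perspective text;
        # self.d maps each original sentence to its vector.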
        claim_and_perspective = load_claim_perspective_pair()
        perspective = get_perspective_dict()
        all_sents = []
        for e in claim_and_perspective:
            claim_text = e['text']
            all_sents.append(claim_text)

        for pid, text in perspective.items():
            all_sents.append(text)

        print("tokenizing {} docs".format(len(all_sents)))
        token_docs = []
        for s in all_sents:
            # Tokenize into words so that TF-IDF is computed over terms, not whole sentences.
            tokens = nltk.word_tokenize(s)
            token_docs.append(tokens)

        print("get_idf")
        idf = inverse_document_frequencies(token_docs)
        tfidf_documents = []
        print("sublinear tf")
        for document in token_docs:
            doc_tfidf = []
            for term in idf.keys():
                tf = sublinear_term_frequency(term, document)
                doc_tfidf.append(tf * idf[term])
            tfidf_documents.append(doc_tfidf)

        self.d = {}
        for sent, tfidf_val in zip(all_sents, tfidf_documents):
            self.d[sent] = tfidf_val
Example #4
def main():
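    # Write every perspective text to a TREC-format corpus file.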
    d: Dict[str, str] = get_perspective_dict()

    save_path = os.path.join(output_path, "perspective", "corpus.xml")
    with open(save_path, "w") as f:
        for pid, text in d.items():
            lines = trec_writer(pid, text)
            f.writelines(lines)
Example #5
def test_es():
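    # For every (claim, perspective) pair, issue the concatenated text as a query
    # against the perspective pool in Elasticsearch.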
    claim_and_perspective = load_claim_perspective_pair()
    perspective = get_perspective_dict()

    for e in claim_and_perspective:
        claim_text = e['text']
        for perspective_cluster in e['perspectives']:
            pids = perspective_cluster['pids']
            for pid in pids:
                query = claim_text + " " + perspective[pid]
                es_helper.get_perspective_from_pool(query, 50)
Example #6
def show_num_mention():
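    # For each claim in the training split, print its perspectives together with
    # their gold labels and mention counts.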
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claims = get_claims_from_ids(lmap(lambda x: x['cid'], train))
    claim_d = claims_to_dict(claims)
    grouped = group_by(train, lambda x: x['cid'])

    for cid in grouped:
        print("Claim:", claim_d[cid])
        for dp in grouped[cid]:
            p_text = p_dict[dp['pid']]
            print(dp['label'], get_num_mention(dp), p_text)
Example #7
def get_candidates(claims, balance) -> List[PerspectiveCandidate]:
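    # For each claim, retrieve the top 50 perspectives from the pool and label each
    # candidate 1 if it is a gold perspective for that claim, 0 otherwise.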
    related_p_map = get_claim_perspective_id_dict()
    related_p_map = {
        key: flatten(value)
        for key, value in related_p_map.items()
    }
    p_map = get_perspective_dict()

    all_data_points = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)

        rp = related_p_map[cid]

        pid_set = [_pid for _text, _pid, _score in lucene_results]
        data_point_list = []
        for pid in pid_set:
            p_text = p_map[pid]
            label = 1 if pid in rp else 0
            data_point = PerspectiveCandidate(label=str(label),
                                              cid=cid,
                                              pid=pid,
                                              claim_text=claim_text,
                                              p_text=p_text)
            #data_point = [str(label), str(cid), str(pid), claim_text, p_text]
            data_point_list.append(data_point)

        # If training, we balance positive and negative examples.
        if balance:
            pos_insts = [e for e in data_point_list if e.label == "1"]
            neg_insts = [e for e in data_point_list if e.label == "0"]
            neg_insts = neg_insts[:len(pos_insts)]
            data_point_list = pos_insts + neg_insts
        all_data_points.extend(data_point_list)

    return all_data_points
Example #8
def perspective_getter(pid):
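    # 'perspective' is a lazily initialized module-level cache of the perspective dictionary.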
    global perspective
    if perspective is None:
        perspective = get_perspective_dict()
    return perspective[pid]