Example no. 1
def main(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id
                                                    for e in entries]))
        claim = claim_d[int(query_id)]
        s = "{} : {}".format(query_id, claim)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)
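A minimal invocation sketch; the path is a placeholder and the surrounding project's imports are assumed to be available:

# Hypothetical usage: 'q_res_path' is the only config key the function reads.
config = {'q_res_path': "path/to/q_res.txt"}   # placeholder path
# main(config)   # would write claim_docs_urls.html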
Example no. 2
def predict_by_bm25_from_candidate(bm25_module, claims,
                                   candidate_dict: List[Tuple[int, List[int]]],
                                   top_k) -> List[Tuple[int, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(c_text, p_text) -> NamedNumber:
        score = bm25_module.score(c_text, p_text)
        return score

    all_prediction_list: List[Tuple[int, List[Dict]]] = []
    for cid, candidates in candidate_dict:
        prediction_list: List[Dict] = []
        claim_text = cid_to_text[cid]
        for pid in candidates:
            p_text = perspective_getter(pid)
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': p_text,
                'rationale': "",
                'score': scorer(claim_text, p_text),
            }
            prediction_list.append(p_entry)
        prediction_list.sort(key=lambda x: x['score'], reverse=True)
        prediction_list = prediction_list[:top_k]
        all_prediction_list.append((cid, prediction_list))
    return all_prediction_list
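A sketch of the expected candidate_dict shape; the ids are made up, and bm25_module/claims come from the surrounding project:

# Each entry pairs a claim id with the perspective ids to be re-ranked by BM25.
candidate_dict = [
    (101, [5, 17, 42]),   # hypothetical claim id and candidate perspective ids
    (102, [8, 9]),
]
# predictions = predict_by_bm25_from_candidate(bm25_module, claims, candidate_dict, top_k=5)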
Example no. 3
def pc_predict_by_bert_next_sent(bm25_module: BM25, claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # XML-RPC client for the remote BERT next-sentence-prediction server
    proxy = xmlrpc.client.ServerProxy(
        'http://ingham.cs.umass.edu:{}'.format(port))

    voca_path = pjoin(data_path, "bert_voca.txt")
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        r = proxy.predict(payload)
        ns_score = -float(r[0])
        #ns_score = 0
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
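A self-contained illustration of the score combination inside scorer(): the negated next-sentence probability is scaled by 10 and added to the BM25 score (the numbers are made up):

bm25_score = 7.3          # hypothetical BM25 score for the claim/perspective pair
next_sent = 0.42          # hypothetical value returned by the XML-RPC server
ns_score = -next_sent     # the function negates the returned value
print(bm25_score + ns_score * 10)   # approximately 3.1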
Example no. 4
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)

        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob

        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
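A self-contained illustration of how the per-claim prior above is built; a local stand-in replaces normalize_counter_to_sum1, and all values are made up:

from collections import Counter

def normalize_to_sum1(c: Counter) -> Counter:
    total = sum(c.values())
    return Counter({k: v / total for k, v in c.items()})

base_p = 0.8                                    # max value of the bias counter
claim_tokens = ["tax", "cut", "tax"]            # hypothetical stemmed claim tokens
bias = Counter({"econom": 0.8, "growth": 0.5})  # hypothetical bias_plus_words entries

score = Counter({t: cnt * base_p for t, cnt in Counter(claim_tokens).items()})
score.update(bias)                              # adds the bias scores on top
print(normalize_to_sum1(score))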
Example no. 5
def pc_predict_from_vector_query(bm25_module: BM25,
                                 q_tf_replace: Dict[int, Counter], claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            # the 1x factor leaves the claim term counts unchanged
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            # keep the 50 strongest expansion terms and merge them with the claim terms
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
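A self-contained illustration of the query-vector combination in scorer(); the weights are made up, and the expanded counter is assumed to already be normalized:

from collections import Counter

claim_qtf = Counter({"vaccin": 2, "mandat": 1})              # claim term counts
ex_qtf = Counter({"immun": 0.4, "school": 0.3, "law": 0.3})  # expanded, normalized weights
qtf = Counter(dict(ex_qtf.most_common(50))) + claim_qtf      # expansion terms plus claim terms
print(qtf)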
Example no. 6
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v

    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v

    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)

    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_v = v - acc_counter_prob_init[k]
            new_counter[k] = new_v

    return new_counter
Example no. 7
def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    word_list_d: Dict = json.load(open(word_list_path, "r"))

    tokenizer = PCTokenizer()

    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = list(
            [t for t in base_query_terms if t not in stopwords])

        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)

        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
Example no. 8
def show_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    top_k = 7
    q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))

    for claim_id, qtf in q_tf_replace.items():
        print(claim_d[claim_id])
        print(qtf.most_common(100))
    print("")
Example no. 9
def show_num_mention():
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claims = get_claims_from_ids(lmap(lambda x: x['cid'], train))
    claim_d = claims_to_dict(claims)
    grouped = group_by(train, lambda x: x['cid'])

    for cid in grouped:
        print("Claim:", claim_d[cid])
        for dp in grouped[cid]:
            p_text = p_dict[dp['pid']]
            print(dp['label'], get_num_mention(dp), p_text)
Example no. 10
def predict_see_candidate(bm25_module: BM25, claims, top_k):
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    output = []
    for claim in claims:
        cid = claim['cId']
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        p_text = lmap(perspective_getter, candidate_pids)

        p_tokens = lmap(bm25_module.tokenizer.tokenize_stem, p_text)

        acc_counter = Counter()
        for tokens in p_tokens[:30]:
            for t in tokens:
                acc_counter[t] += 1 / len(tokens)
        c = normalize_counter(acc_counter)
        c_tokens = bm25_module.tokenizer.tokenize_stem(claim_text)
        qtf = Counter(c_tokens)
        qtf = c + qtf

        ranked_list = []
        for pid in candidate_pids:
            p_tokens = bm25_module.tokenizer.tokenize_stem(
                perspective_getter(pid))
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            ranked_list.append((pid, score))

        ranked_list.sort(key=lambda x: x[1], reverse=True)
        prediction_list = []

        for pid, score in ranked_list[:top_k]:
            p_entry = {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': perspective_getter(pid),
                'rationale': score.name,
                'score': score,
            }
            prediction_list.append(p_entry)
        output.append((cid, prediction_list))

    return output
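A self-contained illustration of the candidate-based query expansion above; the token lists and the local normalizer are stand-ins:

from collections import Counter

def normalize(c: Counter) -> Counter:
    total = sum(c.values())
    return Counter({k: v / total for k, v in c.items()})

candidate_token_lists = [
    ["school", "uniform", "cost"],   # hypothetical tokens of top-ranked perspectives
    ["uniform", "discipline"],
]
acc = Counter()
for tokens in candidate_token_lists:
    for t in tokens:
        acc[t] += 1 / len(tokens)    # each perspective contributes equal total mass

qtf = normalize(acc) + Counter(["school", "uniform"])   # expansion terms + claim terms
print(qtf)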
Example no. 11
def predict_by_bm25(bm25_module, claims,
                    top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score = bm25_module.score(c_text, p_text)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example no. 12
def predict_by_bm25_rm(bm25_module: BM25, rm_info: Dict[str, List[Tuple[str,
                                                                        str]]],
                       claims, top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str,
                  List[Tuple[str,
                             float]]] = dict_value_map(parse_float, rm_info)
    rm_info: Dict[str,
                  List[Tuple[str,
                             float]]] = dict_value_map(normalize_scores,
                                                       rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)

        # rm_info / rm_info_c are keyed by the string claim id (see the annotations above)
        if claim_id in rm_info_c:
            ex_qtf = rm_info_c[claim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
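A self-contained illustration of stem_merge(): relevance-model weights for different surface forms are summed onto a shared stem (toy stemmer and made-up weights):

from collections import Counter

def toy_stem(word: str) -> str:
    return word.rstrip("s")          # crude stand-in for the real stemmer

score_list = [("vaccines", 0.5), ("vaccine", 0.25), ("mandate", 0.125)]
c = Counter()
for k, v in score_list:
    c[toy_stem(k)] += v

print(c)   # Counter({'vaccine': 0.75, 'mandate': 0.125})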
Example no. 13
def predict_by_reweighter(bm25_module: BM25, claims, top_k,
                          param) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    nlp = spacy.load("en_core_web_sm")

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]

        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                w = weight[k]
                new_qtf[k] = w * v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)

        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
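A self-contained illustration of the query-term reweighting in scorer(); the counts and weights are made up:

from collections import Counter

qtf = Counter({"school": 2, "uniform": 1})
weight = {"uniform": 3.0}

new_qtf = Counter()
for k, v in qtf.items():
    if k in weight:
        new_qtf[k] = weight[k] * v   # in the function above, a missing weight raises and the term is skipped

print(new_qtf)   # Counter({'uniform': 3.0})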
Example no. 14
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()

    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word = word.strip()
                word_list.append(word)
        all_d[query_id] = word_list
    json.dump(all_d, open(save_path, "w"))
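A self-contained illustration of the log-odds filter above; the probabilities are made up:

import math

threshold = 1.0
pos, neg = 0.08, 0.01                 # hypothetical per-word probabilities
diff = math.log(pos + 1e-10) - math.log(neg + 1e-10)
print(diff, diff > threshold)         # roughly 2.08, True: this word would be kept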
Example no. 15
def write_csv(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier="
    rows = []

    header = ["claim"
              ] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)]
    rows.append(header)
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id
                                                    for e in entries]))
        claim = claim_d[int(query_id)]
        urls = []
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            urls.append(url)

        assert len(urls) == num_doc_per_query
        row = [claim] + urls
        rows.append(row)

    save_path = os.path.join(output_path, "claim10_train.csv")
    # use a context manager so the file is flushed and closed
    with open(save_path, "w", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(rows)
Example no. 16
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()

    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(
                word, diff, pos, neg))
Example no. 17
def main(input_path):
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    grouped_ranked_list = load_ranked_list_grouped(input_path)

    def is_correct(qid: str, doc_id: str):
        return any([int(doc_id) in cluster for cluster in gold[int(qid)]])

    top_k = 5
    for qid, entries in grouped_ranked_list.items():
        n_gold = sum(map(len, gold[int(qid)]))
        cut_n = min(n_gold, top_k)
        correctness = list([is_correct(qid, e.doc_id) for e in entries[:cut_n]])
        num_correct = sum(lmap(int, correctness))
        p_at_k = num_correct / cut_n

        pid_to_rank: Dict[str, int] = {e.doc_id: e.rank for e in entries}

        def get_rank(pid: int):
            if str(pid) in pid_to_rank:
                return pid_to_rank[str(pid)]
            else:
                return "X"

        if p_at_k < 0.3:
            print(n_gold)
            print(p_at_k)
            print("Claim {} {}".format(qid, claim_d[int(qid)]))##
            for cluster in gold[int(qid)]:
                print("-")
                for pid in cluster:
                    print("[{}]".format(get_rank(pid)), perspective_getter(int(pid)))
            for e in entries[:50]:
                correct_str = "Y" if is_correct(qid, e.doc_id) else "N"
                print("{} {} {}".format(correct_str, e.score, perspective_getter(int(e.doc_id))))
Example no. 18
def work():
    split = "train"
    assert split in ["train", "dev", "test"]

    tokenizer = PCTokenizer()
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(
        claims, do_balance)

    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            tf_rel = {k: v / dl for k, v in tf.items()}

            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n

            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str,
                     Tuple[Counter,
                           Counter]] = dict_value_map(get_frequency_per_class,
                                                      grouped)

    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}

    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())

        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))

        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)

    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
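A self-contained illustration of the per-class averaging done in accumulate(); the relative term-frequency dicts are made up:

from collections import Counter

pos_text = [{"tax": 0.5, "cut": 0.5}, {"tax": 1.0}]   # one tf_rel dict per positive perspective

out_c = Counter()
n = len(pos_text)
for tf in pos_text:
    for k, v in tf.items():
        out_c[k] += v / n                              # average relative tf over the class

print(out_c)   # Counter({'tax': 0.75, 'cut': 0.25})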
Example no. 19
def pc_predict_vector_query_and_reweight(
        bm25_module: BM25, q_tf_replace: Dict[int, Counter], claims, top_k,
        param) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    nlp = spacy.load("en_core_web_sm")

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def get_qtf(claim_id):
        weight = claim_term_weight[claim_id]
        new_qtf = Counter()
        c_text = cid_to_text[int(claim_id)]
        qtf = Counter(stem_tokenize(c_text))
        print(weight)
        for k, v in qtf.items():
            try:
                if k in weight:
                    w = weight[k]
                    new_qtf[k] = w * v
                else:
                    new_qtf[k] = v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)
        return new_qtf

    c_qtf_d = {k: get_qtf(k) for k in cid_to_text.keys()}

    # for cid, c_text in cid_to_text.items():
    #     c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
    #     c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + c_qtf_d[i_claim_id]
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
Example no. 20
def pc_predict_to_inspect(bm25_module: BM25, q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter], claims, top_k):
    gold = get_claim_perspective_id_dict()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]

        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""

            e = score, pid, p_text, debug_str
            ranked_list.append(e)

        gold_pids = gold[cid]

        def is_correct(pid):
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)

        qtf_idf_applied = {
            k: v * bm25_module.term_idf_factor(k)
            for k, v in qtf.items()
        }
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf apllied:", counter_to_str(qtf_idf_applied))

        for score, pid, p_text, debug_str in ranked_list[:top_k]:

            if i_claim_id in q_tf_replace_0_norm:
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
                correct_str = "Y" if is_correct(pid) else "N"
                print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                    correct_str, score, score2, p_text, score.name,
                    score2.name))