Exemple #1
0
def main():
    """Build and save pc_qc data for every split under output_path/pc_qc4."""
    save_dir = os.path.join(output_path, "pc_qc4")
    exist_or_mkdir(save_dir)
    split_filename = split_name2
    for split in splits:
        # Resolve this split's query ids, then fetch queries and candidates.
        qid_iter: Iterable[str] = get_qids_for_split(split_filename, split)
        qck_queries = get_qck_queries_from_cids(lmap(int, qid_iter))
        candidates = get_qck_candidate_for_split(split_filename, split)
        make_pc_qc(qck_queries, candidates,
                   is_correct_factory(),
                   os.path.join(save_dir, split))
Exemple #2
0
def get_lm_scorer(claim_lms: List[ClaimLM], alpha):
    """Return scorer(claim_id, tokens) -> NamedNumber using per-claim log-odds.

    The background LM is the average of all claim LMs; each claim's log-odds
    counter is precomputed once.
    """
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_odds_by_cid: Dict[int, Counter] = {
        lm.cid: get_log_odd(lm, bg_lm, alpha) for lm in claim_lms
    }

    def scorer(claim_id: int, p_tokens: List[str]) -> NamedNumber:
        log_odds = log_odds_by_cid[claim_id]
        # Human-readable per-token breakdown goes into the NamedNumber name.
        per_token = ["{0} ({1:.2f})".format(t, log_odds[t]) for t in p_tokens]
        total = sum(log_odds[t] for t in p_tokens)
        return NamedNumber(total, " ".join(per_token))

    return scorer
Exemple #3
0
def get_ap_from_file_path(input_path):
    """Load pickled prediction batches and return the average precision."""
    preds = flatten_batches(load_pickle_from(input_path))
    labels = preds["label_ids"]
    # One softmax-derived score per logit vector.
    scores = lmap(logit_to_score_softmax, preds["logits"])
    assert len(scores) == len(labels)
    return get_ap(labels, scores)
Exemple #4
0
    def encode(inst: Payload) -> OrderedDict:
        """Encode one Payload into an OrderedDict of TF features.

        Produces three encodings: text1+text2 (feature set 1), text2 paired
        with each passage window (set 2), and text1 paired with each passage
        window (set 3), plus label_ids and data_id.
        """
        tokens_1_1: List[str] = tokenizer.tokenize(inst.text1)
        tokens_1_2: List[str] = tokenizer.tokenize(inst.text2)

        def tokenize_from_tokens_fn(tokens):
            # Re-tokenize already-tokenized passage text with this tokenizer.
            return tokenize_from_tokens(tokenizer, tokens)

        tokens_2_list: List[List[str]] = lmap(tokenize_from_tokens_fn,
                                              inst.passage_list)

        # Feature set 1: text1/text2 combined with special tokens
        # (see combine_with_sep_cls), padded to max_seq_length.
        tokens, segment_ids = combine_with_sep_cls(max_seq_length, tokens_1_1,
                                                   tokens_1_2)
        input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
            tokenizer, max_seq_length, tokens, segment_ids)
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)

        def iterate_over(tokens1, tokens2_list) -> Tuple[List[str], List[int]]:
            # Pair tokens1 with each of the first num_windows passages, pad
            # missing windows with all-[PAD] dummies, then flatten everything
            # into one long token / segment-id sequence.
            dummy_tokens = ["[PAD]"] * max_seq_length
            dummy_segment_ids = [0] * max_seq_length

            def make_for_each_window(tokens2):
                tokens, segment_ids = combine_and_pad(tokens1, tokens2)
                return tokens, segment_ids

            tokens_and_segment_ids_list: List[Tuple[List[str], List[int]]] = \
                lmap(make_for_each_window, tokens2_list[:num_windows])

            pad_len = num_windows - len(tokens_and_segment_ids_list)
            tokens_and_segment_ids_list += [(dummy_tokens, dummy_segment_ids)
                                            ] * pad_len
            tokens_list, segment_ids_list = zip(*tokens_and_segment_ids_list)
            return lflatten(tokens_list), lflatten(segment_ids_list)

        def get_second_feature_parts(tokens1, tokens2_list):
            # Window-concatenated encoding, padded/clipped to d_max_seq_length.
            tokens, segment_ids = iterate_over(tokens1, tokens2_list)
            return get_basic_input_feature_as_list(tokenizer, d_max_seq_length,
                                                   tokens, segment_ids)

        # Feature set 2: text2 paired with each passage window.
        input_ids, input_mask, segment_ids = get_second_feature_parts(
            tokens_1_2, tokens_2_list)
        features["input_ids2"] = create_int_feature(input_ids)
        features["input_mask2"] = create_int_feature(input_mask)
        features["segment_ids2"] = create_int_feature(segment_ids)

        # Feature set 3: text1 paired with each passage window.
        input_ids, input_mask, segment_ids = get_second_feature_parts(
            tokens_1_1, tokens_2_list)
        features["input_ids3"] = create_int_feature(input_ids)
        features["input_mask3"] = create_int_feature(input_mask)
        features["segment_ids3"] = create_int_feature(segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features
Exemple #5
0
def context_viewer(target_topic):
    """Print sentence windows around stance-bearing sentences for a topic.

    Runs per-sentence stance prediction over the top-100 relevant docs,
    caches the predictions to a pickle, then prints the surrounding window
    whenever a window's summarized stance is 1 or 2.
    """
    docs = get_relevant_docs(target_topic)[:100]
    predictor = Predictor(target_topic)

    def get_topic_stance(sents, target_topic):
        # Batch stance prediction for a list of sentences.
        return predictor.predict(target_topic, sents)

    window_size = [-3, 1] # inclusive
    #window_size = [0,0]
    def window(center_loc, list_len):
        # Clamp the inclusive window to list bounds; returns a [start, end)
        # half-open slice range.
        start = max(0, center_loc + window_size[0])
        end = min(list_len-1, center_loc + window_size[1])
        return start, end+1

    line_split = sent_tokenize
    sents_list = lmap(line_split, docs)

    #topic_stances_list = load_from_pickle("stance_{}_rel.pickle".format(target_topic))
    topic_stances_list = flat_apply_stack(lambda x: get_topic_stance(x, target_topic), sents_list, False)
    save_to_pickle(topic_stances_list, "stance_{}_rel.pickle".format(target_topic))

    def summarize_stance(list_stance):
        # Collapse per-sentence stances: 3 if both stance 1 and stance 2
        # occur, otherwise whichever of 1/2 occurs, otherwise 0.
        assert len(list_stance) > 0
        stance_count = Counter()
        for s in list_stance:
            stance_count[s] += 1
        if stance_count[1] > 0 and stance_count[2] > 0:
            return 3

        for stance in [1,2]:
            if stance_count[stance] > 0:
                return stance

        return 0

    def contains(sents, query):
        # NOTE(review): defined but never called in this function.
        return query in " ".join(sents)
    count = Counter()  # NOTE(review): unused accumulator
    for doc_idx, doc in enumerate(docs):
        sents = line_split(doc)
        num_sents = len(sents)
        if num_sents < 1:
            print("Skip doc #{}".format(doc_idx))
            continue

        topic_stances = topic_stances_list[doc_idx]
        for i, sent in enumerate(sents):
            st, ed = window(i, num_sents)
            A_stance = summarize_stance(topic_stances[st:ed])

            if A_stance in [1,2]:
                # A non-neutral window: show each sentence with its stance.
                print("-------------")
                for j in range(st, ed):
                    print(topic_stances[j], sents[j])
                print("-------------")
Exemple #6
0
    def get_rm(data_point):
        """Load the RM term list for the (cid, pid) pair in data_point.

        Returns (list of (term, prob * 1000) tuples, int label).
        """
        label, cid, pid, claim_text, p_text = data_point
        file_name = "{}_{}_{}.txt".format(disk_name, cid, pid)

        def parse_line(line):
            # Each line is "term<TAB>prob"; scale prob by 1000.
            term, prob = line.split("\t")  #
            prob = float(prob) * 1000
            return term, prob

        # Use a context manager so the handle is closed (original leaked it).
        # lmap materializes its result, so closing after return-value
        # construction is safe.
        with open(os.path.join(dir_path, file_name)) as f:
            return lmap(parse_line, f), int(label)
Exemple #7
0
def preload_docs(ranked_list, claims, top_n):
    """Warm the DB cache with the top_n doc ids of every claim's ranked list."""
    def doc_ids_of(claim: Dict):
        # Look up the claim's ranked list by its string cId.
        entries: List[SimpleRankedListEntry] = ranked_list[str(claim['cId'])]
        return [entries[i].doc_id for i in range(top_n)]

    all_doc_ids: Set[str] = set(flatten(lmap(doc_ids_of, claims)))
    print(f"total of {len(all_doc_ids)} docs")
    print("Accessing DB")
    #  Get the doc from DB
    preload_man.preload(TokenizedCluewebDoc, all_doc_ids)
Exemple #8
0
 def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
     """Build a galago query from a claim and its perspective texts."""
     claim_text = claim_text_d[cluster.claim_id]
     pers_texts = [perspective_text_d[pid] for pid in cluster.perspective_ids]
     qid = get_pc_cluster_query_id(cluster)
     # Average the perspective term counts, then add the claim's own counts.
     claim_tf: Counter = get_terms(claim_text)
     pers_tf: Counter = average_counters(lmap(get_terms, pers_texts))
     combined_tf = sum_counters([claim_tf, pers_tf])
     return counter_to_galago_query(qid, combined_tf)
Exemple #9
0
def cap_ed(ss_list: List[SegmentScore], step_size) -> List[SegmentScore]:
    """Clamp each segment's end index to max(start indices) + step_size."""
    cap = max(s.start_idx for s in ss_list) + step_size

    def clamp(ss: SegmentScore):
        # Rebuild only segments that reach or exceed the cap.
        if ss.end_idx >= cap:
            return SegmentScore(ss.start_idx, cap, ss.score)
        return ss

    return lmap(clamp, ss_list)
Exemple #10
0
def load_qk_score(config) -> List[QKOutEntry]:
    """Join QK predictions with their info entries and wrap as QKOutEntry."""
    info_path = config['info_path']
    passage_score_path = config['pred_path']
    score_type = config['score_type']  # read for config validation; unused here
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    joined: List[Dict] = join_prediction_with_info(passage_score_path,
                                                   data_id_to_info,
                                                   fetch_field_list)
    return lmap(QKOutEntry.from_dict2, joined)
Exemple #11
0
def a_relevant():
    """Score passages of the top-10 docs per train claim with claim-LM
    log-odds, and pickle (claim, positively-scored passages) entries along
    with the full passage list.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims  # NOTE(review): no-op, likely leftover from a slice
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    # Per-claim language models plus a background LM averaged over all claims.
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3  # smoothing weight toward the background LM

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        # Per-term log-odds of the smoothed claim LM vs the background LM.
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)  # NOTE(review): computed but never used

        def get_passage_score(p):
            # Mean log-odds over non-stopword stems; 0 for empty passages.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        # Keep only positively scored passages as "a-relevant".
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Exemple #12
0
def get_eval_candidates_1k_as_qck(split) -> Dict[str, List[QCKCandidate]]:
    """Fetch the 1k eval candidates for split and convert to QCK format."""
    raw: List[Tuple[int, List[Dict]]] = get_eval_candidates_1k(split)

    def to_cid_pids(entry) -> Tuple[int, List[int]]:
        # Keep only the perspective ids of each candidate dict.
        cid, perspectives = entry
        return cid, [p['pid'] for p in perspectives]

    cid_pid_format: List[Tuple[int, List[int]]] = lmap(to_cid_pids, raw)
    return cid_pid_format_to_qck(cid_pid_format)
    def generate_instances(
            self, claim: Dict,
            data_id_manager: DataIDManager) -> List[PairedInstance]:
        """Build pairwise passage instances for one claim.

        Three pair families: good-vs-not-good (strict_good=1),
        not-good-vs-random (strict_bad=1), good-vs-random (both strict).
        """
        cid = claim['cId']
        claim = claim['text']  # rebinds `claim` to the claim text

        passages = self.cid_to_passages[cid]
        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))

        n_good = len(good_passages)  # NOTE(review): unused
        n_not_good = len(not_good_passages)  # NOTE(review): unused

        # len(pair_list_g_ng) = n_not_good   ( assuming n_not_good > n_good)
        pair_list_g_ng: List[Tuple[
            List[str], List[str]]] = generate_pairwise_combinations(
                not_good_passages, good_passages, True)
        # len(pair_list_g_rand) = n_good
        pair_list_g_rand: List[Tuple[List[str], List[str]]] = list([
            (inst, self.random_sample(cid)) for inst in good_passages
        ])
        # len(pair_list_g_rand) = n_not_good
        pair_list_ng_rand: List[Tuple[List[str], List[str]]] = list([
            (inst, self.random_sample(cid)) for inst in not_good_passages
        ])

        def make_instance(passage_pair, strict_good, strict_bad):
            # First element of the pair is treated as the better passage.
            passage_good, passage_worse = passage_pair
            info = {'cid': cid}
            return PairedInstance(claim, passage_good, passage_worse,
                                  strict_good, strict_bad,
                                  data_id_manager.assign(info))

        l1 = lmap(lambda pair: make_instance(pair, 1, 0), pair_list_g_ng)
        l2 = lmap(lambda pair: make_instance(pair, 0, 1), pair_list_ng_rand)
        l3 = lmap(lambda pair: make_instance(pair, 1, 1), pair_list_g_rand)
        print("g-ng : ng-rank : g-rand = {} : {} : {}".format(
            len(l1), len(l2), len(l3)))
        return l1 + l2 + l3
Exemple #14
0
def get_candidate(split) -> Dict[str, List[QCKCandidateI]]:
    """Map each query id to passage-sized candidates from the evidence pool."""
    tokenizer = get_tokenizer()
    queries = get_qck_queries(split)
    max_seq_length = 512

    def candidates_for(query: QCKQuery):
        pool_hits = get_evidence_from_pool(query.text, 60)
        # Leave room for the query tokens plus 3 special tokens.
        budget = max_seq_length - 3 - len(tokenizer.tokenize(query.text))

        found = []
        for text, e_id, score in pool_hits:
            evidence_tokens = tokenizer.tokenize(text)
            for passage in enum_passage(evidence_tokens, budget):
                found.append(QCKCandidateWToken(str(e_id), "", passage))
        return found

    return {q.query_id: candidates_for(q) for q in queries}
Exemple #15
0
def calculate_score(info,
                    pred_path,
                    baseline_score: Dict[Tuple[str, str], float],
                    str_data_id=False) -> List[DocValueParts]:
    """Join predictions with info and compute doc value parts vs a baseline."""
    predictions: List[Dict] = join_prediction_with_info(
        pred_path, info, ["logits"], str_data_id)
    entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)
    labels: Dict[str, List[str]] = load_labels()
    return get_doc_value_parts(entries, baseline_score, labels)
Exemple #16
0
def main():
    """Correlate claim-LM passage scores with document subjectivity stats.

    argv[1] = subjectivity predictions path, argv[2] = galago ranked list.
    Prints Pearson r of LM scores vs subjectivity rate, subjective-sentence
    count, and sentence count.
    """
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1  # smoothing weight toward the background LM
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        # Per-term log-odds of this claim's smoothed LM vs the background LM.
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Mean log-odds over non-stopword stems; 0 for an empty passage.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        # Only docs with subjectivity predictions contribute data points.
        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)



    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Exemple #17
0
def print_features():
    """Render the paragraph features of job 0 into an HTML report."""
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))

    # Context manager closes the pickle file (original leaked the handle via
    # pickle.load(open(...)); also dropped the pointless os.path.join(file_path)).
    with open(file_path, "rb") as f:
        features: List[ParagraphClaimPersFeature] = pickle.load(f)
    para_features: List[ParagraphFeature] = lmap(to_paragraph_feature, features)

    out_path = pjoin(output_path,
                     FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(para_features, out_path)
    def work(self, job_id):
        """Tokenize and pickle candidate docs for every query in this job.

        For each query id in the job's group that has candidate docs, loads
        its docs and saves raw text, BERT tokens, and stemmed tokens as
        per-query pickles under the respective output directories.
        """
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            # Count missing docs, split by relevance (presence in qrel).
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            text_d = {}
            bert_tokens_d = {}
            stemmed_tokens_d = {}

            for d in docs:
                if d.doc_id in target_docs:
                    title = d.title
                    title = crop_to_space(title, self.max_title_length)

                    body_sents = sent_tokenize(d.body)
                    new_body_sents = self.resplit_body_sents(body_sents)
                    text_d[d.doc_id] = title, new_body_sents

                    # Tokenize title and body once per tokenizer variant
                    # (BERT word pieces and stemmed tokens).
                    for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)
                    ]:
                        title_tokens = tokenize_fn(title)
                        body_tokens_list = lmap(tokenize_fn, new_body_sents)
                        save_dict[d.doc_id] = (title_tokens, body_tokens_list)

            todo = [
                (text_d, self.text_dir_name),
                (bert_tokens_d, self.bert_tokens_dir_name),
                (stemmed_tokens_d, self.stemmed_tokens_dir_name),
            ]

            # NOTE(review): the open(...) handles below are never closed.
            for tokens_d, dir_name in todo:
                save_path = os.path.join(self.out_dir, dir_name, str(qid))
                pickle.dump(tokens_d, open(save_path, "wb"))
def summarize_score(info_dir, prediction_file) -> Dict[Tuple[str, str], float]:
    """Collapse per-(query, candidate) prediction groups into single scores."""
    info = load_combine_info_jsons(info_dir, qckl_convert_map, False)
    print("Info has {} entries".format(len(info)))
    data: List[Dict] = join_prediction_with_info(prediction_file, info,
                                                 ["data_id", "logits"])

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, get_qc_pair_id)
    print("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        # Each pair is expected to have exactly one prediction entry.
        logits_list = [item['logits'] for item in items]
        assert len(logits_list) == 1
        out_d[pair_id] = logits_list[0]

    num_items_per_group = average(lmap(len, grouped.values()))
    print("Num items per group : ", num_items_per_group)
    return out_d
Exemple #20
0
def remove_duplicate(doc_id_list: List[str]) -> List[str]:
    """Drop doc ids whose tokenized content hashes to an earlier duplicate."""
    docs_d: Dict[str, List[str]] = load_multiple(TokenizedCluewebDoc,
                                                 doc_id_list, True)
    # Missing docs hash as None, keeping positions aligned with doc_id_list.
    hashes = lmap(doc_hash, [docs_d.get(doc_id) for doc_id in doc_id_list])
    duplicate_indice = get_duplicate_list(hashes)
    return [doc_id for idx, doc_id in enumerate(doc_id_list)
            if idx not in duplicate_indice]
Exemple #21
0
def combine_pc_train_info():
    """Merge the per-job pickled info dicts 0..605 into a single pickle."""
    st = 0
    ed = 606

    def load_file(i):
        pickle_path = os.path.join(sydney_working_dir, "pc_rel_tfrecord_info",
                                   "{}".format(i))
        # Context manager closes each file (original leaked 606 handles via
        # pickle.load(open(...))).
        with open(pickle_path, "rb") as f:
            return pickle.load(f)

    d_list = lmap(load_file, range(st, ed))
    combined_dict = merge_dict_list(d_list)
    save_to_pickle(combined_dict, "pc_rel_info_all")
Exemple #22
0
def debug_clean_text():
    """Print keyword queries whose galago conversion changed their text."""
    queries = load_queries(all_years)
    converted = lmap(
        lambda q: trec_query_to_galago_query(q, KEYWORD_QUERY), queries)

    for original, converted_q in zip(queries, converted):
        if converted_q['text'] != original.keyword_query:
            print("before:", original.keyword_query)
            print("after:", converted_q['text'])
    def get_ukp_dev_sents(self, topic):
        """Return (label, tokens) pairs for the topic's dev split."""
        loader = DataLoader(topic)
        dev_data = loader.get_dev_data()
        tokenizer = get_tokenizer()

        def to_label_tokens(example):
            # Each dev example is a (sentence, label) pair.
            sent, label = example
            return label, tokenizer.tokenize(sent)

        return lmap(to_label_tokens, dev_data)
def add_cls_to_parsed():
    """Prepend "[CLS]" to every definition token list; lowercase head words."""
    parsed = load_from_pickle("webster_parsed")

    with_cls = {}
    for word, def_list in parsed.items():
        with_cls[word.lower()] = [["[CLS]"] + def_tokens
                                  for def_tokens in def_list]

    save_to_pickle(with_cls, "webster_parsed_w_cls")
Exemple #25
0
def show_num_mention():
    """Print label, mention count, and perspective text per training claim."""
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claim_d = claims_to_dict(
        get_claims_from_ids(lmap(lambda x: x['cid'], train)))
    grouped = group_by(train, lambda x: x['cid'])

    for cid, data_points in grouped.items():
        print("Claim:", claim_d[cid])
        for dp in data_points:
            print(dp['label'], get_num_mention(dp), p_dict[dp['pid']])
Exemple #26
0
def build_single_claim_lm(all_ranked_list, claim):
    """Build an LM classifier from the ranked docs of a single claim."""
    candidate_k = 50
    claim_text, perspectives = get_perspective(claim, candidate_k)
    unigrams = get_relevant_unigrams(perspectives)
    ranked = all_ranked_list.get(str(claim['cId']))
    doc_ids = [entry[0] for entry in ranked]
    # Warm caches before loading and formatting the docs.
    preload_docs(doc_ids)
    preload_tf(doc_ids)
    docs = lmap(load_and_format_doc, doc_ids)
    return build_lm(docs, unigrams)
Exemple #27
0
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    """Build an extended candidate perspective-id list per claim.

    Candidates = lucene top-50 for the claim text, plus gold pids the first
    search missed, plus a second lucene search seeded with high-tf-idf terms
    where missed gold perspectives diverge from the claim's vocabulary.
    """
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        # Weight each term's count by its idf factor.
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)

        # Gold pids missed by the first search become "hard" candidates;
        # collect the vocabulary where they diverge from the claim.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)

                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1

        candidate_list.extend(hard_candidate)
        # Second search: query from the 30 highest-tf-idf mismatch terms.
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)

        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
Exemple #28
0
    def scorer(query_p: Passage,
               candidate: List[Passage]) -> List[NamedNumber]:
        """Score every candidate passage against query_p with BM25."""
        q_tf = basic_tf.get_tf(query_p)

        def score_one(cand: Passage) -> NamedNumber:
            # Identical text gets a sentinel low score so it is never chosen.
            if cand.text == query_p.text:
                return NamedNumber(-99, "equal")
            return bm25_module.score_inner(q_tf, basic_tf.get_tf(cand))

        return lmap(score_one, candidate)
Exemple #29
0
def doc_score_predictions():
    """Yield (query_id, list of positive-class softmax scores) per group."""
    passage_score_path = "output/cppnc/qknc_val"
    info = load_combine_info_jsons("output/cppnc/qknc_val.info", qk_convert_map)
    data = join_prediction_with_info(passage_score_path, info)
    grouped: Dict[str, List[Dict]] = group_by(data, lambda x: x['query'].query_id)

    for cid, passages in grouped.items():
        # Probability of the positive class from each passage's logits.
        scores: List[float] = [scipy.special.softmax(d['logits'])[1]
                               for d in passages]
        yield cid, scores
Exemple #30
0
def main():
    """Per claim, print a table comparing each perspective's gold label,
    baseline score, and a histogram of its scores over four probability bins.
    """
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()

    bin_keys = ["< 0.05", "< 0.50", "< 0.95", "< 1"]

    def bin_fn(item: float):
        # Map a probability to one of the four bin labels.
        if item > 0.95:
            return "< 1"
        elif item > 0.5:
            return "< 0.95"
        elif item > 0.05:
            return "< 0.50"
        else:
            return "< 0.05"

    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]

        # Baseline score per pid; each baseline group must be a singleton.
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]

        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            # Mean score over the entry group, used only for ordering.
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))

        # Highest-scoring perspectives first.
        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)

        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        head_row = [""] + bin_keys
        rows = [head_row]
        for cpid, things in pid_entries:
            histogram = BinHistogram(bin_fn)
            _, pid = cpid
            # Positive if the pid appears in any gold perspective cluster.
            label = any([pid in pids for pids in gold_pids])
            label_str = bool_to_yn(label)
            base_score = baseline_score_d[pid]
            base_score_str = "{0:.2f}".format(base_score)
            scores: List[float] = lmap(lambda x: (x['score']), things)
            foreach(histogram.add, scores)
            # NOTE(review): data rows have 6 cells vs 5 in head_row — confirm
            # print_table tolerates ragged rows.
            row = [label_str, base_score_str] + [
                str(histogram.counter[bin_key]) for bin_key in bin_keys
            ]
            rows.append(row)
        print_table(rows)