def evaluate2(predictions):
    # Macro-averaged cluster-coverage metrics: for each claim, a gold cluster
    # counts as covered if any predicted pid falls in it.
    gold = get_claim_perspective_id_dict()
    tot_p = tot_r = 0
    for c_Id, p_Id_list in predictions:
        gold_pids = gold[c_Id]
        covered = [False for _c in gold_pids]
        for pid in p_Id_list:
            for idx, cluster in enumerate(gold_pids):
                if pid in cluster:
                    covered[idx] = True
        tot_gold = len(covered)
        tot_pred = len(p_Id_list)
        hit = [h for h in covered if h]
        if tot_pred == 0:
            tot_p += 1
        else:
            tot_p += len(hit) / tot_pred
        if tot_gold == 0:
            tot_r += 1
        else:
            tot_r += len(hit) / tot_gold

    mean_p = tot_p / len(predictions)
    mean_r = tot_r / len(predictions)
    mean_f1 = 2 * mean_p * mean_r / (mean_p + mean_r)
    return {'precision': mean_p, 'recall': mean_r, 'f1': mean_f1}
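# Usage sketch for evaluate2 (not in the original source): `predictions` is a
# list of (claim_id, [perspective_id, ...]) pairs, and the metrics are
# macro-averaged over claims. The loaders below are reused from elsewhere in
# this module; taking the top-5 pids per claim is an arbitrary choice here.
def demo_evaluate2():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    payloads = predict_by_elastic_search(claims, 5)
    predictions = [(cid, [p_entry['pid'] for p_entry in data_list])
                   for cid, data_list in payloads]
    print(evaluate2(predictions))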
def save_to_csv():
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                row = [c_text, p_text, y, cid, p_entry['pid']]
                rows.append(row)
        # Use a context manager so the file is flushed and closed;
        # newline="" is the csv-module convention for output files.
        with open(out_path, "w", encoding="utf-8", newline="") as f:
            f_out = csv.writer(f, dialect='excel-tab')
            f_out.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))
    d_ids = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))
def inspect(predictions):
    gold = get_claim_perspective_id_dict()
    suc_counter = SuccessCounter()
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]

        def is_valid(pid):
            for cluster in gold_pids:
                if pid in cluster:
                    return True
            return False

        top_pred = prediction_list[0]
        if is_valid(top_pred['pid']):
            suc_counter.suc()
        else:
            suc_counter.fail()
        prediction = prediction_list[0]
        claim_text = prediction['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        print("{0:.2f} {1} {2}".format(prediction['score'],
                                       prediction['rationale'],
                                       prediction['perspective_text']))
        print()
    print("P@1", suc_counter.get_suc_prob())
def debug_failture(predictions):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set, prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)
        if not any(correctness_list):  # all wrong
            continue
        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct = pid in gold_pids_set
            correct_str = "Y" if correct else "N"
            score = prediction['score']
            print(correct_str, score, score.name, prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    map_score = average(ap_list)
    return {'map': map_score}
def __init__(self,
             cid_to_passages: Dict[int, List[Tuple[List[str], float]]],
             candidate_perspective: Dict[int, List[int]],
             filer_good):
    self.gold = get_claim_perspective_id_dict()
    self.candidate_perspective = candidate_perspective
    self.cid_to_passages = cid_to_passages
    self.filter_good = filer_good
def __init__(
        self,
        cid_to_passages: Dict[int, List[Tuple[List[str], float]]],
):
    self.cid_to_passages = cid_to_passages
    self.all_cids = list(cid_to_passages.keys())
    self.gold = get_claim_perspective_id_dict()
def get_trec_relevance_judgement() -> Iterable[TrecRelevanceJudgementEntry]:
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    for cid, clusters in gold.items():
        query_id = str(cid)
        pids = set(flatten(clusters))
        for pid in pids:
            e = TrecRelevanceJudgementEntry(query_id, str(pid), 1)
            yield e
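# Hedged sketch (not in the original source): serialize the judgments as a
# standard TREC qrels file. The attribute names query_id / doc_id / relevance
# are assumptions; only the positional constructor order
# (query_id, doc_id, relevance) is actually shown above.
def write_gold_qrels(out_path: str):
    with open(out_path, "w") as f:
        for e in get_trec_relevance_judgement():
            # qrels format: <query_id> <iteration> <doc_id> <relevance>
            f.write("{} 0 {} {}\n".format(e.query_id, e.doc_id, e.relevance))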
def get_correctness_list(predictions, debug) -> List[List[int]]:
    gold = get_claim_perspective_id_dict()
    all_correctness_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        correctness_list: List[int] = get_correctness(prediction_list, gold_pids)
        all_correctness_list.append(correctness_list)
    return all_correctness_list
def is_correct_factory():
    gold = get_claim_perspective_id_dict()

    def is_correct(query: QCKQuery, candidate: QCKCandidate) -> int:
        pid_cluster = gold[int(query.query_id)]
        return int(any([int(candidate.id) in cluster for cluster in pid_cluster]))

    return is_correct
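# Usage sketch for is_correct_factory (assumed caller, not from the source):
# build the closure once so the gold dictionary is loaded a single time, then
# apply it per (query, candidate) pair, e.g. when labeling QCK candidates.
def label_candidates(query: QCKQuery, candidates: List[QCKCandidate]) -> List[int]:
    is_correct = is_correct_factory()
    return [is_correct(query, c) for c in candidates]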
def __init__(
        self,
        cid_to_passages: Dict[int, List[Tuple[List[str], float]]],
        candidate_perspective: Dict[int, List[int]],
):
    self.gold = get_claim_perspective_id_dict()
    self.candidate_perspective = candidate_perspective
    self.cid_to_passages = cid_to_passages
    self.tokenizer = get_tokenizer()
def load_passage_score_d(cppnc_save_name,
                         baseline_save_name) -> Dict[Tuple[str, str, int], float]:
    cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score(cppnc_save_name)
    gold = get_claim_perspective_id_dict()
    baseline_cid_grouped = load_baseline(baseline_save_name)
    score_d: Dict[Tuple[str, str, int], float] = {}

    def get_score_from_entry(entry):
        logit = entry['logits']
        return scipy.special.softmax(logit)[1]

    for cid, pid_entries_d in cid_grouped.items():
        pid_entries_d: Dict[str, List[Dict]] = pid_entries_d
        baseline_pid_entries = baseline_cid_grouped[int(cid)]
        baseline_score_d = fetch_score(baseline_pid_entries)
        gold_pids = gold[int(cid)]
        value_arr_pid_row = []
        for pid, entries_for_pid in pid_entries_d.items():
            # pid keys are str while gold clusters hold ints, so cast before
            # the membership test.
            label = any([int(pid) in pids for pids in gold_pids])
            base_score = baseline_score_d[int(pid)]

            def get_value_from_entry(entry) -> float:
                score = get_score_from_entry(entry)
                value = doc_value(score, base_score, int(label))
                return value

            cur_value_row: List[float] = lmap(get_value_from_entry, entries_for_pid)
            value_arr_pid_row.append(cur_value_row)

        # Transpose: per-pid rows -> per-document (passage) rows.
        value_arr_doc_row: List[List[float]] = list(map(list, zip(*value_arr_pid_row)))
        avg_value = lmap(average, value_arr_doc_row)

        # Document identities are shared across pids, so read them off the
        # first pid's entries only.
        doc_info = []
        for pid, entries_for_pid in pid_entries_d.items():
            for entry in entries_for_pid:
                e = entry['kdp'].doc_id, entry['kdp'].passage_idx
                doc_info.append(e)
            break

        assert len(avg_value) == len(doc_info)
        for value, (doc_id, passage_idx) in zip(avg_value, doc_info):
            key = cid, doc_id, passage_idx
            score_d[key] = value
    return score_d
def get_average_precision_list(predictions, debug):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        if debug:
            print("Claim {}: ".format(c_Id), claim_text)
        ap = get_ap(prediction_list, gold_pids, debug)
        ap_list.append(ap)
    return ap_list
def generate_pair_insts(split) -> Iterable[Instance]:
    # Sampling ratios pos:neg1:neg2 = 1:3:6.
    pos_rate = 1
    neg1_rate = 3
    neg2_rate = 6
    ids: List[int] = list(load_claim_ids_for_split(split))
    id_dict: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def same_cluster_example() -> Iterator[Tuple[int, int]]:
        for claim_id in ids:
            clusters = id_dict[claim_id]
            for cluster in clusters:
                for p1, p2 in combinations(cluster, 2):
                    yield p1, p2

    def same_claim_different_cluster() -> Iterator[Tuple[int, int]]:
        for claim_id in ids:
            clusters = id_dict[claim_id]
            for cluster1, cluster2 in combinations(clusters, 2):
                for p1 in cluster1:
                    for p2 in cluster2:
                        yield p1, p2

    def different_claim() -> Iterator[Tuple[int, int]]:
        for cid1, cid2 in combinations(ids, 2):
            clusters1 = id_dict[cid1]
            clusters2 = id_dict[cid2]
            for p1 in flatten(clusters1):
                for p2 in flatten(clusters2):
                    yield p1, p2

    pos: List[Tuple[int, int]] = list(same_cluster_example())
    neg1: List[Tuple[int, int]] = list(same_claim_different_cluster())
    neg2: List[Tuple[int, int]] = list(different_claim())
    pos_len = len(pos)
    neg1_len = pos_len * neg1_rate
    neg2_len = pos_len * neg2_rate
    print("pos/neg1/neg2 = {}/{}/{}".format(pos_len, neg1_len, neg2_len))
    random.shuffle(neg1)
    random.shuffle(neg2)
    neg1 = neg1[:neg1_len]
    neg2 = neg2[:neg2_len]
    pos_data = list([Instance(pid1, pid2, 1) for pid1, pid2 in pos])
    neg_data = list([Instance(pid1, pid2, 0) for pid1, pid2 in neg1 + neg2])
    all_data = pos_data + neg_data
    random.shuffle(all_data)
    return all_data
def main():
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()
    bin_keys = ["< 0.05", "< 0.50", "< 0.95", "< 1"]

    def bin_fn(item: float):
        if item > 0.95:
            return "< 1"
        elif item > 0.5:
            return "< 0.95"
        elif item > 0.05:
            return "< 0.50"
        else:
            return "< 0.05"

    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]

        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))

        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)
        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        # Two leading cells for the label and baseline-score columns.
        head_row = ["", ""] + bin_keys
        rows = [head_row]
        for cpid, things in pid_entries:
            histogram = BinHistogram(bin_fn)
            _, pid = cpid
            label = any([pid in pids for pids in gold_pids])
            label_str = bool_to_yn(label)
            base_score = baseline_score_d[pid]
            base_score_str = "{0:.2f}".format(base_score)
            scores: List[float] = lmap(lambda x: x['score'], things)
            foreach(histogram.add, scores)
            row = [label_str, base_score_str] + [str(histogram.counter[bin_key])
                                                 for bin_key in bin_keys]
            rows.append(row)
        print_table(rows)
def main():
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()
    doc_scores = dict(doc_score_predictions())
    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]

        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))

        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)
        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        num_docs = len(pid_entries[0][1])
        doc_value_arr = list([list() for _ in range(num_docs)])
        labels = []
        for cpid, things in pid_entries:
            _, pid = cpid
            label = any([pid in pids for pids in gold_pids])
            labels.append(label)
            base_score = baseline_score_d[pid]
            for doc_idx, per_doc in enumerate(things):
                score = per_doc['score']
                value = doc_value(score, base_score, int(label))
                doc_value_arr[doc_idx].append(value)

        head = ["avg", "pred"] + lmap(bool_to_yn, labels)
        rows = [head]
        doc_score = doc_scores[cid]
        assert len(doc_value_arr) == len(doc_score)
        for pred_score, doc_values in zip(doc_score, doc_value_arr):
            avg = average(doc_values)
            row_float = [avg, pred_score] + doc_values
            row = lmap(lambda x: "{0}".format(x), row_float)
            rows.append(row)
        print_table(rows)
def eval_classification(classifier, split):
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()
    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            z = classifier(c_text, p_text)
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((z, y))
    return get_scores(r)
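# Hedged sketch (not in the original source): eval_classification expects a
# callable (claim_text, perspective_text) -> prediction, paired with the 0/1
# gold label and passed to get_scores; whether get_scores wants hard labels
# or scores is an assumption here. A trivial token-overlap baseline:
def token_overlap_classifier(c_text: str, p_text: str) -> int:
    c_tokens = set(c_text.lower().split())
    p_tokens = set(p_text.lower().split())
    # Predict 1 if at least half of the perspective tokens occur in the claim.
    return int(len(c_tokens & p_tokens) >= 0.5 * len(p_tokens))


# Example: print(eval_classification(token_overlap_classifier, "dev"))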
def tune_kernel_a():
    split = "train"
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()
    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((c_text, p_text, y))
    tune_kernel_save(r)
def predict_by_oracle_on_candidate(claims, top_k) -> List[Tuple[str, List[Dict]]]:
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        gold_pids = gold[int(claim_id)]
        score = 0
        for p_ids in gold_pids:
            if int(p_id) in p_ids:
                score = 1
        return NamedNumber(score, "")

    r = predict_interface(claims, top_k, scorer)
    return r
def main():
    relevance_scores: Dict[CPIDPair, List[Tuple[Logits, Logits]]] = load_from_pickle("pc_relevance_score")
    gold = get_claim_perspective_id_dict()
    true_feature = []
    false_feature = []
    ticker = TimeEstimator(len(relevance_scores))
    for key in relevance_scores:
        ticker.tick()
        cid, pid = key
        gold_pids = flatten(gold[cid])
        gold_pids = list([int(pid) for pid in gold_pids])
        correct = pid in gold_pids
        scores: List[Tuple[List[float], List[float]]] = relevance_scores[key]
        c_count = 0
        p_count = 0
        pc_count = 0
        for c_logits, p_logits in scores:
            c_rel = softmax(c_logits)[1] > 0.5
            p_rel = softmax(p_logits)[1] > 0.5
            c_count += int(c_rel)
            p_count += int(p_rel)
            pc_count += int(c_rel and p_rel)
        if correct:
            true_feature.append(pc_count)
        else:
            false_feature.append(pc_count)

    # Use the median feature value as the decision threshold.
    all_feature = true_feature + false_feature
    all_feature.sort()
    mid = int(len(all_feature) / 2)
    cut_off = all_feature[mid]
    tp = sum([int(t > cut_off) for t in true_feature])
    fp = sum([int(t > cut_off) for t in false_feature])
    tn = sum([int(t <= cut_off) for t in false_feature])
    fn = sum([int(t <= cut_off) for t in true_feature])
    print(tp, fp, tn, fn)
    print("true feature", average(true_feature))
    print("false feature", average(false_feature))
def main(): save_name = "qcknc_val" cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score(save_name) baseline_cid_grouped: Dict[int, List] = load_baseline("train_baseline") # baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data() gold = get_claim_perspective_id_dict() columns = ["cid", "doc_id", "num_good-num_bad"] rows = [columns] for cid_s, pid_entries in cid_grouped.items(): cid = int(cid_s) baseline_pid_entries = baseline_cid_grouped[cid] baseline_score_d: Dict[int, float] = fetch_score_per_pid( baseline_pid_entries) gold_pids = gold[cid] labels = [] per_doc_counter = Counter() for pid, entries in pid_entries.items(): label = any([pid in pids for pids in gold_pids]) labels.append(label) base_score = baseline_score_d[int(pid)] try: for doc_idx, entry in enumerate(entries): doc_id = entry['kdp'].doc_id score = get_score_from_entry(entry) value = doc_value(score, base_score, int(label)) value_type = doc_value_group(value) per_doc_counter[doc_id, value_type] += 1 except KeyError: print(cid, doc_idx, "not found") pass doc_ids = set(left(per_doc_counter.keys())) for doc_id in doc_ids: n_good = per_doc_counter[doc_id, "good"] n_bad = per_doc_counter[doc_id, "bad"] doc_score = n_good - n_bad row = [cid, doc_id, doc_score] if doc_score > 2 or doc_score < -2: rows.append(row) print_table(rows)
def main(): print("Loading doc score") doc_scores = dict(doc_score_predictions()) print("Loading cppnc scores") save_name = "qcknc_val" cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score(save_name) print(".") gold = get_claim_perspective_id_dict() baseline_cid_grouped: Dict[int, List] = load_baseline("train_baseline") claim_d = load_train_claim_d() for cid, pid_entries_d in cid_grouped.items(): pid_entries_d: Dict[str, List[Dict]] = pid_entries_d baseline_pid_entries = baseline_cid_grouped[int(cid)] baseline_score_d = fetch_score_per_pid(baseline_pid_entries) gold_pids = gold[int(cid)] def get_score_per_pid_entry(p_entries: Tuple[str, List[Dict]]): _, entries = p_entries return average(lmap(get_score_from_entry, entries)) pid_entries: List[Tuple[str, List[Dict]]] = list(pid_entries_d.items()) pid_entries.sort(key=get_score_per_pid_entry, reverse=True) s = "{} : {}".format(cid, claim_d[int(cid)]) print(s) doc_info_d, doc_value_arr, labels = collect_score_per_doc(baseline_score_d, get_score_from_entry, gold_pids, pid_entries) pids = left(pid_entries) head1 = [""] * 4 + pids head2 = ["avg", "doc_id", "passage_idx", "pknc_pred"] + lmap(bool_to_yn, labels) rows = [head1, head2] doc_score = doc_scores[cid] assert len(doc_value_arr) == len(doc_score) for doc_idx, (pred_score, doc_values) in enumerate(zip(doc_score, doc_value_arr)): doc_id, passage_idx = doc_info_d[doc_idx] avg = average(doc_values) row_float = [avg, doc_id, passage_idx, pred_score] + doc_values row = lmap(lambda x: "{0}".format(x), row_float) rows.append(row) print_table(rows)
def get_relevance_judgement_only_from_candidate():
    split = "dev"
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    valid_set = set()
    for cid, items in candidates:
        for e in items:
            pid = e['pid']
            valid_set.add((cid, pid))

    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    l = []
    for cid, clusters in gold.items():
        query_id = str(cid)
        pids = set(flatten(clusters))
        for pid in pids:
            if (cid, pid) in valid_set:
                e = TrecRelevanceJudgementEntry(query_id, str(pid), 1)
                l.append(e)
    return l
def build_gold_lms(claims) -> List[ClaimLM]:
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()

    def get_cluster_lm(cluster: List[int]) -> Counter:
        p_text_list: List[str] = lmap(perspective_getter, cluster)
        tokens_list: List[List[str]] = lmap(tokenizer.tokenize_stem, p_text_list)
        counter_list = lmap(tokens_to_freq, tokens_list)
        counter = average_counters(counter_list)
        return counter

    def get_claim_lm(claim) -> ClaimLM:
        cid = claim["cId"]
        counter_list: List[Counter] = lmap(get_cluster_lm, gold[cid])
        counter: Counter = average_counters(counter_list)
        return ClaimLM(cid, claim['text'], counter)

    claim_lms = lmap(get_claim_lm, claims)
    return claim_lms
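# Usage sketch (not in the original source): build unigram LMs over the gold
# perspective clusters of the training claims. train_split() is reused from
# save_to_csv/build_df above; ClaimLM fields follow the constructor above
# (cid, text, counter).
def build_train_gold_lms() -> List[ClaimLM]:
    claims, val = train_split()
    return build_gold_lms(claims)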
def perspective_lm_correlation():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    gold = get_claim_perspective_id_dict()
    predictions = predict_with_lm(claims, top_k)
    avg_pos_list = []
    avg_neg_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        pos_list = []
        neg_list = []
        print("Claim {}: ".format(c_Id), claim_text)
        for prediction in prediction_list:
            pid = prediction['pid']
            valid = False
            for cluster in gold_pids:
                if pid in cluster:
                    valid = True
                    break
            print("{0} {1:.2f} {2}".format(valid, prediction['lm_score'],
                                           prediction['perspective_text']))
            if not valid:
                neg_list.append(prediction['lm_score'])
            else:
                pos_list.append(prediction['lm_score'])

        if pos_list and neg_list:
            pos_score = average(pos_list)
            neg_score = average(neg_list)
            avg_pos_list.append(pos_score)
            avg_neg_list.append(neg_score)

    # scipy's ttest_ind returns (t-statistic, p-value).
    t_stat, p = ttest_ind(avg_pos_list, avg_neg_list)
    print("pos", average(avg_pos_list), "neg", average(avg_neg_list))
    print("pos", avg_pos_list)
    print("neg", avg_neg_list)
    print(t_stat, p)
def evaluate(predictions, debug=True):
    gold = get_claim_perspective_id_dict()
    prec_list = []
    recall_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        if debug:
            print("Claim {}: ".format(c_Id), claim_text)
        prec, recall = get_prec_recll(prediction_list, gold_pids, debug)
        prec_list.append(prec)
        recall_list.append(recall)

    avg_prec = average(prec_list)
    avg_recall = average(recall_list)
    # F1 is computed from the macro-averaged precision/recall,
    # not averaged per claim.
    return {
        'precision': avg_prec,
        'recall': avg_recall,
        'f1': get_f1(avg_prec, avg_recall),
    }
def get_candidates(claims, balance) -> List[PerspectiveCandidate]:
    related_p_map = get_claim_perspective_id_dict()
    related_p_map = {key: flatten(value) for key, value in related_p_map.items()}
    p_map = get_perspective_dict()
    all_data_points = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        rp = related_p_map[cid]
        pid_set = list([_pid for _text, _pid, _score in lucene_results])
        data_point_list = []
        for pid in pid_set:
            p_text = p_map[pid]
            label = 1 if pid in rp else 0
            data_point = PerspectiveCandidate(label=str(label),
                                              cid=cid,
                                              pid=pid,
                                              claim_text=claim_text,
                                              p_text=p_text)
            data_point_list.append(data_point)

        # If training, we balance positive and negative examples.
        if balance:
            pos_insts = list([e for e in data_point_list if e.label == "1"])
            neg_insts = list([e for e in data_point_list if e.label == "0"])
            neg_insts = neg_insts[:len(pos_insts)]
            data_point_list = pos_insts + neg_insts
        all_data_points.extend(data_point_list)
    return all_data_points
def build_df():
    claims, val = train_split()
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()
    df = Counter()
    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))
        for t in set(flatten(tokens_list)):
            df[t] += 1

    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
def main(input_path):
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    grouped_ranked_list = load_ranked_list_grouped(input_path)

    def is_correct(qid: str, doc_id: str):
        return any([int(doc_id) in cluster for cluster in gold[int(qid)]])

    top_k = 5
    for qid, entries in grouped_ranked_list.items():
        n_gold = sum(map(len, gold[int(qid)]))
        cut_n = min(n_gold, top_k)
        correctness = list([is_correct(qid, e.doc_id) for e in entries[:cut_n]])
        num_correct = sum(lmap(int, correctness))
        p_at_k = num_correct / cut_n
        pid_to_rank: Dict[str, int] = {e.doc_id: e.rank for e in entries}

        def get_rank(pid: int):
            if str(pid) in pid_to_rank:
                return pid_to_rank[str(pid)]
            else:
                return "X"

        if p_at_k < 0.3:
            print(n_gold)
            print(p_at_k)
            print("Claim {} {}".format(qid, claim_d[int(qid)]))
            for cluster in gold[int(qid)]:
                print("-")
                for pid in cluster:
                    print("[{}]".format(get_rank(pid)), perspective_getter(int(pid)))
            for e in entries[:50]:
                correct_str = "Y" if is_correct(qid, e.doc_id) else "N"
                print("{} {} {}".format(correct_str, e.score,
                                        perspective_getter(int(e.doc_id))))
def main():
    run_config = json.load(open(sys.argv[1], "r"))
    passage_score_path = run_config['passage_score_path']
    payload_name = run_config['payload_name']
    doc_scores: Dict[int, List[float]] = dict(load_doc_score_prediction(passage_score_path))
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_score_and_baseline_and_group(payload_name)
    gold = get_claim_perspective_id_dict()
    g_counter = Counter()
    columns = ["pid doc pair", "good", "bad", "no effect", "no effect pid"]
    rows = [columns]
    record = []
    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]
        labels = []
        counter = Counter()
        for cpid, things in pid_entries:
            _, pid = cpid
            label = any([pid in pids for pids in gold_pids])
            labels.append(label)
            base_score = baseline_score_d[pid]
            any_effect = False
            try:
                for doc_idx, per_doc in enumerate(things):
                    score = per_doc['score']
                    value = doc_value(score, base_score, int(label))
                    qknc_score = doc_scores[cid][doc_idx]
                    if qknc_score < 0:
                        continue
                    value_type = doc_value_group(value)
                    counter[value_type] += 1
                    if value_type in ["good", "bad"]:
                        record.append((cid, pid, doc_idx, value_type))
                    if value_type != "no effect":
                        any_effect = True
                    counter["pid doc pair"] += 1
                if not any_effect:
                    counter["no effect pid"] += 1
            except KeyError:
                print(cid, doc_idx, "not found")

        row = [cid] + list([counter[c] for c in columns])
        rows.append(row)
        for key, count in counter.items():
            g_counter[key] += count

    row = ["all"] + list([g_counter[c] for c in columns])
    rows.append(row)
    row = ["rate"] + list([g_counter[c] / g_counter["pid doc pair"] for c in columns])
    rows.append(row)
    print_table(rows)
    print_table(record)
def main(): print("Loading scores...") cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score_wrap() baseline_cid_grouped = load_baseline("train_baseline") gold = get_claim_perspective_id_dict() tokenizer = get_tokenizer() claim_d = load_train_claim_d() print("Start analyzing") html = HtmlVisualizer("cppnc_value_per_token_score.html") claim_cnt = 0 for cid, pid_entries_d in cid_grouped.items(): pid_entries_d: Dict[str, List[Dict]] = pid_entries_d pid_entries: List[Tuple[str, List[Dict]]] = list(pid_entries_d.items()) baseline_pid_entries = baseline_cid_grouped[int(cid)] baseline_score_d = fetch_score_per_pid(baseline_pid_entries) gold_pids = gold[int(cid)] ret = collect_score_per_doc(baseline_score_d, get_score_from_entry, gold_pids, pid_entries) passage_tokens_d = collect_passage_tokens(pid_entries) doc_info_d: Dict[int, Tuple[str, int]] = ret[0] doc_value_arr: List[List[float]] = ret[1] kdp_result_grouped = defaultdict(list) for doc_idx, doc_values in enumerate(doc_value_arr): doc_id, passage_idx = doc_info_d[doc_idx] avg_score = average(doc_values) kdp_result = doc_id, passage_idx, avg_score kdp_result_grouped[doc_id].append(kdp_result) s = "{} : {}".format(cid, claim_d[int(cid)]) html.write_headline(s) claim_cnt += 1 if claim_cnt > 10: break scores: List[float] = list([r[2] for r in doc_value_arr]) foreach(html.write_paragraph, lmap(str, scores)) for doc_id, kdp_result_list in kdp_result_grouped.items(): html.write_headline(doc_id) tokens, per_token_score = combine_collect_score(tokenizer, doc_id, passage_tokens_d, kdp_result_list) str_tokens = tokenizer.convert_ids_to_tokens(tokens) row = cells_from_tokens(str_tokens) for idx in range(len(str_tokens)): score = per_token_score[idx][0] norm_score = min(abs(score) * 10000, 100) color = "B" if score > 0 else "R" row[idx].highlight_score = norm_score row[idx].target_color = color rows = [row] nth = 0 any_score_found = True while any_score_found: any_score_found = False score_list = [] for idx in range(len(str_tokens)): if nth < len(per_token_score[idx]): score = per_token_score[idx][nth] any_score_found = True else: score = "-" score_list.append(score) def get_cell(score): if score == "-": return Cell("-") else: # 0.01 -> 100 norm_score = min(abs(score) * 10000, 100) color = "B" if score > 0 else "R" return Cell("", highlight_score=norm_score, target_color=color) nth += 1 if any_score_found: row = lmap(get_cell, score_list) rows.append(row) html.multirow_print_from_cells_list(rows)