def main():
    judgment_path = sys.argv[1]
    metric = sys.argv[2]
    ranked_list_path1 = sys.argv[3]
    ranked_list_path2 = sys.argv[4]
    qrels = load_qrels_flat(judgment_path)
    ranked_list_1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(ranked_list_path1)
    ranked_list_2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(ranked_list_path2)
    metric_fn = get_metric_fn(metric)

    score_d1 = get_score_per_query(qrels, metric_fn, ranked_list_1)
    score_d2 = get_score_per_query(qrels, metric_fn, ranked_list_2)

    # Keep only the queries that were scored by both runs.
    pairs = []
    for key in score_d1:
        try:
            e = (score_d1[key], score_d2[key])
            pairs.append(e)
        except KeyError:
            pass

    if len(pairs) < len(score_d1) or len(pairs) < len(score_d2):
        print("{} matched from {} and {} scores".format(
            len(pairs), len(score_d1), len(score_d2)))

    l1, l2 = zip(*pairs)
    d, p_value = stats.ttest_rel(l1, l2)
    print("baseline:", average(l1))
    print("treatment:", average(l2))
    print(d, p_value)

def per_doc_score():
    filename = "tlm_view.pickle"
    html_writer = HtmlVisualizer("per_doc_score.html", dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    amp = 20
    small_threshold = 40
    for inst_i, entry in enumerate(data):
        if inst_i > 1000:
            break
        scores = entry.get_vector("priority_score")
        tokens = entry.get_mask_resolved_input_mask_with_input()
        cells = data.cells_from_tokens(tokens)
        if len(cells) < small_threshold:
            continue
        avg_score = average(scores)
        # Skip instances whose average score lies between -0.30 and -0.11.
        if -0.11 > avg_score > -0.30:
            continue
        print(avg_score)
        html_writer.write_headline(avg_score)
        # Write the cells as table rows of 20 cells each.
        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush any remaining cells as a final, shorter row
            html_writer.write_table([row])

def avg_fn(l):
    r = average(l)
    # Count values that deviate from the mean by more than 0.5 (currently unused).
    cnt = 0
    for t in l:
        if abs(t - r) > 0.5:
            cnt += 1
    print(l)
    return average(l)

def main():
    score_path1 = sys.argv[1]
    score_path2 = sys.argv[2]
    l1 = get_score_per_query(score_path1)
    l2 = get_score_per_query(score_path2)
    assert len(l1) == len(l2)
    d, p_value = stats.ttest_rel(l1, l2)
    print("baseline:", average(l1))
    print("treatment:", average(l2))
    print(d, p_value)

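# A minimal, self-contained illustration (hypothetical helper, not part of the
# original module) of the paired t-test used above. It relies only on the same
# `stats.ttest_rel` and `average` names the surrounding code already uses; the
# per-query scores below are made-up numbers, not real system output.
def _demo_paired_ttest():
    baseline_scores = [0.21, 0.35, 0.10, 0.44, 0.28]   # one score per query
    treatment_scores = [0.25, 0.33, 0.15, 0.47, 0.30]  # same queries, second run
    t_stat, p_value = stats.ttest_rel(baseline_scores, treatment_scores)
    print("baseline:", average(baseline_scores))
    print("treatment:", average(treatment_scores))
    print(t_stat, p_value)
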
def valid_fn():
    loss_list = []
    acc_list = []
    for batch in dev_batches:
        loss_val, acc, g_step_val = sess.run(
            [task.loss, task.acc, global_step],
            feed_dict=batch2feed_dict(batch))
        loss_list.append(loss_val)
        acc_list.append(acc)

    log.info("Step dev step={0} loss={1:.04f} acc={2:.03f}".format(
        g_step_val, average(loss_list), average(acc_list)))
    return average(acc_list)

def sanity_check():
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids("train")
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}

    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)

    ap_baseline = []
    ap_new_score = []
    for qid, entries in dvp_qid_grouped.items():
        ranked_list_new = []
        ranked_list_baseline = []
        candidate_id_grouped = group_by(entries, get_candidate)
        for candidate_id, entries2 in candidate_id_grouped.items():
            is_initial_candidate = candidate_id in candidate_d[qid]
            gold = entries2[0].label
            skip = gold and not is_initial_candidate

            def get_new_score(dvp: DocValueParts2):
                return dvp.score

            def get_baseline_score(dvp: DocValueParts2):
                return dvp.init_score

            if skip:
                continue
            new_score = top_k_avg(lmap(get_new_score, entries2))
            baseline_score = average(lmap(get_baseline_score, entries2))
            ranked_list_new.append((candidate_id, new_score, gold))
            ranked_list_baseline.append((candidate_id, baseline_score, gold))

        def get_ap(ranked_list):
            ranked_list.sort(key=lambda x: x[1], reverse=True)
            p_list = []
            p = 0
            for rank, (cid, score, gold) in enumerate(ranked_list):
                if gold:
                    p += 1
                    p_list.append(p / (rank + 1))
            return average(p_list)

        ap_baseline.append(get_ap(ranked_list_baseline))
        ap_new_score.append(get_ap(ranked_list_new))

    print("MAP baseline", average(ap_baseline))
    print("MAP new score", average(ap_new_score))

def valid_fn_factory(sess, dev_batches, loss_tensor, acc_tensor,
                     global_step_tensor, batch2feed_dict):
    loss_list = []
    acc_list = []
    for batch in dev_batches:
        loss_val, acc, g_step_val = sess.run(
            [loss_tensor, acc_tensor, global_step_tensor],
            feed_dict=batch2feed_dict(batch))
        loss_list.append(loss_val)
        acc_list.append(acc)

    tf_logging.info("Step dev step={0} loss={1:.04f} acc={2:.03f}".format(
        g_step_val, average(loss_list), average(acc_list)))
    return average(acc_list)

def get_precision_recall(
        input_entries: List[Tuple[QCKQuery, List[QCKCandidate]]]) -> Dict:
    gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    all_scores = []
    for query, ranked_list in input_entries:
        e_id_list = lmap(QCKCandidate.get_id, ranked_list)
        gold_id = gold_dict[query.query_id]
        tp = 0
        for e_id in e_id_list:
            if e_id in gold_id:
                tp += 1

        precision = tp / len(e_id_list) if len(e_id_list) else 1
        recall = tp / len(gold_id) if len(gold_id) else 1
        # Per-entry F1 is computed here but only the aggregate F1 is reported below.
        f1 = get_f1(precision, recall)
        per_score = {
            'precision': precision,
            'recall': recall,
        }
        all_scores.append(per_score)

    average_scores = {}
    for metric in ['precision', 'recall']:
        average_scores[metric] = average([e[metric] for e in all_scores])
    average_scores['f1'] = get_f1(average_scores['precision'],
                                  average_scores['recall'])
    return average_scores

def main():
    info_d = {}
    for job_id in range(5):
        p = os.path.join(cpath.data_path, "tlm", "pred",
                         "info_d_{}.pickle".format(job_id))
        d = pickle.load(open(p, "rb"))
        info_d.update(d)

    p = os.path.join(cpath.data_path, "tlm", "pred", "tlm1.pickle")
    pred = pickle.load(open(p, "rb"))

    p_l = list([list() for i in range(5)])
    tf_id_set = set()
    for e in pred:
        tf_id = info_d[e.unique_ids]
        # Only use the first prediction seen for each tf_id.
        if tf_id not in tf_id_set:
            tf_id_set.add(tf_id)
            loss = e.losses
            print(tf_id, e.unique_ids, loss)
            # The last digit of unique_ids is assumed to encode the type (0-4).
            j = e.unique_ids % 10
            p_l[j].append(loss)

    for i in range(5):
        print("Type : {} : {}".format(i, average(p_l[i])))

def debug_failture(predictions):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set,
                                prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)
        if not any(correctness_list):  # all wrong
            continue
        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct = pid in gold_pids_set
            if correct:
                correct_str = "Y"
            else:
                correct_str = "N"
            score = prediction['score']
            print(correct_str, score, score.name, prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    map = average(ap_list)
    return {'map': map}

def train_test_repeat(load_id, exp_name, n_repeat):
    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.name = "RTE_{}".format("A")
    e_config.num_epoch = 10
    e_config.save_interval = 30 * 60  # 30 minutes
    e_config.load_names = ['bert']
    vocab_filename = "bert_voca.txt"
    data_loader = rte.DataLoader(hp.seq_max, vocab_filename, True)
    print(load_id)

    scores = []
    for i in range(n_repeat):
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "rte_{}".format(exp_name)
        save_path = e.train_rte(e_config, data_loader, load_id)
        acc = e.eval_rte(e_config, data_loader, save_path)
        scores.append(acc)

    print(exp_name)
    for e in scores:
        print(e, end="\t")
    print()
    r = average(scores)
    print("Avg\n{0:.03f}".format(r))
    return r

def get_ap(predicted_perspectives, gold_pids, debug):
    # In this metric, precision can exceed 1 because some clusters share the
    # same perspective.
    # if debug:
    #     print(gold_pids)
    #     for cluster in gold_pids:
    #         print("-")
    #         for pid in cluster:
    #             print(pid, perspective_getter(pid))

    def is_correct(pid):
        for cluster in gold_pids:
            if pid in cluster:
                return True
        return False

    tp = 0
    precision_list = []
    for idx, prediction in enumerate(predicted_perspectives):
        pid = prediction['pid']
        if is_correct(pid):
            tp += 1
            n_pred = idx + 1
            prec = tp / n_pred
            precision_list.append(prec)
            correct_str = "Y"
        else:
            correct_str = "N"
        if debug:
            print(correct_str, prediction['score'], prediction['rationale'],
                  pid, prediction['perspective_text'])

    assert tp == len(precision_list)
    ap = average(precision_list) if tp > 0 else 1
    return ap

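# A small, hypothetical illustration of get_ap above (not part of the original
# module). gold_pids is a list of pid clusters; with debug=False the predictions
# only need a 'pid' key. All ids and the expected value are made up by hand.
def _demo_get_ap_clusters():
    predictions = [{'pid': 1}, {'pid': 9}, {'pid': 3}]
    gold_clusters = [[1, 2], [3, 4]]
    # Hits at ranks 1 and 3: AP = (1/1 + 2/3) / 2 ≈ 0.83
    print(get_ap(predictions, gold_clusters, False))
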
def main():
    judgment_path = sys.argv[1]
    ranked_list_path = sys.argv[2]
    metric = sys.argv[3]
    qrels = load_qrels_flat_per_query(judgment_path)
    ranked_list: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        ranked_list_path)
    metric_fn = get_metric_fn(metric)

    score_per_query_list = []
    not_found = 0
    for query_id in ranked_list:
        q_ranked_list = ranked_list[query_id]
        try:
            gold_list = qrels[query_id]
            true_gold = list(
                [doc_id for doc_id, score in gold_list if score > 0])
            score_per_query = metric_fn(q_ranked_list, true_gold)
            score_per_query_list.append(score_per_query)
        except KeyError:
            not_found += 1

    if not_found:
        print("{} of {} queries not found".format(not_found, len(ranked_list)))

    score = average(score_per_query_list)
    print("{}\t{}".format(metric, score))

def extract_qk_unit(info_path, pred_path, config_path) -> Iterable[QKUnit]:
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions,
                                              lambda x: x['query'].query_id)
    config = json.load(open(config_path, "r"))
    score_cut = config['score_cut']
    top_k = config['top_k']

    def is_good(entry):
        return get_regression_score(entry) > score_cut

    select_rate_list = []
    qk_units = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        good_entries = lfilter(is_good, entries)
        good_entries.sort(key=get_regression_score, reverse=True)
        selected_entries = good_entries[:top_k]
        if not selected_entries:
            continue
        kd_list = lmap(lambda x: x['kdp'], selected_entries)
        qk_units.append((query, kd_list))
        select_rate = len(selected_entries) / len(entries)
        select_rate_list.append(select_rate)

    print("{} of {} qk units selected".format(len(qk_units), len(grouped)))
    print("average select rate", average(select_rate_list))
    return qk_units

def show(n):
    topic = "abortion"
    count = load_n_gram_from_pickle(topic, n)
    clueweb_tf, clueweb_df = load_subword_term_stat()
    clueweb_idf = df_to_idf(clueweb_df)
    c_tf, nc_tf = load_from_pickle("abortion_clm")
    avg_idf = average(list(clueweb_idf.values()))

    def get_idf(t):
        if t in clueweb_idf:
            return clueweb_idf[t]
        else:
            return avg_idf

    l = list(count.items())
    skip_count = 0
    l.sort(key=lambda x: x[1], reverse=True)
    for n_gram, cnt in l[:1000]:
        if is_single_char_n_gram(n_gram):
            skip_count += 1
        else:
            idf_sum = sum([get_idf(t) for t in n_gram])
            print("{} {}".format(n_gram, cnt)
                  + " {0:.2f} {1:.2f} ".format(idf_sum, cnt * idf_sum))
    print("Skip", skip_count)

def statistics_tlm():
    filename = "blc_cold_scores.pickle"
    data = EstimatorPredictionViewerGosford(filename)
    bins = {}
    bin_fn = get_bin_fn_from_interval(0, 1.05, 0.05)
    for inst_i, entry in enumerate(data):
        loss1 = entry.get_vector("lm_loss1")
        loss2 = entry.get_vector("lm_loss2")
        prob1 = loss_to_prob(loss1)
        prob2 = loss_to_prob(loss2)
        tokens = entry.get_mask_resolved_input_mask_with_input()
        for i, _ in enumerate(tokens):
            key = bin_fn(prob1[i])
            if key not in bins:
                bins[key] = []
            bins[key].append(prob2[i])

    keys = list([k for k in bins.keys() if not k == "Unidentifed"])
    keys.sort(key=lambda x: x[0])
    mean_dict = {}
    std_dict = {}
    for key in keys:
        l = average(bins[key])
        std = np.std(bins[key])
        mean_dict[key] = l
        std_dict[key] = std
        st, ed = key
        # print("{0:.2f} {1:.2f}".format(st, ed), l)
    return bin_fn, mean_dict, std_dict

def summarize_score(info: Dict,
                    prediction_file_path: str,
                    f_handler: FormatHandler,
                    combine_score: Callable,
                    score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry['logits'])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data,
                                                          f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        final_score = combine_score(scores)
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    tprint("Num items per group : ", num_items_per_group)
    return out_d

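# A quick standalone check (hypothetical helper, not part of the original module)
# of the "softmax" scoring path above: for a binary logit pair, softmax(logit)[1]
# is the positive-class probability. Relies on the same scipy.special name used
# above; the logits are made-up values.
def _demo_logit_to_score_softmax():
    logit = [0.2, 1.3]
    print(scipy.special.softmax(logit)[1])  # ≈ 0.75
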
def view_grad_overlap():
    filename = "gradient_overlap_4K.pickle"
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)

    # First pass: collect overlap scores and per-loss-bin statistics.
    data = EstimatorPredictionViewerGosford(filename)
    iba = IntBinAverage()
    scores = []
    for inst_i, entry in enumerate(data):
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        score = entry.get_vector("overlap_score")
        if masked_lm_example_loss > 1:
            norm_score = score / masked_lm_example_loss
            iba.add(masked_lm_example_loss, norm_score)
        scores.append(score)

    score_avg = average(scores)
    score_std = np.std(scores)
    avg = iba.all_average()
    std_dict = {}
    for key, values in iba.list_dict.items():
        std_dict[key] = np.std(values)
        if len(values) == 1:
            std_dict[key] = 999

    def unlikeliness(value, mean, std):
        return abs(value - mean) / std

    # Second pass: render each instance against its bin expectation.
    data = EstimatorPredictionViewerGosford(filename)
    print("num record : ", data.data_len)
    cnt = 0
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        highlight = lmap(is_mask, tokens)
        score = entry.get_vector("overlap_score")
        print(score)
        cells = data.cells_from_tokens(tokens, highlight)
        if masked_lm_example_loss > 1:
            bin_key = int(masked_lm_example_loss)
            norm_score = score / masked_lm_example_loss
            if norm_score > 5000:
                cnt += 1
            expectation = avg[bin_key]
            # "or True" currently disables the unlikeliness filter, so every
            # instance is rendered.
            if unlikeliness(score, score_avg, score_std) > 2 or True:
                html_writer.multirow_print(cells, 20)
                if norm_score > expectation:
                    html_writer.write_paragraph("High")
                else:
                    html_writer.write_paragraph("Low")
                html_writer.write_paragraph("Norm score: " + str(norm_score))
                html_writer.write_paragraph("score: " + str(score))
                html_writer.write_paragraph("masked_lm_example_loss: "
                                            + str(masked_lm_example_loss))
                html_writer.write_paragraph("expectation: " + str(expectation))
    print("number over 5000: ", cnt)

def main():
    relevance_scores: Dict[CPIDPair, List[Tuple[Logits, Logits]]] = \
        load_from_pickle("pc_relevance_score")
    gold = get_claim_perspective_id_dict()
    true_feature = []
    false_feature = []
    ticker = TimeEstimator(len(relevance_scores))
    for key in relevance_scores:
        ticker.tick()
        cid, pid = key
        gold_pids = flatten(gold[cid])
        gold_pids = list([int(pid) for pid in gold_pids])
        correct = pid in gold_pids
        scores: List[Tuple[List[float], List[float]]] = relevance_scores[key]
        c_count = 0
        p_count = 0
        pc_count = 0
        for c_logits, p_logits in scores:
            c_rel = softmax(c_logits)[1] > 0.5
            p_rel = softmax(p_logits)[1] > 0.5
            c_count += int(c_rel)
            p_count += int(p_rel)
            pc_count += int(c_rel and p_rel)

        if correct:
            true_feature.append(pc_count)
        else:
            false_feature.append(pc_count)

    # Split at the median feature value and report the resulting confusion counts.
    all_feature = true_feature + false_feature
    all_feature.sort()
    mid = int(len(all_feature) / 2)
    cut_off = all_feature[mid]
    tp = sum([int(t > cut_off) for t in true_feature])
    fp = sum([int(t > cut_off) for t in false_feature])
    tn = sum([int(t <= cut_off) for t in false_feature])
    fn = sum([int(t <= cut_off) for t in true_feature])
    print(tp, fp, tn, fn)
    print("true feature", average(true_feature))
    print("false feature", average(false_feature))

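# A standalone illustration (hypothetical helper, not in the original module) of
# the median cut-off rule above, on made-up feature counts: values strictly above
# the cut-off count as predicted-positive.
def _demo_median_cutoff():
    true_feature = [5, 7, 9]
    false_feature = [1, 2, 6]
    all_feature = sorted(true_feature + false_feature)
    cut_off = all_feature[int(len(all_feature) / 2)]     # here: 6
    tp = sum([int(t > cut_off) for t in true_feature])   # 2
    fp = sum([int(t > cut_off) for t in false_feature])  # 0
    tn = sum([int(t <= cut_off) for t in false_feature]) # 3
    fn = sum([int(t <= cut_off) for t in true_feature])  # 1
    print(tp, fp, tn, fn)
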
def text_len():
    d = get_tokens()
    data = list([list() for _ in range(5)])
    for key, tokens in d:
        data[key].append(len(pretty_tokens(tokens)))

    for i in range(5):
        print(i, average(data[i]))

def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    claims = claims[:10]

    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)

def main():
    claim_text_d: Dict[int, str] = get_all_claim_d()
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))
        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)

def run_eval_threaded(split, predictor_getter):
    print("Loading data..")
    problems: List[ArguDataPoint] = list(load_labeled_data(split))
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    print("starting predictions")
    predictions = parallel_run(payload, (split, predictor_getter),
                               eval_thread, 5)
    correctness = eval_correctness(predictions, problems)
    avg_p_at_1 = average(correctness)
    print(avg_p_at_1)

def get_ap(ranked_list):
    ranked_list.sort(key=lambda x: x[1], reverse=True)
    p_list = []
    p = 0
    for rank, (cid, score, gold) in enumerate(ranked_list):
        if gold:
            p += 1
            p_list.append(p / (rank + 1))
    return average(p_list)

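# A quick illustration (hypothetical helper, not part of the original module) of
# get_ap above on (candidate_id, score, gold) tuples; assumes the module-level
# `average` helper is in scope and uses made-up values.
def _demo_get_ap_tuples():
    ranked = [("c1", 0.9, True), ("c2", 0.8, False), ("c3", 0.4, True)]
    # Relevant items end up at ranks 1 and 3: AP = (1/1 + 2/3) / 2 ≈ 0.83
    print(get_ap(ranked))
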
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)

    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")

def view():
    filename = os.path.join(output_path, "nli_dev_loss.pickle")
    data = EstimatorPredictionViewerGosford(filename)
    loss_arr = []
    for inst_i, entry in enumerate(data):
        t = entry.get_vector("loss")
        loss_arr.append(float(t))

    print(len(loss_arr))
    print("avg:", average(loss_arr))

def perspective_lm_correlation():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    gold = get_claim_perspective_id_dict()
    predictions = predict_with_lm(claims, top_k)

    avg_pos_list = []
    avg_neg_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        pos_list = []
        neg_list = []
        print("Claim {}: ".format(c_Id), claim_text)
        for prediction in prediction_list:
            pid = prediction['pid']
            valid = False
            for cluster in gold_pids:
                if pid in cluster:
                    valid = True
                    break
            print("{0} {1:.2f} {2}".format(valid, prediction['lm_score'],
                                           prediction['perspective_text']))
            if not valid:
                neg_list.append(prediction['lm_score'])
            else:
                pos_list.append(prediction['lm_score'])

        if pos_list and neg_list:
            pos_score = average(pos_list)
            neg_score = average(neg_list)
            avg_pos_list.append(pos_score)
            avg_neg_list.append(neg_score)

    diff, p = ttest_ind(avg_pos_list, avg_neg_list)
    print("pos", average(avg_pos_list), "neg", average(avg_neg_list))
    print("pos", avg_pos_list)
    print("neg", avg_neg_list)
    print(diff, p)

def generate(
        self,
        query_list,
        data_id_manager: DataIDManager) -> List[ClassificationInstanceWDataID]:
    neg_k = 1000
    all_insts = []
    pos_n_segment = []
    neg_n_segment = []
    for query_id in query_list:
        if query_id not in self.judgement:
            continue
        judgement = self.judgement[query_id]
        query = self.queries[query_id]
        query_tokens = self.tokenizer.tokenize(query)

        ranked_list = self.galago_rank[query_id]
        ranked_list = ranked_list[:neg_k]

        target_docs = set(judgement.keys())
        target_docs.update([e.doc_id for e in ranked_list])
        print("Total of {} docs".format(len(target_docs)))

        for doc_id in target_docs:
            tokens = self.data[doc_id]
            insts: List[Tuple[List, List]] = self.encoder.encode(query_tokens,
                                                                 tokens)
            label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0
            target_indices = self.target_selection_fn(query_id, doc_id, insts)
            n_segment = len(target_indices)
            if label:
                pos_n_segment.append(n_segment)
            else:
                neg_n_segment.append(n_segment)

    print("num pos docs: ", len(pos_n_segment))
    print("num neg docs: ", len(neg_n_segment))
    print("avg n_seg per doc [pos]", average(pos_n_segment))
    print("avg n_seg per doc [neg]", average(neg_n_segment))
    # Note: only segment statistics are collected; all_insts is returned empty.
    return all_insts

def average_likelihood(sent, fn_get_prob):
    log_p = 0
    for i, w in enumerate(sent):
        # Average the pair probability of w against every other word in the sentence.
        p_list = []
        for j, w2 in enumerate(sent):
            if i != j:
                p = fn_get_prob(w, w2)
                p_list.append(p)
        avg_p = average(p_list)
        log_p += math.log(avg_p)
    return math.exp(log_p)

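# A tiny usage sketch (hypothetical helper, not part of the original module) of
# average_likelihood above, with a dummy pair-probability function; assumes the
# module-level `average` helper and the `math` import are in scope.
def _demo_average_likelihood():
    def uniform_prob(w, w2):
        return 0.1  # placeholder: every word pair is equally likely

    sent = ["the", "court", "ruled"]
    # Each word averages 0.1 over the other two, so the result is 0.1 ** 3.
    print(average_likelihood(sent, uniform_prob))  # 0.001 (up to float error)
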
def evaluate(predictions, debug=True):
    gold = get_claim_perspective_id_dict()
    prec_list = []
    recall_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        if debug:
            print("Claim {}: ".format(c_Id), claim_text)
        prec, recall = get_prec_recll(prediction_list, gold_pids, debug)
        prec_list.append(prec)
        recall_list.append(recall)

    avg_prec = average(prec_list)
    avg_recall = average(recall_list)
    return {
        'precision': avg_prec,
        'recall': avg_recall,
        'f1': get_f1(avg_prec, avg_recall)
    }