def main():
    save_dir = os.path.join(output_path, "pc_qc4")
    exist_or_mkdir(save_dir)
    split_filename = split_name2
    for split in splits:
        qids: Iterable[str] = get_qids_for_split(split_filename, split)
        queries = get_qck_queries_from_cids(lmap(int, qids))
        eval_candidate = get_qck_candidate_for_split(split_filename, split)
        save_path = os.path.join(save_dir, split)
        make_pc_qc(queries, eval_candidate, is_correct_factory(), save_path)

def get_lm_scorer(claim_lms: List[ClaimLM], alpha):
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    claim_log_odds_dict: Dict[int, Counter] = {c_lm.cid: get_log_odd(c_lm, bg_lm, alpha)
                                               for c_lm in claim_lms}

    def scorer(claim_id: int, p_tokens: List[str]) -> NamedNumber:
        c_lm = claim_log_odds_dict[claim_id]
        reason = " ".join(["{0} ({1:.2f})".format(t, c_lm[t]) for t in p_tokens])
        score = sum([c_lm[t] for t in p_tokens])
        return NamedNumber(score, reason)

    return scorer

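# Usage sketch (hypothetical values): the returned scorer closes over the
# per-claim log-odds counters, so scoring a passage is one lookup per token.
#
#   scorer = get_lm_scorer(claim_lms, alpha=0.1)
#   s = scorer(claim_id=3, p_tokens=["economy", "growth"])
#   print(float(s))  # total log-odds; the per-token breakdown is carried as the reason
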
def get_ap_from_file_path(input_path):
    tf_prediction_data = load_pickle_from(input_path)
    tf_prediction_data = flatten_batches(tf_prediction_data)
    logits = tf_prediction_data["logits"]
    label_ids = tf_prediction_data["label_ids"]
    scores = lmap(logit_to_score_softmax, logits)
    assert len(scores) == len(label_ids)
    return get_ap(label_ids, scores)

def encode(inst: Payload) -> OrderedDict:
    tokens_1_1: List[str] = tokenizer.tokenize(inst.text1)
    tokens_1_2: List[str] = tokenizer.tokenize(inst.text2)

    def tokenize_from_tokens_fn(tokens):
        return tokenize_from_tokens(tokenizer, tokens)

    tokens_2_list: List[List[str]] = lmap(tokenize_from_tokens_fn, inst.passage_list)
    tokens, segment_ids = combine_with_sep_cls(max_seq_length, tokens_1_1, tokens_1_2)
    input_ids, input_mask, segment_ids = get_basic_input_feature_as_list(
        tokenizer, max_seq_length, tokens, segment_ids)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)

    def iterate_over(tokens1, tokens2_list) -> Tuple[List[str], List[int]]:
        dummy_tokens = ["[PAD]"] * max_seq_length
        dummy_segment_ids = [0] * max_seq_length

        def make_for_each_window(tokens2):
            tokens, segment_ids = combine_and_pad(tokens1, tokens2)
            return tokens, segment_ids

        tokens_and_segment_ids_list: List[Tuple[List[str], List[int]]] = \
            lmap(make_for_each_window, tokens2_list[:num_windows])
        pad_len = num_windows - len(tokens_and_segment_ids_list)
        tokens_and_segment_ids_list += [(dummy_tokens, dummy_segment_ids)] * pad_len
        tokens_list, segment_ids_list = zip(*tokens_and_segment_ids_list)
        return lflatten(tokens_list), lflatten(segment_ids_list)

    def get_second_feature_parts(tokens1, tokens2_list):
        tokens, segment_ids = iterate_over(tokens1, tokens2_list)
        return get_basic_input_feature_as_list(tokenizer, d_max_seq_length,
                                               tokens, segment_ids)

    input_ids, input_mask, segment_ids = get_second_feature_parts(tokens_1_2, tokens_2_list)
    features["input_ids2"] = create_int_feature(input_ids)
    features["input_mask2"] = create_int_feature(input_mask)
    features["segment_ids2"] = create_int_feature(segment_ids)

    input_ids, input_mask, segment_ids = get_second_feature_parts(tokens_1_1, tokens_2_list)
    features["input_ids3"] = create_int_feature(input_ids)
    features["input_mask3"] = create_int_feature(input_mask)
    features["segment_ids3"] = create_int_feature(segment_ids)

    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

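# For reference, the feature layout encode() emits, as inferred from the code above:
#   input_ids / input_mask / segment_ids    : text1 paired with text2,
#                                             length max_seq_length
#   input_ids2 / input_mask2 / segment_ids2 : text2 paired with up to num_windows
#                                             passages (padded with [PAD] windows),
#                                             total length d_max_seq_length
#   input_ids3 / input_mask3 / segment_ids3 : the same passage windows paired with text1
#   label_ids / data_id                     : per-instance label and data id
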
def context_viewer(target_topic):
    docs = get_relevant_docs(target_topic)[:100]
    predictor = Predictor(target_topic)

    def get_topic_stance(sents, target_topic):
        return predictor.predict(target_topic, sents)

    window_size = [-3, 1]  # inclusive
    # window_size = [0, 0]

    def window(center_loc, list_len):
        start = max(0, center_loc + window_size[0])
        end = min(list_len - 1, center_loc + window_size[1])
        return start, end + 1

    line_split = sent_tokenize
    sents_list = lmap(line_split, docs)
    # topic_stances_list = load_from_pickle("stance_{}_rel.pickle".format(target_topic))
    topic_stances_list = flat_apply_stack(
        lambda x: get_topic_stance(x, target_topic), sents_list, False)
    save_to_pickle(topic_stances_list, "stance_{}_rel.pickle".format(target_topic))

    def summarize_stance(list_stance):
        assert len(list_stance) > 0
        stance_count = Counter()
        for s in list_stance:
            stance_count[s] += 1
        # 3 marks a window that contains both stance 1 and stance 2.
        if stance_count[1] > 0 and stance_count[2] > 0:
            return 3
        for stance in [1, 2]:
            if stance_count[stance] > 0:
                return stance
        return 0

    def contains(sents, query):
        return query in " ".join(sents)

    count = Counter()
    for doc_idx, doc in enumerate(docs):
        sents = line_split(doc)
        num_sents = len(sents)
        if num_sents < 1:
            print("Skip doc #{}".format(doc_idx))
            continue
        topic_stances = topic_stances_list[doc_idx]
        for i, sent in enumerate(sents):
            st, ed = window(i, num_sents)
            a_stance = summarize_stance(topic_stances[st:ed])
            if a_stance in [1, 2]:
                print("-------------")
                for j in range(st, ed):
                    print(topic_stances[j], sents[j])
                print("-------------")

def get_rm(data_point):
    label, cid, pid, claim_text, p_text = data_point
    file_name = "{}_{}_{}.txt".format(disk_name, cid, pid)

    def parse_line(line):
        term, prob = line.strip().split("\t")
        # prob = float(prob) * 1000
        return term, prob

    with open(os.path.join(dir_path, file_name)) as f:
        return lmap(parse_line, f), int(label)

def preload_docs(ranked_list, claims, top_n):
    def get_doc_ids(claim: Dict):
        # Take the top_n doc ids from the ranked list of this claim.
        q_res: List[SimpleRankedListEntry] = ranked_list[str(claim['cId'])]
        return list([q_res[i].doc_id for i in range(top_n)])

    all_doc_ids: Set[str] = set(flatten(lmap(get_doc_ids, claims)))
    print(f"total of {len(all_doc_ids)} docs")
    print("Accessing DB")
    # Get the docs from DB
    preload_man.preload(TokenizedCluewebDoc, all_doc_ids)

def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
    claim_text = claim_text_d[cluster.claim_id]
    perspective_text_list = list(
        [perspective_text_d[pid] for pid in cluster.perspective_ids])
    query_id = get_pc_cluster_query_id(cluster)

    claim_tf: Counter = get_terms(claim_text)
    pers_tf: Counter = average_counters(lmap(get_terms, perspective_text_list))
    tf = sum_counters([claim_tf, pers_tf])
    query: DocQuery = counter_to_galago_query(query_id, tf)
    return query

def cap_ed(ss_list: List[SegmentScore], step_size) -> List[SegmentScore]:
    max_start_idx = max([s.start_idx for s in ss_list])
    cap_end_idx = max_start_idx + step_size

    def transform(ss: SegmentScore):
        if ss.end_idx < cap_end_idx:
            return ss
        else:
            return SegmentScore(ss.start_idx, cap_end_idx, ss.score)

    return lmap(transform, ss_list)

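# A runnable sketch of cap_ed's truncation behavior, assuming SegmentScore is
# a simple (start_idx, end_idx, score) container as the constructor call above
# suggests. Values are hypothetical.
if __name__ == "__main__":
    segs = [SegmentScore(0, 10, 0.5), SegmentScore(2, 10, 0.8)]
    # max start_idx is 2, so with step_size=2 both segments are capped to end at 4.
    for s in cap_ed(segs, step_size=2):
        print(s.start_idx, s.end_idx, s.score)
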
def load_qk_score(config) -> List[QKOutEntry]:
    info_path = config['info_path']
    passage_score_path = config['pred_path']
    score_type = config['score_type']
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    data: List[Dict] = join_prediction_with_info(passage_score_path,
                                                 data_id_to_info,
                                                 fetch_field_list)
    qk_out_entries: List[QKOutEntry] = lmap(QKOutEntry.from_dict2, data)
    return qk_out_entries

def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()

    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)  # computed for reference; not used below

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")

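# Passage scoring used above, for reference: each non-stopword token t of a
# passage contributes log P(t | claim LM) - log P(t | background LM) (the
# smoothed log odds), the passage score is the mean over its tokens (0 for an
# empty passage), and passages with a positive score are kept as "a-relevant".
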
def get_eval_candidates_1k_as_qck(split) -> Dict[str, List[QCKCandidate]]:
    cid_dict_format: List[Tuple[int, List[Dict]]] = get_eval_candidates_1k(split)

    def convert(e) -> Tuple[int, List[int]]:
        cid, p_list = e
        return cid, lmap(lambda p: p['pid'], p_list)

    cid_pid_format: List[Tuple[int, List[int]]] = lmap(convert, cid_dict_format)
    return cid_pid_format_to_qck(cid_pid_format)

def generate_instances(self,
                       claim: Dict,
                       data_id_manager: DataIDManager) -> List[PairedInstance]:
    cid = claim['cId']
    claim_text = claim['text']
    passages = self.cid_to_passages[cid]
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))

    n_good = len(good_passages)
    n_not_good = len(not_good_passages)

    # len(pair_list_g_ng) = n_not_good (assuming n_not_good > n_good)
    pair_list_g_ng: List[Tuple[List[str], List[str]]] = generate_pairwise_combinations(
        not_good_passages, good_passages, True)

    # len(pair_list_g_rand) = n_good
    pair_list_g_rand: List[Tuple[List[str], List[str]]] = list([
        (inst, self.random_sample(cid)) for inst in good_passages
    ])

    # len(pair_list_ng_rand) = n_not_good
    pair_list_ng_rand: List[Tuple[List[str], List[str]]] = list([
        (inst, self.random_sample(cid)) for inst in not_good_passages
    ])

    def make_instance(passage_pair, strict_good, strict_bad):
        passage_good, passage_worse = passage_pair
        info = {'cid': cid}
        return PairedInstance(claim_text, passage_good, passage_worse,
                              strict_good, strict_bad,
                              data_id_manager.assign(info))

    l1 = lmap(lambda pair: make_instance(pair, 1, 0), pair_list_g_ng)
    l2 = lmap(lambda pair: make_instance(pair, 0, 1), pair_list_ng_rand)
    l3 = lmap(lambda pair: make_instance(pair, 1, 1), pair_list_g_rand)
    print("g-ng : ng-rand : g-rand = {} : {} : {}".format(len(l1), len(l2), len(l3)))
    return l1 + l2 + l3

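# Pair construction summary for generate_instances() above:
#   (good, not_good)   -> strict_good=1, strict_bad=0
#   (not_good, random) -> strict_good=0, strict_bad=1
#   (good, random)     -> strict_good=1, strict_bad=1
# where "random" is a passage drawn via self.random_sample(cid), whose
# sampling behavior is defined elsewhere in the class.
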
def get_candidate(split) -> Dict[str, List[QCKCandidateI]]:
    tokenizer = get_tokenizer()
    queries = get_qck_queries(split)
    max_seq_length = 512

    def get_candidate_for_query(query: QCKQuery):
        res = get_evidence_from_pool(query.text, 60)
        query_len = len(tokenizer.tokenize(query.text))
        # 3 presumably reserves room for special tokens ([CLS]/[SEP]).
        candidate_max_len = max_seq_length - 3 - query_len
        output = []
        for text, e_id, score in res:
            tokens = tokenizer.tokenize(text)
            for passage in enum_passage(tokens, candidate_max_len):
                c = QCKCandidateWToken(str(e_id), "", passage)
                output.append(c)
        return output

    qid_list = lmap(lambda q: q.query_id, queries)
    candidate_list_list = lmap(get_candidate_for_query, queries)
    return dict(zip(qid_list, candidate_list_list))

def calculate_score(info,
                    pred_path,
                    baseline_score: Dict[Tuple[str, str], float],
                    str_data_id=False) -> List[DocValueParts]:
    predictions: List[Dict] = join_prediction_with_info(pred_path, info,
                                                        ["logits"], str_data_id)
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)
    labels: Dict[str, List[str]] = load_labels()
    doc_score_parts: List[DocValueParts] = get_doc_value_parts(
        out_entries, baseline_score, labels)
    return doc_score_parts

def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # Load claim LMs.
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()

    # Load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))

def print_features():
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    with open(file_path, "rb") as f:
        features: List[ParagraphClaimPersFeature] = pickle.load(f)
    features: List[ParagraphFeature] = lmap(to_paragraph_feature, features)
    out_path = pjoin(output_path, FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(features, out_path)

def work(self, job_id):
    qid_list = self.query_group[job_id]
    ticker = TimeEstimator(len(qid_list))
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        # Count missing docs, split by whether they are relevant in the qrels.
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        rel_docs = self.ms_reader.qrel[query_id]
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    for qid in qid_list:
        if qid not in self.candidate_docs_d:
            continue
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        ticker.tick()
        target_docs = self.candidate_docs_d[qid]

        text_d = {}
        bert_tokens_d = {}
        stemmed_tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                title = crop_to_space(d.title, self.max_title_length)
                body_sents = sent_tokenize(d.body)
                new_body_sents = self.resplit_body_sents(body_sents)
                text_d[d.doc_id] = title, new_body_sents
                for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)]:
                    title_tokens = tokenize_fn(title)
                    body_tokens_list = lmap(tokenize_fn, new_body_sents)
                    save_dict[d.doc_id] = (title_tokens, body_tokens_list)

        todo = [
            (text_d, self.text_dir_name),
            (bert_tokens_d, self.bert_tokens_dir_name),
            (stemmed_tokens_d, self.stemmed_tokens_dir_name),
        ]
        for tokens_d, dir_name in todo:
            save_path = os.path.join(self.out_dir, dir_name, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)

def summarize_score(info_dir, prediction_file) -> Dict[Tuple[str, str], float]:
    info = load_combine_info_jsons(info_dir, qckl_convert_map, False)
    print("Info has {} entries".format(len(info)))
    data: List[Dict] = join_prediction_with_info(prediction_file, info,
                                                 ["data_id", "logits"])

    def get_score(entry):
        return entry['logits']

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, get_qc_pair_id)
    print("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        # Each (query, candidate) pair is expected to have exactly one prediction.
        assert len(scores) == 1
        final_score = scores[0]
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    print("Num items per group : ", num_items_per_group)
    return out_d

def remove_duplicate(doc_id_list: List[str]) -> List[str]:
    docs_d: Dict[str, List[str]] = load_multiple(TokenizedCluewebDoc, doc_id_list, True)
    hashes = lmap(doc_hash, [
        docs_d[doc_id] if doc_id in docs_d else None for doc_id in doc_id_list
    ])
    # Use a set for constant-time membership checks while filtering.
    duplicate_indices = set(get_duplicate_list(hashes))
    non_duplicate = list([
        doc_id_list[i] for i in range(len(doc_id_list))
        if i not in duplicate_indices
    ])
    return non_duplicate

def combine_pc_train_info():
    st = 0
    ed = 606

    def load_file(i):
        pickle_path = os.path.join(sydney_working_dir, "pc_rel_tfrecord_info",
                                   "{}".format(i))
        with open(pickle_path, "rb") as f:
            return pickle.load(f)

    d_list = lmap(load_file, range(st, ed))
    combined_dict = merge_dict_list(d_list)
    save_to_pickle(combined_dict, "pc_rel_info_all")

def debug_clean_text():
    queries = load_queries(all_years)

    def convert_query(q):
        return trec_query_to_galago_query(q, KEYWORD_QUERY)

    new_queries = lmap(convert_query, queries)
    for q_old, q_new in zip(queries, new_queries):
        if q_new['text'] != q_old.keyword_query:
            print("before:", q_old.keyword_query)
            print("after:", q_new['text'])

def get_ukp_dev_sents(self, topic):
    loader = DataLoader(topic)
    data = loader.get_dev_data()
    tokenizer = get_tokenizer()

    def encode(e):
        sent, label = e
        tokens = tokenizer.tokenize(sent)
        return label, tokens

    label_sent_pairs = lmap(encode, data)
    return label_sent_pairs

def add_cls_to_parsed():
    d = load_from_pickle("webster_parsed")

    def add_cls(def_tokens):
        return ["[CLS]"] + def_tokens

    new_d = {}
    for word, def_list in d.items():
        new_def_list = lmap(add_cls, def_list)
        new_d[word.lower()] = new_def_list
    save_to_pickle(new_d, "webster_parsed_w_cls")

def show_num_mention():
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claims = get_claims_from_ids(lmap(lambda x: x['cid'], train))
    claim_d = claims_to_dict(claims)
    grouped = group_by(train, lambda x: x['cid'])
    for cid in grouped:
        print("Claim:", claim_d[cid])
        for dp in grouped[cid]:
            p_text = p_dict[dp['pid']]
            print(dp['label'], get_num_mention(dp), p_text)

def build_single_claim_lm(all_ranked_list, claim):
    candidate_k = 50
    claim_text, perspectives = get_perspective(claim, candidate_k)
    unigrams = get_relevant_unigrams(perspectives)
    cid = claim['cId']
    ranked_list = all_ranked_list.get(str(cid))
    doc_ids = [t[0] for t in ranked_list]
    preload_docs(doc_ids)
    preload_tf(doc_ids)
    docs = lmap(load_and_format_doc, doc_ids)
    lm_classifier = build_lm(docs, unigrams)
    return lm_classifier

def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50

        # Step 1: retrieve candidates with the claim text itself.
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)

        # Step 2: add gold perspectives that were not retrieved ("hard" candidates),
        # collecting their terms that do not appear in the claim.
        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)
                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1
        candidate_list.extend(hard_candidate)

        # Step 3: query again with the top mismatch terms (ranked by tf-idf)
        # and append any new perspectives.
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(" ".join(new_qterms), top_k)
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)
        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)

def scorer(query_p: Passage, candidate: List[Passage]) -> List[NamedNumber]:
    q_tf = basic_tf.get_tf(query_p)

    def do_score(candidate_p: Passage) -> NamedNumber:
        if candidate_p.text == query_p.text:
            return NamedNumber(-99, "equal")
        p_tf = basic_tf.get_tf(candidate_p)
        return bm25_module.score_inner(q_tf, p_tf)

    scores = lmap(do_score, candidate)
    return scores

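# Usage sketch (hypothetical): rank candidates against a query passage; the
# -99 sentinel pushes the query passage itself to the bottom of the ranking.
#   scores = scorer(query_p, candidates)
#   ranked = sorted(zip(candidates, scores), key=lambda x: float(x[1]), reverse=True)
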
def doc_score_predictions():
    passage_score_path = "output/cppnc/qknc_val"
    info = load_combine_info_jsons("output/cppnc/qknc_val.info", qk_convert_map)
    data = join_prediction_with_info(passage_score_path, info)
    grouped: Dict[str, List[Dict]] = group_by(data, lambda x: x['query'].query_id)

    def get_score_from_logit(logits):
        return scipy.special.softmax(logits)[1]

    for cid, passages in grouped.items():
        scores: List[float] = lmap(lambda d: get_score_from_logit(d['logits']),
                                   passages)
        yield cid, scores

def main():
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()
    bin_keys = ["< 0.05", "< 0.50", "< 0.95", "< 1"]

    def bin_fn(item: float):
        # Bucket a score (expected to lie in [0, 1]) into the bins above.
        if item > 0.95:
            return "< 1"
        elif item > 0.5:
            return "< 0.95"
        elif item > 0.05:
            return "< 0.50"
        else:
            return "< 0.05"

    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]

        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))

        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)
        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        head_row = [""] + bin_keys
        rows = [head_row]
        for cpid, things in pid_entries:
            histogram = BinHistogram(bin_fn)
            _, pid = cpid
            label = any([pid in pids for pids in gold_pids])
            label_str = bool_to_yn(label)
            base_score = baseline_score_d[pid]
            base_score_str = "{0:.2f}".format(base_score)
            scores: List[float] = lmap(lambda x: x['score'], things)
            foreach(histogram.add, scores)
            row = [label_str, base_score_str] + [
                str(histogram.counter[bin_key]) for bin_key in bin_keys
            ]
            rows.append(row)
        print_table(rows)
