def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()
    word_list_d: Dict = json.load(open(word_list_path, "r"))
    tokenizer = PCTokenizer()
    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms if t not in stopwords]
        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)
        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()
    base_p = max(bias_plus_word.values())
    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)
        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob
        for t, score in bias_plus_word.items():
            score_for_cid[t] += score
        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
def get_valid_terms():
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
def a_relevant(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1
        all_passages.extend(passages)
        entries.append((c, passages))
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)
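# The passage score above is an average per-token log-odds between the claim's
# smoothed language model and the background model. Assuming smooth() mixes the
# two the same way LMScorer below does, i.e. (1 - alpha) * P(t|claim) + alpha * P(t|bg):
#
#   log_odd(t) = log((1 - alpha) * P(t|claim) + alpha * P(t|bg)) - log P(t|bg)
#   score(p)   = (1 / |p|) * sum over t in p of log_odd(stem(t)), with stopwords scoring 0
#
# A passage scores positive exactly when its tokens are, on average, more likely
# under the claim LM than under the background LM, which is what the num_pos
# counters report.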
def build_lm(split) -> Iterable[RelevanceModel]:
    tokenizer = PCTokenizer()
    problems, candidate_pool_d = prepare_eval_data(split)
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    for query, problem in zip(payload, problems):
        source_text = problem.text1.text
        tokens = tokenizer.tokenize_stem(source_text)
        counter = tokens_to_freq(tokens)
        yield RelevanceModel(query.id.id, query.text, counter)
def count_df(passages: Iterable[Passage]) -> Counter:
    tokenizer = PCTokenizer()
    df = Counter()
    for p in passages:
        tokens = tokenizer.tokenize_stem(p.text)
        for term in set(tokens):
            df[term] += 1
    return df
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))
    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
class TokenizeForBM25Worker:
    def __init__(self, split, query_group, candidate_docs_d, out_dir):
        self.query_group = query_group
        self.tokenizer = PCTokenizer()
        self.candidate_docs_d = candidate_docs_d
        self.out_dir = out_dir
        self.ms_reader = MSMarcoDataReader(split)

    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            rel_docs = self.ms_reader.qrel[query_id]
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()
            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)
            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                print("{} of {} not found".format(len(tokens_d), len(target_docs)))
            save_path = os.path.join(self.out_dir, str(qid))
            pickle.dump(tokens_d, open(save_path, "wb"))
def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    # split -> claims
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    tokenizer = PCTokenizer()

    def get_candidates(c: Dict) -> Tuple[int, List[Dict]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            rationale = "es_rank={} , es_score={}".format(rank, _score)
            p_entry = {
                'cid': cid,
                'pid': _pid,
                'claim_text': claim_text,
                'perspective_text': _text,
                'p_tokens': tokenizer.tokenize_stem(_text),
                'rationale': rationale,
            }
            candidate_list.append(p_entry)
        return cid, candidate_list

    candidates: List[Tuple[int, List[Dict]]] = lmap(get_candidates, claims)
    return candidates
class Worker:
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")
        n_docs = len(self.doc_ids)
        # NOTE: splits the docs across jobs; the literal 5 presumably
        # corresponds to n_jobs.
        docs_per_job = int((n_docs + n_jobs) / 5)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir

    def work(self, job_id):
        doc_id_to_count = dict()
        st = job_id * self.docs_per_job
        ed = st + self.docs_per_job
        todo = self.doc_ids[st:ed]
        ticker = TimeEstimator(len(todo))
        for doc_id in todo:
            try:
                text = self.docs[doc_id]
                tokens = self.tokenizer.tokenize_stem(text)
                doc_id_to_count[doc_id] = Counter(tokens)
                ticker.tick()
            except KeyError as e:
                print(e)
                print("key error")
        save_path = os.path.join(self.out_dir, str(job_id))
        pickle.dump(doc_id_to_count, open(save_path, "wb"))
class BM25:
    def __init__(self, df, num_doc, avdl, k1=0.01, k2=100, b=0.6):
        self.core = BM25Bare(df, num_doc, avdl, k1, k2, b)
        self.tokenizer = PCTokenizer()

    def score(self, query, text) -> NamedNumber:
        q_terms = self.tokenizer.tokenize_stem(query)
        t_terms = self.tokenizer.tokenize_stem(text)
        q_tf = Counter(q_terms)
        t_tf = Counter(t_terms)
        return self.core.score_inner(q_tf, t_tf)

    def term_idf_factor(self, term):
        return self.core.term_idf_factor(term)

    def score_inner(self, q_tf, t_tf) -> NamedNumber:
        return self.core.score_inner(q_tf, t_tf)
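# A minimal usage sketch for the BM25 wrapper above. The corpus statistics here
# are placeholders: in this codebase a document-frequency Counter is pickled as
# "pc_df" by build_df (see below), which also prints the corresponding avdl.
def _bm25_usage_sketch():
    df = load_from_pickle("pc_df")  # Counter: term -> document frequency
    num_doc = 541                   # hypothetical corpus size
    avdl = 11.7                     # hypothetical average document length
    bm25 = BM25(df, num_doc, avdl)
    s = bm25.score("government should lower taxes",
                   "lowering taxes would stimulate the economy")
    print(s, s.name)                # NamedNumber carries a score rationale in .name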
def __init__(self, bm25_module, max_seq_length, include_title=False):
    self.max_seq_length = max_seq_length
    self.bm25_module = bm25_module
    pc_tokenizer = PCTokenizer()
    self.tokenize_stem = pc_tokenizer.tokenize_stem
    self.include_title = include_title
    bert_tokenizer = get_tokenizer()
    self.bert_tokenize = bert_tokenizer.tokenize
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)
def build_baseline_lms(claims):
    tokenizer = PCTokenizer()

    def get_claim_lm(claim):
        cid = claim["cId"]
        counter = tokens_to_freq(tokenizer.tokenize_stem(claim['text']))
        return ClaimLM(cid, claim['text'], counter)

    claim_lms = lmap(get_claim_lm, claims)
    return claim_lms
class LMScorer:
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        self.log_odd_d: Dict[str, Counter] = {k: Counter() for k in query_lms.keys()}
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()

    def score(self, query_id, raw_tokens) -> float:
        stemmed_tokens = self.filter_and_stem(raw_tokens)
        return self._get_score_from_stemmed_tokens(query_id, stemmed_tokens)

    def filter_and_stem(self, tokens):
        stemmed_tokens = []
        for t in tokens:
            if t in self.stopwords:
                continue
            try:
                stemmed_tokens.append(self.tokenizer.stemmer.stem(t))
            except UnicodeDecodeError:
                pass
        return stemmed_tokens

    def score_text(self, query_id, text):
        tokens = self.tokenizer.tokenize_stem(text)
        tokens = [t for t in tokens if t not in self.stopwords]
        return self._get_score_from_stemmed_tokens(query_id, tokens)

    def _get_score_from_stemmed_tokens(self, query_id, tokens) -> float:
        log_odd_d: Counter = self.log_odd_d[query_id]
        lm = self.query_lms[query_id]

        def get_score(token: str) -> float:
            # per-query memo of token log-odds
            if token in log_odd_d:
                return log_odd_d[token]
            if token in lm or token in self.bg_lm:
                prob_pos = lm[token] * (1 - self.alpha) + self.bg_lm[token] * self.alpha
                pos_log = math.log(prob_pos)
            else:
                pos_log = 0
            score = pos_log - self.log_bg_lm[token]
            log_odd_d[token] = score
            return score

        return average(lmap(get_score, tokens))
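# A minimal sketch of driving LMScorer, assuming the get_query_lms(split)
# defined below supplies the per-query language models:
def _lm_scorer_sketch(split):
    query_lms = get_query_lms(split)   # Dict[str, Counter]
    scorer = LMScorer(query_lms, alpha=0.5)
    qid = next(iter(query_lms))
    # score_text stems, drops stopwords, then averages the memoized per-token log-odds
    print(scorer.score_text(qid, "some candidate passage text"))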
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)

        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)
                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1
        candidate_list.extend(hard_candidate)
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(" ".join(new_qterms), top_k)
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)
        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
def run_lm2():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    tokenizer = PCTokenizer()
    tf_d = {c['cId']: Counter(nltk.tokenize.word_tokenize(c['text'])) for c in claims}
    bm25 = get_bm25_module()
    ctf = get_perspective_tf()
    pred = predict_by_lm(tf_d, ctf, bm25, claims, top_k)
    print(evaluate(pred))
def predict_by_bm25_rm(bm25_module: BM25,
                       rm_info: Dict[str, List[Tuple[str, str]]],
                       claims,
                       top_k) -> List[Tuple[str, List[Dict]]]:
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(parse_float, rm_info)
    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(normalize_scores, rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)
        nclaim_id = int(claim_id)
        # NOTE: rm_info is annotated with str keys; if that annotation is
        # accurate, this int-keyed membership test never succeeds and every
        # claim falls into not_found.
        if nclaim_id in rm_info:
            ex_qtf = rm_info_c[nclaim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
def get_query_lms(split) -> Dict[str, Counter]:
    evi_dict: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def get_evidence_texts(query: QCKQuery) -> List[str]:
        query_id = query.query_id
        e_ids: List[int] = evi_gold_dict[query_id]
        return list([evi_dict[eid] for eid in e_ids])

    def get_query_lm(query: QCKQuery) -> Counter:
        return text_list_to_lm(tokenizer, get_evidence_texts(query))

    lms = lmap(get_query_lm, queries)
    qids = lmap(QCKQuery.get_id, queries)
    query_lms: Dict[str, Counter] = dict(zip(qids, lms))
    return query_lms
def __init__(self, split, query_group, candidate_docs_d,
             max_sent_length, max_title_length, out_dir):
    self.query_group = query_group
    self.candidate_docs_d = candidate_docs_d
    self.out_dir = out_dir
    self.bert_tokenizer = get_tokenizer()
    self.stem_tokenizer = PCTokenizer()
    self.max_sent_length = max_sent_length
    self.max_title_length = max_title_length
    self.ms_reader = MSMarcoDataReader(split)
    self.text_dir_name = 'text'
    self.bert_tokens_dir_name = 'bert_tokens'
    self.stemmed_tokens_dir_name = 'stemmed_tokens'
    for name in [self.text_dir_name, self.bert_tokens_dir_name, self.stemmed_tokens_dir_name]:
        exist_or_mkdir(os.path.join(self.out_dir, name))
def build_gold_lms(claims) -> List[ClaimLM]:
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()

    def get_cluster_lm(cluster: List[int]) -> Counter:
        p_text_list: List[str] = lmap(perspective_getter, cluster)
        tokens_list: List[List[str]] = lmap(tokenizer.tokenize_stem, p_text_list)
        counter_list = lmap(tokens_to_freq, tokens_list)
        counter = average_counters(counter_list)
        return counter

    def get_claim_lm(claim) -> ClaimLM:
        cid = claim["cId"]
        counter_list: List[Counter] = lmap(get_cluster_lm, gold[cid])
        counter: Counter = average_counters(counter_list)
        return ClaimLM(cid, claim['text'], counter)

    claim_lms = lmap(get_claim_lm, claims)
    return claim_lms
def get_train_passage_a_lms():
    data = load_from_pickle("pc_train_a_passages")
    entries, all_passages = data
    voca = get_valid_terms()
    tokenizer = PCTokenizer()
    bg_tf = tokens_to_freq(flatten(left(all_passages)))
    bg_tf = simplify_tf(bg_tf, voca)
    alpha = 0.99   # smoothing with claim
    alpha2 = 0.3   # smoothing with collection documents
    r = []
    ticker = TimeEstimator(len(entries))
    for c, passages in entries:
        r_tf = passage_to_lm(tokenizer, c, passages, alpha)
        r_tf = simplify_tf(r_tf, voca)
        c_tf = smooth_ex(r_tf, bg_tf, alpha2)
        r.append(ClaimLM(c['cId'], c['text'], c_tf))
        ticker.tick()
    return r
def predict_by_lm(claim_lms: List[ClaimLM], claims, top_k) -> List[Tuple[str, List[Dict]]]:
    alpha = 0.1
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    tokenizer = PCTokenizer()
    print("Eval log odds")
    claim_log_odds_dict = {str(c_lm.cid): get_log_odd(c_lm, bg_lm, alpha)
                           for c_lm in claim_lms}

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        p_text = perspective_getter(int(p_id))
        tokens = tokenizer.tokenize_stem(p_text)
        c_lm = claim_log_odds_dict[claim_id]
        reason = " ".join(["{0} ({1:.2f})".format(t, c_lm[t]) for t in tokens])
        score = sum([c_lm[t] for t in tokens])
        return NamedNumber(score, reason)

    r = predict_interface(claims, top_k, scorer)
    return r
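# A sketch of how this ranker is wired up, assuming the build_gold_lms defined
# above and the evaluate() used by run_lm2:
def _predict_by_lm_sketch():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)   # one averaged LM per claim's gold clusters
    pred = predict_by_lm(claim_lms, claims, top_k=5)
    print(evaluate(pred))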
def build_df():
    claims, val = train_split()
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()
    df = Counter()
    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))
        for t in set(flatten(tokens_list)):
            df[t] += 1
    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)
    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)
    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenizer = PCTokenizer()
    bert_tokenizer = get_tokenizer()
    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = list([passage_id for passage_id, score
                                 in passage_qrels[qid].items() if score])
            success = False
            found_idx = -1
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenizer.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]
                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]
                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                print("Not found. doc_lines={} passage_len={}".format(
                    len(stemmed_body_tokens_list), len(passage_tokens)))
        except KeyError:
            pass
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")
    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)
        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)] for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))
        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])
        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " + score_line(random5_scores))
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)
        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))