class TokenizeForBM25Worker:
    def __init__(self, split, query_group, candidate_docs_d, out_dir):
        self.query_group = query_group
        self.tokenizer = PCTokenizer()
        self.candidate_docs_d = candidate_docs_d
        self.out_dir = out_dir
        self.ms_reader = MSMarcoDataReader(split)

    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            # Track whether a missing document was relevant or non-relevant.
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()
            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                print("{} of {} not found".format(len(tokens_d), len(target_docs)))

            save_path = os.path.join(self.out_dir, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
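# A hedged sketch of how the per-query pickle written by work() above could be read
# back: each file maps doc_id -> (stemmed title tokens, list of per-sentence
# term-frequency Counters). The function name is illustrative, not part of the codebase.
import os
import pickle

def load_tokens_for_query(out_dir, qid):
    save_path = os.path.join(out_dir, str(qid))
    with open(save_path, "rb") as f:
        return pickle.load(f)  # Dict[doc_id, (title_tokens, body_tf_list)]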
def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()
    with open(word_list_path, "r") as f:
        word_list_d: Dict = json.load(f)
    tokenizer = PCTokenizer()
    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms if t not in stopwords]
        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)

        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()
    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)
        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob
        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
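# The final step above depends on normalize_counter_to_sum1, which is defined elsewhere
# in the codebase. A minimal sketch of the assumed behavior (divide every value by the
# total mass so the per-claim scores form a probability distribution):
from collections import Counter

def normalize_counter_to_sum1_sketch(counter: Counter) -> Counter:
    total = sum(counter.values())
    if total == 0:
        return Counter(counter)
    return Counter({k: v / total for k, v in counter.items()})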
class Worker:
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")
        n_docs = len(self.doc_ids)
        # Split the doc ids across jobs, rounding up so every doc is covered.
        docs_per_job = int((n_docs + n_jobs) / n_jobs)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir

    def work(self, job_id):
        doc_id_to_count = dict()
        st = job_id * self.docs_per_job
        ed = st + self.docs_per_job
        todo = self.doc_ids[st:ed]
        ticker = TimeEstimator(len(todo))
        for doc_id in todo:
            try:
                text = self.docs[doc_id]
                tokens = self.tokenizer.tokenize_stem(text)
                counter = Counter(tokens)
                doc_id_to_count[doc_id] = counter
                ticker.tick()
            except KeyError as e:
                print(e)
                print("key error")

        save_path = os.path.join(self.out_dir, str(job_id))
        with open(save_path, "wb") as f:
            pickle.dump(doc_id_to_count, f)
def get_valid_terms():
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
class BM25:
    def __init__(self, df, num_doc, avdl, k1=0.01, k2=100, b=0.6):
        self.core = BM25Bare(df, num_doc, avdl, k1, k2, b)
        self.tokenizer = PCTokenizer()

    def score(self, query, text) -> NamedNumber:
        q_terms = self.tokenizer.tokenize_stem(query)
        t_terms = self.tokenizer.tokenize_stem(text)
        q_tf = Counter(q_terms)
        t_tf = Counter(t_terms)
        return self.core.score_inner(q_tf, t_tf)

    def term_idf_factor(self, term):
        return self.core.term_idf_factor(term)

    def score_inner(self, q_tf, t_tf) -> NamedNumber:
        return self.core.score_inner(q_tf, t_tf)
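# Minimal usage sketch for the BM25 wrapper above. The document-frequency table,
# collection size, and average document length below are made-up placeholders;
# BM25Bare and PCTokenizer come from the surrounding codebase.
from collections import Counter

def bm25_usage_sketch():
    df = Counter({"immigr": 120, "reform": 300, "economi": 450})  # stemmed-term document frequencies
    num_doc = 10000  # documents in the collection
    avdl = 250       # average document length in tokens
    bm25 = BM25(df, num_doc, avdl)
    score = bm25.score("immigration reform", "The bill proposes broad immigration reform.")
    print(score)     # NamedNumber holding the query-document score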
def a_relevant(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} positive passages over {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)
def count_df(passages: Iterable[Passage]) -> Counter:
    tokenizer = PCTokenizer()
    df = Counter()
    for p in passages:
        tokens = tokenizer.tokenize_stem(p.text)
        for term in set(tokens):
            df[term] += 1
    return df
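# A hedged follow-up: one way the document-frequency counter above could be turned
# into idf weights. The smoothing form is the standard BM25 idf, not necessarily the
# one used by BM25Bare in this codebase.
import math
from collections import Counter

def idf_from_df(df: Counter, num_passages: int) -> dict:
    return {term: math.log((num_passages - cnt + 0.5) / (cnt + 0.5))
            for term, cnt in df.items()}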
def build_lm(split) -> Iterable[RelevanceModel]:
    tokenizer = PCTokenizer()
    problems, candidate_pool_d = prepare_eval_data(split)
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    for query, problem in zip(payload, problems):
        source_text = problem.text1.text
        tokens = tokenizer.tokenize_stem(source_text)
        counter = tokens_to_freq(tokens)
        yield RelevanceModel(query.id.id, query.text, counter)
class LMScorer:
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        # Per-query cache of log-odds scores for tokens already seen.
        self.log_odd_d: Dict[str, Counter] = {k: Counter() for k in query_lms.keys()}
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()

    def score(self, query_id, raw_tokens) -> float:
        stemmed_tokens = self.filter_and_stem(raw_tokens)
        return self._get_score_from_stemmed_tokens(query_id, stemmed_tokens)

    def filter_and_stem(self, tokens):
        stemmed_tokens = []
        for t in tokens:
            if t in self.stopwords:
                continue
            try:
                stemmed_tokens.append(self.tokenizer.stemmer.stem(t))
            except UnicodeDecodeError:
                pass
        return stemmed_tokens

    def score_text(self, query_id, text):
        tokens = self.tokenizer.tokenize_stem(text)
        tokens = [t for t in tokens if t not in self.stopwords]
        return self._get_score_from_stemmed_tokens(query_id, tokens)

    def _get_score_from_stemmed_tokens(self, query_id, tokens) -> float:
        log_odd_d: Counter = self.log_odd_d[query_id]
        lm = self.query_lms[query_id]

        def get_score(token: str) -> float:
            if token in log_odd_d:
                return log_odd_d[token]
            if token in lm or token in self.bg_lm:
                prob_pos = lm[token] * (1 - self.alpha) + self.bg_lm[token] * self.alpha
                pos_log = math.log(prob_pos)
            else:
                pos_log = 0
            score = pos_log - self.log_bg_lm[token]
            log_odd_d[token] = score
            return score

        return average(lmap(get_score, tokens))
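# Minimal usage sketch for LMScorer. The query language models here are toy unigram
# distributions; in the real pipeline they come from helpers such as build_gold_lms
# or build_lm above, and load_stopwords_for_query / PCTokenizer must be importable.
from collections import Counter

def lm_scorer_usage_sketch():
    query_lms = {
        "q1": Counter({"immigr": 0.5, "reform": 0.5}),
        "q2": Counter({"climat": 0.6, "chang": 0.4}),
    }
    scorer = LMScorer(query_lms, alpha=0.5)
    print(scorer.score_text("q1", "Immigration reform is widely debated."))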
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)
    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)
    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError(qid)
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError(qid)

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenizer = PCTokenizer()
    bert_tokenizer = get_tokenizer()
    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = [passage_id for passage_id, score in passage_qrels[qid].items() if score]
            success = False
            found_idx = -1
            passage_tokens = []
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenizer.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]
                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]
                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx

            if not success:
                print("Not found. doc_lines={} passage_len={}".format(
                    len(stemmed_body_tokens_list), len(passage_tokens)))
        except KeyError:
            pass
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")
    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)] for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])
        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " + score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue

        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} positive passages over {} claims".format(len(claims), num_pos_sum, num_pos_exists))