def generate_instances(self, claim: Dict, data_id_manager: DataIDManager) -> List[Instance]:
    """Build labelled (claim text, passage) instances for one claim.

    The passages stored under the claim id in ``self.cid_to_passages`` are
    split with ``score_over_zero``: entries that satisfy it become label-1
    instances, the others become label-0 instances. Ten additional passages
    obtained from ``self.random_sample(cid)`` are added with label 0.

    Args:
        claim: Claim record; the ``'cId'`` and ``'text'`` keys are read.
        data_id_manager: Supplies one data id per instance through
            ``assign({'cid': cid})``.

    Returns:
        The label-1 instances, followed by the label-0 instances, followed
        by the instances built from the random passages.
    """
    cid = claim['cId']
    claim_text = claim['text']
    passages = self.cid_to_passages[cid]
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))
    # Drawn before any data id is assigned, so the order of calls into
    # random_sample / data_id_manager is the same as it always was.
    random_passages = [self.random_sample(cid) for _ in range(10)]

    def make_instance(passage, label):
        info = {'cid': cid}
        return Instance(claim_text, passage, label, data_id_manager.assign(info))

    l1 = lmap(lambda p: make_instance(p, 1), good_passages)
    l2 = lmap(lambda p: make_instance(p, 0), not_good_passages)
    l3 = lmap(lambda p: make_instance(p, 0), random_passages)
    print("g: ng : rand = {} : {} : {}".format(len(l1), len(l2), len(l3)))
    return l1 + l2 + l3
def generate_instances(self, claim: Dict, data_id_manager) -> List[PairedInstance]:
    """Pair every gold candidate text of a claim with passage pairs.

    Two kinds of passage pairs are prepared:

    * pairs from ``generate_pairwise_combinations(not_good, good, True)``,
      emitted with ``strict_good=1, strict_bad=0``;
    * ``(not_good_passage, self.random_sample(cid))`` pairs, emitted with
      ``strict_good=0, strict_bad=1``.

    The pairs are consumed round-robin (first kind, then second kind, then
    again from the start) and the position carries over from one candidate
    to the next. Each candidate receives at most
    ``1000 // len(candidates) + 1`` pairs, capped by the number of available
    pairs, which keeps the total near 1000 instances.

    Args:
        claim: Claim record; only ``'cId'`` is read.
        data_id_manager: Unused here; kept for interface compatibility.

    Returns:
        The generated instances; an empty list when the claim has no gold
        candidate texts (this used to raise ``ZeroDivisionError``).
    """
    cid = claim['cId']
    perspective_clusters: List[List[int]] = self.gold[cid]
    passages = self.cid_to_passages[cid]
    gold_candidate_texts: List[str] = flatten_map(perspective_getter, perspective_clusters)
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))

    # good vs not_good pairs
    pair_list_g_ng: List[Tuple[List[str], List[str]]] = generate_pairwise_combinations(
        not_good_passages, good_passages, True)
    # not_good vs random pairs
    pair_list_ng_rand: List[Tuple[List[str], List[str]]] = [
        (inst, self.random_sample(cid)) for inst in not_good_passages
    ]

    if not gold_candidate_texts:
        # Nothing to pair with; also avoids dividing by zero below.
        return []

    # (pair, strict_good, strict_bad) in the order they are cycled through.
    labeled_pairs = [(pair, 1, 0) for pair in pair_list_g_ng]
    labeled_pairs += [(pair, 0, 1) for pair in pair_list_ng_rand]

    max_insts = 100 * 2 * 5
    n_passage_per_inst = min(
        len(labeled_pairs),
        max_insts // len(gold_candidate_texts) + 1,
    )

    all_insts = []
    cursor = 0  # shared across candidates so the cycle is not restarted
    for candidate in gold_candidate_texts:
        for _ in range(n_passage_per_inst):
            passage_pair, strict_good, strict_bad = labeled_pairs[cursor % len(labeled_pairs)]
            cursor += 1
            passage_good, passage_worse = passage_pair
            all_insts.append(
                PairedInstance(passage_good, passage_worse, candidate, strict_good, strict_bad))
    return all_insts
def get_aawd_binary_train_dev():
    """Return ``(train_x, train_y, dev_x, dev_y)``, loading the splits once.

    The tuple is memoised in the module-level ``aawd_train_dev_preload``;
    later calls return the stored tuple without reloading. The test split
    returned by ``load_aawd_splits_as_binary`` is ignored.
    """
    global aawd_train_dev_preload
    if aawd_train_dev_preload is None:
        train, dev, _test = load_aawd_splits_as_binary()
        aawd_train_dev_preload = (
            left(train),
            right(train),
            left(dev),
            right(dev),
        )
    return aawd_train_dev_preload
def eval_map(split, score_d: Dict[CPIDPair, float], debug=False):
    """Evaluate ``score_d`` on the pre-computed candidates of ``split``.

    Only claims whose id occurs in the keys of ``score_d`` are evaluated.
    The number of evaluated claims and their ids are printed.

    Args:
        split: Passed to ``get_eval_candidates_from_pickle``.
        score_d: Scores keyed by (claim id, perspective id) pairs.
        debug: Forwarded to ``evaluate_map``.

    Returns:
        Whatever ``evaluate_map`` returns for the top-50 predictions.
    """
    # load pre-computed perspectives
    all_candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # restrict to claims for which scores are available
    scored_cids: Set[int] = set(left(score_d.keys()))

    def has_scores(entry):
        return entry[0] in scored_cids

    evaluated: List[Tuple[int, List[Dict]]] = lfilter(has_scores, all_candidates)
    print("{} claims are evaluated".format(len(evaluated)))
    print(left(evaluated))
    return evaluate_map(predict_from_dict(score_d, evaluated, 50), debug)
def get_ap_list_from_score_d(score_d, split):
    """Compute per-claim average precision for the claims that have scores.

    Args:
        score_d: Scores whose keys start with the claim id.
        split: Passed to ``get_eval_candidates_from_pickle``.

    Returns:
        ``(ap_list, cids)``: the output of ``get_average_precision_list`` and
        the first elements of the predictions it was computed from.
    """
    all_candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # restrict to claims for which scores are available
    scored_cids: Set[int] = set(left(score_d.keys()))

    def has_scores(entry):
        return entry[0] in scored_cids

    evaluated: List[Tuple[int, List[Dict]]] = lfilter(has_scores, all_candidates)
    print("{} claims are evaluated".format(len(evaluated)))
    predictions = predict_from_dict(score_d, evaluated, 50)
    cids = left(predictions)
    return get_average_precision_list(predictions, False), cids
def main(config):
    """Filter the pickled QK candidates down to well-scored documents.

    For every query, a knowledge document part (KDP) is kept only if some
    scored entry for that query with the same ``doc_id`` has a score strictly
    above ``config['k']``. The filtered candidates are pickled as
    ``"robust_on_clueweb_qk_candidate_filtered"``.

    Args:
        config: Reads ``'score_type'`` and ``'k'``; also passed to
            ``load_qk_score``.
    """
    qk_candidate: List[QKUnit] = load_from_pickle("robust_on_clueweb_qk_candidate")
    qk_out_entries: List[QKOutEntry] = load_qk_score(config)
    score_type = config['score_type']
    threshold = config['k']

    # query id -> doc ids that passed the threshold
    good_doc_ids_by_query = {}
    for query in left(qk_candidate):
        good_doc_ids_by_query[query.query_id] = set()

    for entry in qk_out_entries:
        if get_score_from_logit(score_type, entry.logits) > threshold:
            good_doc_ids_by_query[entry.query.query_id].add(entry.kdp.doc_id)

    stat_count = Counter()

    def filter_map(qk_unit: QKUnit):
        query, kdp_list = qk_unit
        allowed = good_doc_ids_by_query[query.query_id]
        kept = lfilter(lambda kdp: kdp.doc_id in allowed, kdp_list)
        print("{} -> {}".format(len(kdp_list), len(kept)))
        if not kept:
            stat_count["no kdp"] += 1
        return query, kept

    new_qk_candidate = lmap(filter_map, qk_candidate)
    print(stat_count)
    save_to_pickle(new_qk_candidate, "robust_on_clueweb_qk_candidate_filtered")
def get_stance_check_candidate(text: str, bm25_module: BM25):
    """Pair each usable sentence of ``text`` with its five top-scoring tokens.

    Sentences that are blank, or that (after stripping) start with a
    bracketed number of up to three digits or with one to five ``i``
    characters in brackets (e.g. ``[12]``, ``[iii]``), are skipped. A token's
    score is ``term_importance`` looked up by the token's stem.

    Args:
        text: Raw text, split with ``sent_tokenize_newline``.
        bm25_module: Provides ``tokenizer.stemmer`` and is passed to
            ``get_term_importance``.

    Returns:
        A list of ``(sentence, top_terms)`` tuples in sentence order.
    """
    sents = sent_tokenize_newline(text)
    term_importance = get_term_importance(bm25_module, sents)
    heading_pattern = r'^\[(\d{1,3}|i{1,5})\]'

    def per_token_score(token):
        stem = bm25_module.tokenizer.stemmer.stem(token)
        return term_importance[stem]

    candidates = []
    for sent in sents:
        stripped = sent.strip()
        if not stripped:
            continue
        if re.match(heading_pattern, stripped) is not None:
            continue

        unique_tokens = set(nltk.tokenize.word_tokenize(sent))
        scores: List[Tuple[str, float]] = lmap_pairing(per_token_score, unique_tokens)
        scores.sort(key=lambda pair: pair[1], reverse=True)
        candidates.append((sent, left(scores[:5])))
    return candidates
def generate_instances(self, claim: Dict, data_id_manager) -> List[Payload]:
    """Create one payload per (candidate perspective, passage) combination.

    When ``self.filter_good`` is truthy only passages that satisfy
    ``score_over_zero`` are used; otherwise every passage is used. A
    perspective is marked correct when its id appears in any cluster of
    ``self.gold[cid]``.

    Args:
        claim: Claim record; the ``'cId'`` and ``'text'`` keys are read.
        data_id_manager: Supplies a data id per payload through ``assign``
            with the cid, pid and passage index.

    Returns:
        Payloads ordered by perspective, then by passage index.
    """
    cid = claim['cId']
    claim_text = claim['text']
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]

    if self.filter_good:
        keep_passage = score_over_zero
    else:
        def keep_passage(dummy):
            return True

    selected_passages: List[List[str]] = left(lfilter(keep_passage, passages))

    output = []
    for pid in perspectives:
        is_correct = any(pid in cluster for cluster in self.gold[cid])
        for passage_idx, passage in enumerate(selected_passages):
            perspective = perspective_getter(pid)
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            data_id = data_id_manager.assign(info)
            output.append(Payload(passage, claim_text, perspective, data_id, is_correct))
    return output
def load_prediction(data: EstimatorPredictionViewer) -> List[Tuple[str, List[float]]]:
    """Group the class-1 probabilities of prediction entries by input key.

    Every entry contributes ``softmax(logits)[1]`` under the key derived
    from its ``'input_ids'`` tokens by ``input_tokens_to_key``.

    Args:
        data: Prediction viewer; its ``data_len`` is printed and its entries
            are iterated.

    Returns:
        ``(key, scores)`` tuples, one per key produced by
        ``unique_from_sorted`` over the entry keys.
    """
    print("prediction has {} entry".format(data.data_len))

    def parse_entry(entry) -> Tuple[str, float]:
        input_tokens: Segment = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(input_tokens), probs[1]

    parsed_data: List[Tuple[str, float]] = lmap(parse_entry, data)
    keys: List[str] = unique_from_sorted(left(parsed_data))
    grouped: Dict[str, List[Tuple[str, float]]] = group_by(parsed_data, lambda pair: pair[0])

    def fetch_scores(key):
        scores = []
        for entry_key, score in grouped[key]:
            # every member of a group must carry the group's key
            assert key == entry_key
            scores.append(score)
        return key, scores

    return lmap(fetch_scores, keys)
def extract_predictions(score_d, split):
    """Turn scores into binary decisions for the claims that have scores.

    Candidates for ``split`` are restricted to claims whose id occurs in the
    keys of ``score_d``. For each candidate perspective of such a claim the
    decision is 1 when its score is strictly greater than 0.5, else 0; a
    perspective without a score counts as score 0.

    Decisions are now produced only for the filtered claims. Previously the
    filtered list was computed and reported but the unfiltered list was
    mapped, unlike ``eval_map`` and ``get_ap_list_from_score_d``.

    Args:
        score_d: Mapping from ``CPIDPair((cid, pid))`` to a score.
        split: Passed to ``get_eval_candidates_from_pickle``.

    Returns:
        A list of ``(cid, [(cid, pid, binary), ...])`` tuples.
    """
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))

    def make_decisions(e: Tuple[int, List[Dict]]):
        cid, p_list = e
        decisions = []
        for p in p_list:
            pid = int(p['pid'])
            score = score_d.get(CPIDPair((cid, pid)), 0)
            binary = 1 if score > 0.5 else 0
            decisions.append((cid, pid, binary))
        return cid, decisions

    return lmap(make_decisions, sub_candidates)
def load_prediction(pred_path) -> List[Tuple[str, List[np.ndarray]]]:
    """Group the softmax outputs of a prediction file by input key.

    Every entry contributes ``softmax(logits)`` under the key derived from
    its ``'input_ids'`` tokens by ``input_tokens_to_key``.

    Args:
        pred_path: Passed to ``EstimatorPredictionViewer``.

    Returns:
        ``(key, probability_vectors)`` tuples, one per key produced by
        ``unique_from_sorted`` over the entry keys.
    """
    viewer = EstimatorPredictionViewer(pred_path)

    def parse_entry(entry) -> Tuple[str, np.ndarray]:
        input_tokens: Segment = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(input_tokens), probs

    parsed_data: List[Tuple[str, np.ndarray]] = lmap(parse_entry, viewer)
    keys: List[str] = unique_from_sorted(left(parsed_data))
    grouped: Dict[str, List[Tuple[str, np.ndarray]]] = group_by(parsed_data, lambda pair: pair[0])

    def fetch_scores(key):
        vectors = []
        for entry_key, probs in grouped[key]:
            # every member of a group must carry the group's key
            assert key == entry_key
            vectors.append(probs)
        return key, vectors

    return lmap(fetch_scores, keys)
def qk_candidate_gen(q_res_path: str, doc_score_path, split, config) -> List[Tuple[QCKQuery, List[KDP]]]:
    """Build (query, document parts) candidates from the best-scored documents.

    Job ids are the positions of the query ids in the sorted key list of the
    ranked list loaded from ``q_res_path``. For each query the score entries
    of its job are sorted in place by ``get_second`` (descending), the first
    ``config['top_n']`` document ids are preloaded and iterated, and the
    documents are cut into parts by ``iterate_document_parts``.

    Args:
        q_res_path: Ranked-list file; only its query ids are used.
        doc_score_path: Passed to ``load_doc_scores``.
        split: Selects the queries and the number of jobs.
        config: Reads ``'top_n'``, ``'window_size'`` and ``'step_size'``.

    Returns:
        One ``(query, doc_part_list)`` tuple per query of the split.
    """
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = sorted(ranked_list.keys())
    print("num queries", len(query_ids))
    q_id_to_job_id = {}
    for job_id, q_id in enumerate(query_ids):
        q_id_to_job_id[q_id] = job_id

    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []
    all_doc_parts = 0  # running total; tallied but not returned
    ticker = TimeEstimator(len(queries))
    for q in queries:
        entries: List = score_d[q_id_to_job_id[q.query_id]]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            iterate_docs(doc_ids),
            config['window_size'],
            config['step_size'],
            20,
        )
        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
def select_vertices_edges(counter) -> Tuple[Edges, List[Any]]:
    """Select frequent vertices and build normalised outgoing-edge weights.

    A vertex is valid when its count in ``get_vertices_info(counter)`` is
    above 100 and it passes ``is_not_funct``. Only pairs whose two ends are
    both valid contribute to the edges. For each source vertex the pair
    counts are divided by their sum, so the weights leaving a vertex add up
    to 1.

    Args:
        counter: Mapping from ``(a, b)`` pairs to counts.

    Returns:
        ``(Edges(edges), valid_vertices)`` where ``edges`` maps a vertex to a
        ``Counter`` of ``{neighbour: probability}`` and ``valid_vertices`` is
        ordered by decreasing vertex count.
    """
    def is_not_funct(word):
        # Anything longer than two characters is kept; shorter items are
        # dropped when they occur inside the punctuation string below.
        if len(word) > 2:
            return True
        return word not in ",.)(:'\"`-?''``,%"

    vertice_counter = get_vertices_info(counter)
    common_vertices = [(k, cnt) for k, cnt in vertice_counter.items() if cnt > 100]
    common_vertices.sort(key=lambda x: x[1], reverse=True)

    valid_vertices: List[Any] = lfilter(is_not_funct, left(common_vertices))
    # Set copy for O(1) membership; testing every pair against the list made
    # the filtering quadratic in practice.
    valid_vertex_set = set(valid_vertices)

    unnormalized_edges: Dict[Any, Dict] = {}
    for (a, b), cnt in counter.items():
        if a in valid_vertex_set and b in valid_vertex_set:
            if a not in unnormalized_edges:
                unnormalized_edges[a] = Counter()
            unnormalized_edges[a][b] += cnt

    edges = {}
    for vertex_a, raw_edges in unnormalized_edges.items():
        total = sum(raw_edges.values())
        local_edges = Counter()
        for vertex_b, cnt in raw_edges.items():
            local_edges[vertex_b] = cnt / total
        edges[vertex_a] = local_edges
    return Edges(edges), valid_vertices
def count_n_gram_grom_docs(docs, n, config, exclude_fn):
    """Count n-grams over all segments of ``docs`` with a one-time pruning.

    Once the table holds more than one million distinct n-grams, the 10,000
    most frequent ones are selected and, from then on, only those are
    counted. Entries already in the table are not removed.

    The post-pruning filter previously skipped the *selected* n-grams, which
    froze the frequent entries while the table kept growing; it now skips
    the ones that are not selected.

    Args:
        docs: Sized iterable of documents; each document is an iterable of
            segments.
        n: N-gram order passed to ``ngrams``.
        config: If it contains ``MERGE_SUBWORD``, segments are passed through
            ``merge_subword`` first.
        exclude_fn: N-grams for which this returns a truthy value are not
            counted.

    Returns:
        The ``Counter`` of n-gram counts.
    """
    count = Counter()
    tick = TimeEstimator(len(docs))
    top_k = 10000
    prune_threshold = 1000 * 1000
    selected_ngram = None  # set of surviving n-grams once pruning has run
    for doc_idx, doc in enumerate(docs):
        if doc_idx % 10000 == 0:
            print(doc_idx)
        tick.tick()
        for segment in doc:
            if MERGE_SUBWORD in config:
                segment = merge_subword(segment)
            assert isinstance(segment, list)
            for ngram_item in ngrams(segment, n):
                if selected_ngram is not None and ngram_item not in selected_ngram:
                    continue
                if exclude_fn(ngram_item):
                    continue
                count[ngram_item] += 1

        if selected_ngram is None and len(count) > prune_threshold:
            print("Performing pruning")
            selected_ngram = {ngram for ngram, _ in count.most_common(top_k)}
    return count
def generate_instances(self, claim: Dict, data_id_manager) -> List[Payload]:
    """Create one payload per candidate perspective, carrying all passages.

    A perspective is marked correct when its id appears in any cluster of
    ``self.gold[cid]``.

    Args:
        claim: Claim record; the ``'cId'`` and ``'text'`` keys are read.
        data_id_manager: Supplies a data id per payload through ``assign``
            with the cid and pid.

    Returns:
        Payloads in the order of ``self.candidate_perspective[cid]``.
    """
    cid = claim['cId']
    claim_text = claim['text']
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]

    def build_payload(pid):
        info = {'cid': cid, 'pid': pid}
        is_correct = any(pid in cluster for cluster in self.gold[cid])
        perspective = perspective_getter(pid)
        # left() is called per payload, as before, so payloads do not share
        # the same passage-list object.
        passage_list = left(passages)
        return Payload(
            passage_list,
            claim_text,
            perspective,
            data_id_manager.assign(info),
            is_correct,
        )

    return [build_payload(pid) for pid in perspectives]
def passage_to_lm(tokenizer, claim, passages: List[Tuple[List[str], float]], alpha):
    """Combine claim-term and passage-term frequencies with ``smooth_ex``.

    Args:
        tokenizer: Provides ``tokenize_stem``.
        claim: Claim record; ``'text'`` is read.
        passages: ``(tokens, score)`` pairs; only the token lists are used.
        alpha: Passed through to ``smooth_ex``.

    Returns:
        The result of ``smooth_ex(claim_tf, passage_tf, alpha)``.
    """
    claim_tokens = tokenizer.tokenize_stem(claim['text'])
    passage_tf = tokens_to_freq(flatten(left(passages)))
    claim_tf = tokens_to_freq(claim_tokens)
    return smooth_ex(claim_tf, passage_tf, alpha)
def predict(doc):
    """Return the sum of ``term_odd`` over the tokens of ``doc``."""
    tokens = tokenizer(doc)
    # The result is not used; the call is kept in case get_tf10 has side
    # effects - TODO confirm and drop.
    left(list(self.get_tf10(tokens)))
    return sum(lmap(term_odd, tokens))
def __init__(self, max_sequence, vocab_filename, voca_size):
    """Load scores, split topics and set up encoders and topic samplers.

    Args:
        max_sequence: Passed to ``EncoderUnit``.
        vocab_filename: File name under ``data_path``; it must exist.
        voca_size: Stored as ``self.voca_size``.
    """
    self.train_data = self.dev_data = self.test_data = None

    voca_path = os.path.join(data_path, vocab_filename)
    assert os.path.exists(voca_path)
    print(voca_path)

    self.mscore = read_mscore_valid()
    self.mscore_dict = dict(self.mscore)
    self.train_topics, self.dev_topics = self.held_out(left(self.mscore))

    self.lower_case = True
    self.sep_char = "#"
    self.encoder = FullTokenizerWarpper(voca_path)
    self.voca_size = voca_size
    self.dev_explain = None
    self.encoder_unit = EncoderUnit(max_sequence, voca_path)
    self.client = TextReaderClient()

    class UniformSampler:
        """Draws two distinct topics uniformly from the topic list."""

        def __init__(self, topics):
            self.sample_space = topics

        def sample(self):
            return random.sample(self.sample_space, 2)

    class BiasSampler:
        """Draws two topics from two different score groups.

        Topics are grouped by ``int(log(score + 1, 1.1))``; ``sample`` first
        picks two distinct groups and then one topic from each.
        """

        def __init__(self, topics, score_dict):
            def score2key(score):
                return int(math.log(score+1, 1.1))

            groups = dict()
            for topic in topics:
                groups.setdefault(score2key(score_dict[topic]), []).append(topic)
            self.sample_group = groups
            self.sample_space = list(groups.keys())

        # Sample from all group
        def sample(self):
            def pick_one(items):
                return items[random.randrange(len(items))]

            group_a, group_b = random.sample(self.sample_space, 2)
            first = pick_one(self.sample_group[group_a])
            second = pick_one(self.sample_group[group_b])
            return first, second

    self.train_sampler = BiasSampler(self.train_topics, self.mscore_dict)
    self.dev_sampler = BiasSampler(self.dev_topics, self.mscore_dict)
def select_paragraph(
        docs: Dict[str, List[List[str]]],
        clue12_13_df,
        claim_list: List[Dict],
        strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:
    """Pick the best-scoring paragraph of every document, per claim.

    For each claim the query terms are the re-tokenised words of the claim
    text. Every document of the claim is split with ``enum_paragraph`` and
    its single highest-scoring paragraph (by ``paragraph_scorer`` with the
    idf below) is kept.

    The idf of a term is ``log((cdf + 0.5) / (df + 0.5))`` with
    ``cdf = 50,000,000``. A term without a df entry gets idf 0 if it occurs
    in ``string.printable`` (a substring test), otherwise it is treated as
    ``df = 0`` instead of indexing the missing entry.

    Args:
        docs: Mapping from claim id to that claim's documents.
        clue12_13_df: Document-frequency table supporting ``in`` and
            indexing by term.
        claim_list: Claim records; ``'cId'`` and ``'text'`` are read.
        strategy: Accepted for compatibility; it is not read, the best
            paragraph per document is always selected.

    Returns:
        ``(claim_id, paragraphs)`` tuples in the iteration order of ``docs``.
    """
    claim_id_to_text: Dict[int, str] = {c['cId']: c['text'] for c in claim_list}
    cdf = 50 * 1000 * 1000
    not_found_set = set()

    def idf(term: str):
        if term in clue12_13_df:
            df = clue12_13_df[term]
        else:
            if term in string.printable:
                return 0
            not_found_set.add(term)
            df = 0
        return math.log((cdf + 0.5) / (df + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    # The loop variable no longer reuses the name of the `docs` parameter.
    for claim_id, claim_docs in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str], float]] = lmap_pairing(
                scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, claim_docs)))
        r.append((claim_id, left(selected)))
        ticker.tick()
    return r
def sample_kdps(qk_list: List[QKUnit]) -> List[QKUnit]:
    """Keep at most four randomly chosen KDPs for every query.

    The caller's KDP lists are left untouched: a copy is shuffled, whereas
    the lists inside ``qk_list`` used to be reordered in place.

    Args:
        qk_list: ``(query, kdp_list)`` pairs.

    Returns:
        A new list of ``(query, sampled_kdps)`` tuples in the same order.
    """
    n = 4

    def sample(l: List[KDP]):
        shuffled = list(l)
        random.shuffle(shuffled)
        return shuffled[:n]

    return [(query, sample(kdp_list)) for query, kdp_list in qk_list]
def get_scores(r: List[Tuple[int, int]]) -> Dict:
    """Compute accuracy, precision and recall from binary pairs.

    The first element of each pair is the one precision is measured against
    (the prediction); the second is the one recall is measured against (the
    gold label). Any ratio whose denominator is zero is reported as 0, so an
    empty input or an input without positives no longer raises
    ``ZeroDivisionError``.

    Args:
        r: ``(first, second)`` pairs of 0/1 values.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    tp = sum(1 for a, b in r if a == b == 1)
    tn = sum(1 for a, b in r if a == b == 0)
    total = len(r)
    predicted_pos = sum(a for a, _ in r)
    actual_pos = sum(b for _, b in r)

    accuracy = (tp + tn) / total if total != 0 else 0
    precision = tp / predicted_pos if predicted_pos != 0 else 0
    recall = tp / actual_pos if actual_pos != 0 else 0
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall}
def show(r: RelevanceModel):
    """Print a relevance model: its text, top term counts and log-odds terms.

    The log odds are the smoothed topic LM in log space minus the background
    LM in log space; the 30 highest and the 10 entries from
    ``least_common`` are printed.
    """
    def tab_joined(pairs):
        return "\t".join(left(pairs))

    print('----')
    print(r.text)
    log_topic_lm = get_lm_log(smooth(r.lm, bg_lm, alpha))
    log_bg_lm = get_lm_log(bg_lm)
    log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

    for term, freq in r.lm.most_common(50):
        print(term, freq)

    top_freq_line = tab_joined(r.lm.most_common(10))
    print("LM freq: ", top_freq_line)
    print(top_freq_line)

    print("Log odd top", tab_joined(log_odd.most_common(30)))
    print("Log odd bottom", tab_joined(least_common(log_odd, 10)))
def show(claim_lm: ClaimLM):
    """Print a claim LM: the claim, top term counts and log-odds terms.

    The log odds are the smoothed claim LM in log space minus the background
    LM in log space; the 30 highest and the 10 entries from
    ``least_common`` are printed.
    """
    def tab_joined(pairs):
        return "\t".join(left(pairs))

    print('----')
    print(claim_lm.claim)
    log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
    log_bg_lm = get_lm_log(bg_lm)
    log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

    for term, freq in claim_lm.LM.most_common(50):
        print(term, freq)

    top_freq_line = tab_joined(claim_lm.LM.most_common(10))
    print("LM freq: ", top_freq_line)
    print(top_freq_line)

    print("Log odd top", tab_joined(log_odd.most_common(30)))
    print("Log odd bottom", tab_joined(least_common(log_odd, 10)))
def guardian_generate(query):
    """Print the most common verbs, nouns and entities for a query.

    The titles and sentences of the 200 top-ranked articles are collected and
    shuffled; the first 10% of them are analysed with ``get_verb_nouns`` and
    ``get_entities``. The 100 most common items of each kind are printed
    with ``list_print``.

    Args:
        query: Passed to ``load_all_articles`` and ``load_ranking``.
    """
    # article id (first field of an entry) -> full entry
    article_d = {entry[0]: entry for entry in load_all_articles(query)}

    score_list = load_ranking(query)
    top_k = 200
    sents = []
    for article_id in get_top_ids(score_list, top_k):
        print(article_id)
        _, title, _short_id, text = article_d[article_id]
        sents += [title] + nltk.sent_tokenize(text)

    verbs_all, nouns_all, entities_all = Counter(), Counter(), Counter()
    print("POS tagging...")
    shuffle(sents)
    size_small = int(len(sents)*0.1)
    ticker = TimeEstimator(size_small)
    for sent in sents[:size_small]:
        verbs, nouns = get_verb_nouns(sent)
        nouns_all.update(nouns)
        verbs_all.update(verbs)
        entities_all.update(get_entities(sent))
        ticker.tick()

    v_top = left(verbs_all.most_common(100))
    n_top = left(nouns_all.most_common(100))
    en_top = left(entities_all.most_common(100))
    for heading, items in (("Verbs", v_top), ("Nouns", n_top), ("Entities", en_top)):
        print(heading)
        list_print(items, 10)
def high_idf_q_terms(self, q_tf, n_limit=10):
    """Return the ``n_limit`` query terms with the largest BM25 query weight.

    Args:
        q_tf: Mapping from query term to its frequency in the query.
        n_limit: Maximum number of terms to return.

    Returns:
        A set of terms; the weight of a term is
        ``BM25_3_q_weight(qf, self.df[term], total_doc)``.
    """
    total_doc = 11503029 + 100
    weight_by_term = Counter()
    for term in q_tf:
        weight_by_term[term] = BM25_3_q_weight(q_tf[term], self.df[term], total_doc)
    top_terms = left(weight_by_term.most_common(n_limit))
    return set(top_terms)
def load_passage_dict(todo, passage_qrels):
    """Fetch and pickle the passages judged with a truthy score for ``todo``.

    Args:
        todo: Pairs whose first element is a query id.
        passage_qrels: Mapping ``qid -> {passage_id: score}``.

    Returns:
        The dict from ``get_passage_dict``; it is also saved as
        ``"msmarco_passage_doc_analyze_passage_dict"``.
    """
    passage_ids_to_find = [
        passage_id
        for qid in left(todo)
        for passage_id, score in passage_qrels[qid].items()
        if score
    ]
    passage_dict = get_passage_dict(passage_ids_to_find)
    save_to_pickle(passage_dict, "msmarco_passage_doc_analyze_passage_dict")
    return passage_dict
def build_match_tree():
    """Build a ``MatchTree`` from the pickled selected words and pickle it.

    Reads ``"nli_dev_selected_words"``, adds the first element of every entry
    to the tree, and saves the tree as ``"match_tree_nli_dev"``.
    """
    match_tree = MatchTree()
    for seq in left(load_from_pickle("nli_dev_selected_words")):
        match_tree.add_seq(seq)
    save_to_pickle(match_tree, "match_tree_nli_dev")
def high_idf_q_terms(self, q_tf, n_limit=10):
    """Return the ``n_limit`` query terms with the largest BM25 query weight.

    The document frequency of a term is the length of its posting list from
    ``self.get_posting``.

    Args:
        q_tf: Mapping from query term to its frequency in the query.
        n_limit: Maximum number of terms to return.

    Returns:
        A set of terms.
    """
    total_doc = self.total_doc_n
    weight_by_term = Counter()
    for term in q_tf:
        doc_freq = len(self.get_posting(term))
        weight_by_term[term] = BM25_3_q_weight(q_tf[term], doc_freq, total_doc)
    top_terms = left(weight_by_term.most_common(n_limit))
    return set(top_terms)
def generate_instances(
        self, claim: Dict, data_id_manager: DataIDManager) -> List[PairedInstance]:
    """Build paired-passage instances for one claim.

    Passages of the claim are split with ``score_over_zero`` into good and
    not-good ones. Three groups of passage pairs are produced:

    * pairs from ``generate_pairwise_combinations(not_good, good, True)``,
      emitted with ``(strict_good, strict_bad) = (1, 0)``;
    * ``(not_good, random)`` pairs, one per not-good passage, with ``(0, 1)``;
    * ``(good, random)`` pairs, one per good passage, with ``(1, 1)``.

    Args:
        claim: Claim record; the ``'cId'`` and ``'text'`` keys are read.
        data_id_manager: Supplies one data id per instance through
            ``assign({'cid': cid})``.

    Returns:
        The three groups concatenated in the order listed above.
    """
    cid = claim['cId']
    claim_text = claim['text']
    passages = self.cid_to_passages[cid]
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))

    pair_list_g_ng: List[Tuple[List[str], List[str]]] = generate_pairwise_combinations(
        not_good_passages, good_passages, True)
    # len(pair_list_g_rand) == number of good passages
    pair_list_g_rand: List[Tuple[List[str], List[str]]] = [
        (inst, self.random_sample(cid)) for inst in good_passages
    ]
    # len(pair_list_ng_rand) == number of not-good passages
    pair_list_ng_rand: List[Tuple[List[str], List[str]]] = [
        (inst, self.random_sample(cid)) for inst in not_good_passages
    ]

    def make_instance(passage_pair, strict_good, strict_bad):
        passage_good, passage_worse = passage_pair
        info = {'cid': cid}
        return PairedInstance(claim_text, passage_good, passage_worse,
                              strict_good, strict_bad,
                              data_id_manager.assign(info))

    l1 = lmap(lambda pair: make_instance(pair, 1, 0), pair_list_g_ng)
    l2 = lmap(lambda pair: make_instance(pair, 0, 1), pair_list_ng_rand)
    l3 = lmap(lambda pair: make_instance(pair, 1, 1), pair_list_g_rand)
    print("g-ng : ng-rank : g-rand = {} : {} : {}".format(
        len(l1), len(l2), len(l3)))
    return l1 + l2 + l3
def load_n_1_gram_set(topic, n):
    """Return the most frequent (n-1)-grams of ``topic`` as a set.

    For ``n == 1`` there is no lower order and the result is an empty set.
    Otherwise the (n-1)-gram counts are loaded and the
    ``10000 * 100 ** (n - 1)`` most frequent items are returned. The top
    item is printed when there is one; an empty count table used to raise
    ``IndexError`` at that print.

    Args:
        topic: Passed to ``load_n_gram_from_pickle``.
        n: Order of the n-grams being built; counts of order ``n - 1`` are
            loaded.

    Returns:
        A set of n-gram keys.
    """
    if n == 1:
        return set()

    count = load_n_gram_from_pickle(topic, n - 1)
    ranked = sorted(count.items(), key=lambda x: x[1], reverse=True)
    # max() keeps the exponent non-negative, matching the old multiply loop,
    # which simply did not run for n < 1.
    top_k = 10000 * 100 ** max(n - 1, 0)
    if ranked:
        print(ranked[0])
    return {ngram for ngram, _ in ranked[:top_k]}