def main(dir_path):
    """Balance positive/negative records found under ``dir_path``.

    Reads record shards named 0..664, buckets each record by its
    ``label_ids`` value (0 = negative, 1 = positive), downsamples the
    majority class to the minority size after shuffling, and writes the
    balanced set to ``<dir_path>/all_balanced``.
    """
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    # Indexable by label: all_insts[0] -> negatives, all_insts[1] -> positives.
    all_insts = [neg_insts, pos_insts]
    # NOTE(review): shard count 665 is hard-coded — confirm it matches the data layout.
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)
    random.shuffle(pos_insts)
    random.shuffle(neg_insts)
    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))
    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    # Fix: close the writer so the output file is flushed, matching the other
    # record-writing functions in this project (which all call writer.close()).
    writer.close()
def build_and_show():
    """Build gold claim LMs for train and print, for the first 10 claims,
    the most frequent LM terms plus the top/bottom log-odds terms against
    the averaged background LM."""
    claim_lms = build_gold_claim_lm_train()
    alpha = 0.1  # smoothing weight toward the background LM
    bg_lm = average_counters(lmap(lambda c: c.LM, claim_lms))

    def show(claim_lm: ClaimLM):
        print('----')
        print(claim_lm.claim)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        for term, prob in claim_lm.LM.most_common(50):
            print(term, prob)
        top_freq = "\t".join(left(claim_lm.LM.most_common(10)))
        print("LM freq: ", top_freq)
        # The original printed this line twice; kept for identical output.
        print(top_freq)
        print("Log odd top", "\t".join(left(log_odd.most_common(30))))
        print("Log odd bottom", "\t".join(left(least_common(log_odd, 10))))

    for claim_lm in claim_lms[:10]:
        show(claim_lm)
def debug_failture(predictions):
    """Print diagnostics for claims whose ranked perspectives are imperfect.

    Skips claims where every prediction is wrong (nothing useful to show)
    and claims with AP > 0.9 (already good). Returns ``{'map': <mean AP>}``
    computed over the claims that were actually displayed.

    Note: the misspelled name "failture" is kept — it is the public interface.
    """
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set, prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)
        if not any(correctness_list):  # all wrong
            continue
        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct_str = "Y" if pid in gold_pids_set else "N"
            score = prediction['score']
            # NOTE(review): `score.name` implies score is not a plain float;
            # presumably a named score object — confirm its type.
            print(correct_str, score, score.name, prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)
    # Fix: renamed from `map`, which shadowed the builtin.
    mean_ap = average(ap_list)
    return {'map': mean_ap}
def write_records(records: List[Payload], max_seq_length, output_path):
    """Encode each Payload as a BERT-style feature dict and write a record file."""
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        # Re-tokenize pre-split tokens into wordpieces, preserving order.
        pieces: List[str] = []
        for token in tokens:
            pieces.extend(tokenizer.tokenize(token))
        return pieces

    def encode(inst: Payload) -> OrderedDict:
        seg1: List[str] = tokenizer.tokenize(inst.candidate_text)
        # Reserve room for [CLS] and two [SEP] tokens.
        seg2_budget = max_seq_length - 3 - len(seg1)
        seg2 = tokenize_from_tokens(inst.passage)[:seg2_budget]
        tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    writer = RecordWriterWrap(output_path)
    for record in records:
        writer.write_feature(encode(record))
    writer.close()
def claim_language_model_property():
    """Inspect LM-based scoring of candidate perspectives for dev claims.

    For each dev claim: fetch candidate perspectives, load the claim's
    ranked documents, build a unigram LM classifier from them, then print
    each perspective text with its summed per-token log-odds score.
    """
    dev_claim_ids = load_dev_claim_ids()
    claims = get_claims_from_ids(dev_claim_ids)
    all_ranked_list = ClaimRankedList()
    all_voca = set()  # union of token sets over all loaded documents
    candidate_k = 50  # number of candidate perspectives per claim
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, candidate_k)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        ranked_list = all_ranked_list.get(str(claim['cId']))
        doc_ids = [t[0] for t in ranked_list]  # first tuple element is the doc id
        print("Loading documents")
        preload_tf(doc_ids)  # warm the term-frequency cache before loading docs
        docs = lmap(load_and_format_doc, doc_ids)
        foreach(lambda doc: all_voca.update(doc['tokens_set']), docs)
        # check hypothesis (kept disabled in the original)
        # check_hypothesis(all_voca, cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf, ctf_cont,
        #                  ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont, unigrams)
        print("counting terms stat")
        lm_classifier = build_lm(docs, unigrams)
        for p_entry in perspectives:
            _text, _pid, _score = p_entry
            tokens = nltk.word_tokenize(_text)
            # Perspective score = sum of per-token log-odds under the claim LM.
            score = sum(lmap(lm_classifier.per_token_odd, tokens))
            print(_text, score)
def read_doc_list(st, ed):
    """Collect the set of doc ids referenced by saved query results for
    training data points in the slice [st, ed)."""
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    all_data_points = load_train_data_point()
    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    todo = all_data_points[st:ed]
    qid_list = lmap(dp_to_qid, todo)
    doc_list = set()
    ticker = TimeEstimator(len(qid_list))
    print("parsing_doc_list")
    for query_id in qid_list:
        result_key = "{}_{}".format(query_id, q_config_id)
        # Tick regardless of whether a saved result exists, as the original did.
        ticker.tick()
        if has_key(QueryResult, result_key):
            entries: List[SimpleRankedListEntry] = load(QueryResult, result_key)
            for doc_id, _rank, _score in entries:
                doc_list.add(doc_id)
    return doc_list
def work(st, ed):
    """Run pending document queries for training data points [st, ed) and
    cache the ranked lists in the QueryResult DB."""
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    ci = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()
    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    num_request = 10000
    todo = all_data_points[st:ed]
    # Only issue queries for data points not already in the DB.
    not_done = lfilter(partial(db_not_contains, q_config_id), todo)
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, not_done)
    print("Executing {} queries".format(len(queries)))
    # NOTE(review): 600 is presumably a timeout in seconds — confirm against
    # send_doc_queries' signature.
    ranked_list_dict: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(ci.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, not_done)
    print("{} of {} succeed".format(len(ranked_list_dict), len(queries)))
    def add_to_db(query_id: str):
        # Save only queries that returned results, and only if not already stored.
        if query_id in ranked_list_dict:
            r = ranked_list_dict[query_id]
            q_res_id: str = "{}_{}".format(query_id, q_config_id)
            if not has_key(QueryResult, q_res_id):
                save(QueryResult, q_res_id, r)
    foreach(add_to_db, qid_list)
    flush()
def write_records(records: List[Record], max_seq_length, output_path):
    """Encode records with per-token labels/masks and write them to a record file."""
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        claim_len = len(record.claim_tokens) + 2  # [CLS] claim [SEP]
        doc_len = len(record.doc_tokens) + 1      # doc [SEP]
        tokens = ["[CLS]"] + record.claim_tokens + ["[SEP]"] \
                 + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * claim_len + [1] * doc_len
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        # Per-token regression labels: zeros over the claim span, then scores,
        # zero-padded up to max_seq_length (mask built the same way).
        # NOTE(review): if claim_len + len(scores) exceeds max_seq_length the
        # padding expression is a no-op and labels stay over-length — confirm
        # inputs are pre-truncated.
        labels = [0.] * claim_len + record.scores
        labels += [0.] * (max_seq_length - len(labels))
        label_mask = [0] * claim_len + record.valid_mask
        label_mask += [0] * (max_seq_length - len(label_mask))
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    for record in records:
        writer.write_feature(encode(record))
    writer.close()
def get_relevant_unigrams(perspectives):
    """Return the set of lowercased word tokens appearing in any perspective text."""
    unigrams = set()
    for _text, _pid, _score in perspectives:
        unigrams.update(lower_all(nltk.word_tokenize(_text)))
    return unigrams
def main():
    """Print gold vs. false-positive evidence entries for each query in the
    ranked-list file given as sys.argv[1], skipping already-good queries."""
    claim_text_d: Dict[int, str] = get_all_claim_d()
    # Re-key by string so claim ids parsed from query strings index directly.
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")
    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))
    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()
        # Query ids look like "<claim_id>_<perspective_id>".
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids  # redundant after the check above; kept as-is
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))
        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                # Only the top-ranked non-gold entries are shown as false positives.
                neg_entries.append(entry)
        if not pos_entries:
            print("gold not in ranked list")
            continue
        # Precision at R, where R is the number of relevant (gold) entries found.
        num_rel = len(pos_entries)
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))
        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))
        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
def main():
    """Dump all true claim pairs to true_pairs_all.csv under alamri_pilot."""
    exist_or_mkdir(os.path.join(output_path, "alamri_tfrecord"))
    # NOTE(review): data_id_manager appears unused here — confirm whether
    # DataIDManager() has required side effects before removing it.
    data_id_manager = DataIDManager()
    entries = []
    for claim1, claim2 in enum_true_instance():
        entries.append((claim1.text, claim2.text))
    save_path = at_output_dir("alamri_pilot", "true_pairs_all.csv")
    # Fix: use a context manager so the CSV file is flushed and closed
    # (the original leaked the file handle).
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        csv_writer = csv.writer(out_f)
        foreach(csv_writer.writerow, entries)
def work(self, job_id):
    """Convert one job's pickled ParagraphClaimPersFeature list into a record file."""
    in_path = os.path.join(self.input_dir, str(job_id))
    # Fix: close the input file after unpickling (the original leaked the handle).
    with open(in_path, "rb") as in_f:
        features: List[ParagraphClaimPersFeature] = pickle.load(in_f)
    writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
    for f in features:
        f2: ParagraphFeature = to_paragraph_feature(f)
        encoded_list: List[OrderedDict] = format_paragraph_features(
            self.tokenizer, self.max_seq_length, f2)
        foreach(writer.write_feature, encoded_list)
    writer.close()
def main():
    """Print the first 3 true claim pairs and save them as the pilot CSV."""
    save_path = at_output_dir("alamri_pilot", "pilot_pairs.csv")
    entries = []
    for claim1, claim2 in enum_true_instance(3):
        print("--")
        print("{}".format(claim1.text))
        print("{}".format(claim2.text))
        entries.append((claim1.text, claim2.text))
    # Fix: use a context manager so the CSV file is flushed and closed
    # (the original leaked the file handle).
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        csv_writer = csv.writer(out_f)
        foreach(csv_writer.writerow, entries)
def main():
    """For each claim, print a table binning per-passage scores of each
    perspective into score ranges, alongside its gold label and baseline score."""
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()
    bin_keys = ["< 0.05", "< 0.50", "< 0.95", "< 1"]
    def bin_fn(item: float):
        # Map a score in [0, 1] to its histogram bucket label.
        if item > 0.95:
            return "< 1"
        elif item > 0.5:
            return "< 0.95"
        elif item > 0.05:
            return "< 0.50"
        else:
            return "< 0.05"
    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]
        # Baseline has exactly one score per perspective id.
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score
        gold_pids = gold[cid]
        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            # Sort key: mean score over a perspective's entries.
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))
        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)
        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        head_row = [""] + bin_keys
        rows = [head_row]
        for cpid, things in pid_entries:
            histogram = BinHistogram(bin_fn)
            _, pid = cpid
            # A perspective is correct if it belongs to any gold cluster.
            label = any([pid in pids for pids in gold_pids])
            label_str = bool_to_yn(label)
            base_score = baseline_score_d[pid]
            base_score_str = "{0:.2f}".format(base_score)
            scores: List[float] = lmap(lambda x: (x['score']), things)
            foreach(histogram.add, scores)
            row = [label_str, base_score_str] + [str(histogram.counter[bin_key]) for bin_key in bin_keys]
            rows.append(row)
        print_table(rows)
def do_join_and_write(doc_list: Iterable, save_name):
    """Write the doc ids from ``doc_list`` that are not yet in the DB to
    ``<output_path>/doc_list/<save_name>``, one id per line."""
    doc_id_in_db: Set = get_docs_in_db(save_name)
    # Fix: materialize once so len() works for any Iterable (the original
    # called len() directly on the parameter, which fails for generators).
    doc_list = list(doc_list)
    print("doc_list", len(doc_list))
    print("Num doc in db", len(doc_id_in_db))
    doc_list_to_fetch = set(doc_list) - doc_id_in_db
    print("doc_list_to_fetch", len(doc_list_to_fetch))
    exist_or_mkdir(os.path.join(output_path, "doc_list"))
    save_path = os.path.join(output_path, "doc_list", save_name)
    # Fix: context manager instead of manual open/close; plain loop instead of
    # a lambda bound to a name (PEP 8 E731).
    with open(save_path, "w") as f:
        for doc_id in doc_list_to_fetch:
            f.write("{}\n".format(doc_id))
def encode_label_and_token_pair(topic_tokens, label, tokens_labeled, tokens_unlabeled, swap):
    """Build BERT-style (tokens, segment_ids) for a labeled/unlabeled pair.

    Layout: [CLS] topic label_a sent_a [SEP] topic label_b sent_b [SEP],
    with segment id 0 for the first half and 1 for the second. When ``swap``
    is set, the unlabeled sentence comes first and the known label token is
    attached to the second sentence instead.
    """
    known_label = get_label_token(label)
    unk_label = get_unk_label_token()
    if swap:
        first_sent, first_label = tokens_unlabeled, unk_label
        second_sent, second_label = tokens_labeled, known_label
    else:
        first_sent, first_label = tokens_labeled, known_label
        second_sent, second_label = tokens_unlabeled, unk_label

    tokens = []
    segment_ids = []

    def extend(segment_type, items):
        for item in items:
            tokens.append(item)
            segment_ids.append(segment_type)

    extend(0, ["[CLS]"])
    extend(0, topic_tokens)
    extend(0, [first_label])
    extend(0, first_sent)
    extend(0, ["[SEP]"])
    extend(1, topic_tokens)
    extend(1, [second_label])
    extend(1, second_sent)
    extend(1, ["[SEP]"])
    return tokens, segment_ids
def build_co_occurrence(list_tokens: List[List[str]], window_size, stemmer: CacheStemmer) -> Counter:
    """Count windowed term co-occurrences over stemmed, stopword-filtered token lists."""
    stopword = load_stopwords()
    counter = Counter()
    for tokens in list_tokens:
        # Stem first, then drop stopwords — the same order as the original
        # staged pipeline, just applied per list.
        stemmed = stemmer.stem_list(tokens)
        filtered = [t for t in stemmed if t not in stopword]
        count_co_ocurrence(window_size, counter, filtered)
    return counter
def test_generative_model():
    """Train the generative LM classifier and report validation accuracy."""
    train, val = load_feature_and_split()
    print("Training lm")
    classifier = learn_lm(train)
    # NOTE(review): stopwords are stripped from `train` only AFTER training,
    # and never from `val` — confirm this ordering is intentional.
    stopwords = load_stopwords()
    for data_point in train:
        remove_stopword_and_punct(stopwords, data_point['feature'])

    def is_correct(data_point: Dict):
        features = data_point['feature']
        gold = int(data_point['label'])
        return classifier.predict(features) == gold

    correctness = [is_correct(dp) for dp in val]
    print("val acc: ", average(correctness))
def work(self, job_id):
    """Convert one job's pickled features into retrieval-format records,
    writing the records plus a per-job info dict pickle."""
    in_path = os.path.join(self.input_dir, str(job_id))
    # Fix: close the input file after unpickling (the original leaked the handle).
    with open(in_path, "rb") as in_f:
        features: List[ParagraphClaimPersFeature] = pickle.load(in_f)
    info_d_all = {}
    # Reserve a disjoint id range per job so data ids never collide across jobs.
    data_id_base = job_id * 100000
    data_id_gen = DataIDGen(data_id_base)
    writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
    for f in features:
        pair = to_retrieval_format(self.tokenizer, self.max_seq_length, data_id_gen, f)
        info_d: Dict = pair[0]
        f2: List[OrderedDict] = pair[1]
        info_d_all.update(info_d)
        foreach(writer.write_feature, f2)
    writer.close()
    # Fix: close the info pickle explicitly so it is flushed to disk
    # (the original passed an unclosed open() to pickle.dump).
    info_path = os.path.join(self.info_out_dir, str(job_id))
    with open(info_path, "wb") as out_f:
        pickle.dump(info_d_all, out_f)
def main():
    """Write each review's claim pairs to its own CSV plus a summary CSV."""
    save_dir = at_output_dir("alamri_annotation1", "grouped_pairs")
    exist_or_mkdir(save_dir)
    summary = []
    # Fix: `List[Claim, Claim]` is not a valid typing form; a pair is
    # Tuple[Claim, Claim]. The NotImplemented placeholder is kept — this
    # function cannot run until it is filled in.
    grouped_claim_pairs: List[Tuple[Review, List[Tuple[Claim, Claim]]]] = NotImplemented
    for review_idx, (review, claim_pairs) in enumerate(grouped_claim_pairs):
        entries = [(claim1.text, claim2.text) for claim1, claim2 in claim_pairs]
        review_no = review_idx + 1
        save_path = os.path.join(save_dir, "{}.csv".format(review_no))
        # Fix: context managers so every CSV file is flushed and closed
        # (the original leaked each file handle).
        with open(save_path, "w", newline='', encoding="utf-8") as out_f:
            csv_writer = csv.writer(out_f)
            foreach(csv_writer.writerow, entries)
        summary.append((str(review_no), review.pmid, str(len(claim_pairs))))
    # NOTE(review): 'sumamry.csv' looks like a typo for 'summary.csv'; kept
    # as-is in case downstream tooling expects this exact filename — confirm.
    save_path = os.path.join(save_dir, 'sumamry.csv')
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        csv_writer = csv.writer(out_f)
        foreach(csv_writer.writerow, summary)
def lm_contribution():
    """Train the LM classifier and print the terms with the largest
    accumulated contribution over the training set, with their class probs."""
    train, val = load_feature_and_split()
    print("Training lm")
    stopwords = load_stopwords()
    def fileter_fn(data_point):
        # data_point is ((tf, num), y); strip stopwords/punctuation from the tf counter in place.
        remove_stopword_and_punct(stopwords, data_point[0][0])
    foreach(fileter_fn, train)
    classifier = learn_lm(train)
    acc_contrib = Counter()  # term -> contribution summed over all training points
    for data_point in train:
        (tf, num), y = data_point
        contrib = classifier.counter_contribution(tf)
        # Disabled per-datapoint debug output, kept from the original:
        # print("{} {} {}".format(y, classifier.predict(tf), classifier.counter_odd(tf)))
        # print("--------------")
        # Only each datapoint's top-100 contributing terms are accumulated.
        for t, score in contrib.most_common(100):
            acc_contrib[t] += score
    for t, score in acc_contrib.most_common(100):
        print(t, score, classifier.P_w_C_dict[t], classifier.P_w_NC_dict[t])
def write_records(records: List[PairedInstance], max_seq_length, output_path):
    """Encode good/worse passage pairs as paired features and write a record file."""
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        # Re-tokenize pre-split tokens into wordpieces, preserving order.
        pieces: List[str] = []
        for token in tokens:
            pieces.extend(tokenizer.tokenize(token))
        return pieces

    def encode(inst: PairedInstance) -> OrderedDict:
        seg1: List[str] = tokenizer.tokenize(inst.candidate_text)
        # Reserve room for [CLS] and two [SEP] tokens.
        seg2_budget = max_seq_length - 3 - len(seg1)

        def build_segments(raw_tokens: List[str]):
            seg2 = tokenize_from_tokens(raw_tokens)[:seg2_budget]
            tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
            segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
            return tokens[:max_seq_length], segment_ids[:max_seq_length]

        good_tokens, good_seg = build_segments(inst.passage_good)
        worse_tokens, worse_seg = build_segments(inst.passage_worse)
        features = combine_features(good_tokens, good_seg, worse_tokens, worse_seg,
                                    tokenizer, max_seq_length)
        features['strict_good'] = create_int_feature([inst.strict_good])
        features['strict_bad'] = create_int_feature([inst.strict_bad])
        return features

    writer = RecordWriterWrap(output_path)
    for record in records:
        writer.write_feature(encode(record))
    writer.close()
def main():
    """Render per-token CPPNC scores for the first 10 claims as an HTML report.

    For each claim: group per-passage average scores by document, then for
    each document print the tokens with color-coded highlights (blue for
    positive scores, red for negative), one row per score layer.
    """
    print("Loading scores...")
    cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score_wrap()
    baseline_cid_grouped = load_baseline("train_baseline")
    gold = get_claim_perspective_id_dict()
    tokenizer = get_tokenizer()
    claim_d = load_train_claim_d()
    print("Start analyzing")
    html = HtmlVisualizer("cppnc_value_per_token_score.html")
    claim_cnt = 0
    for cid, pid_entries_d in cid_grouped.items():
        pid_entries_d: Dict[str, List[Dict]] = pid_entries_d
        pid_entries: List[Tuple[str, List[Dict]]] = list(pid_entries_d.items())
        baseline_pid_entries = baseline_cid_grouped[int(cid)]
        baseline_score_d = fetch_score_per_pid(baseline_pid_entries)
        gold_pids = gold[int(cid)]
        ret = collect_score_per_doc(baseline_score_d, get_score_from_entry, gold_pids, pid_entries)
        passage_tokens_d = collect_passage_tokens(pid_entries)
        doc_info_d: Dict[int, Tuple[str, int]] = ret[0]  # doc_idx -> (doc_id, passage_idx)
        doc_value_arr: List[List[float]] = ret[1]        # doc_idx -> per-passage values
        # Group (doc_id, passage_idx, avg_score) triples by document.
        kdp_result_grouped = defaultdict(list)
        for doc_idx, doc_values in enumerate(doc_value_arr):
            doc_id, passage_idx = doc_info_d[doc_idx]
            avg_score = average(doc_values)
            kdp_result = doc_id, passage_idx, avg_score
            kdp_result_grouped[doc_id].append(kdp_result)
        s = "{} : {}".format(cid, claim_d[int(cid)])
        html.write_headline(s)
        claim_cnt += 1
        if claim_cnt > 10:  # only report the first 10 claims
            break
        scores: List[float] = list([r[2] for r in doc_value_arr])
        foreach(html.write_paragraph, lmap(str, scores))
        for doc_id, kdp_result_list in kdp_result_grouped.items():
            html.write_headline(doc_id)
            tokens, per_token_score = combine_collect_score(tokenizer, doc_id, passage_tokens_d, kdp_result_list)
            str_tokens = tokenizer.convert_ids_to_tokens(tokens)
            row = cells_from_tokens(str_tokens)
            # First row: tokens highlighted by each token's first score.
            for idx in range(len(str_tokens)):
                score = per_token_score[idx][0]
                norm_score = min(abs(score) * 10000, 100)
                color = "B" if score > 0 else "R"
                row[idx].highlight_score = norm_score
                row[idx].target_color = color
            rows = [row]
            # Following rows: one per additional score layer, until no token
            # has an nth score left.
            nth = 0
            any_score_found = True
            while any_score_found:
                any_score_found = False
                score_list = []
                for idx in range(len(str_tokens)):
                    if nth < len(per_token_score[idx]):
                        score = per_token_score[idx][nth]
                        any_score_found = True
                    else:
                        score = "-"  # placeholder for tokens with no nth score
                    score_list.append(score)
                def get_cell(score):
                    if score == "-":
                        return Cell("-")
                    else:
                        # 0.01 -> 100
                        norm_score = min(abs(score) * 10000, 100)
                        color = "B" if score > 0 else "R"
                        return Cell("", highlight_score=norm_score, target_color=color)
                nth += 1
                if any_score_found:
                    row = lmap(get_cell, score_list)
                    rows.append(row)
            html.multirow_print_from_cells_list(rows)
def test_logistic_regression():
    """Train a bag-of-words logistic regression and print accuracy plus the
    features with the largest (average-input x coefficient) contribution."""
    train_and_val = load_feature_and_split()
    train: List[Dict] = train_and_val[0]
    val: List[Dict] = train_and_val[1]
    valid_datapoint_list: List[Dict] = train + val
    stopwords = load_stopwords()
    def fileter_fn(data_point: Dict):
        # Strip stopwords/punctuation from the tf counter in place.
        remove_stopword_and_punct(stopwords, data_point['feature'])
    foreach(fileter_fn, train)
    foreach(fileter_fn, val)
    # Vocabulary = 10k most common terms over train + val combined.
    tf_list = lmap(lambda dp: dp['feature'], valid_datapoint_list)
    tf_acc = Counter()
    for tf in tf_list:
        tf_acc.update(tf)
    voca: List[str] = left(tf_acc.most_common(10000))
    #voca = set(flatten(lmap(get_voca_from_datapoint, valid_datapoint_list)))
    voca2idx: Dict[str, int] = dict(zip(list(voca), range(len(voca))))
    idx2voca: Dict[int, str] = {v: k for k, v in voca2idx.items()}
    print("Num voca:", len(voca))
    # One extra slot (the last) holds the num_mention feature.
    feature_size = len(voca) + 1
    def featurize(datapoint: Dict):
        tf = datapoint['feature']
        y = int(datapoint['label'])
        v = np.zeros([feature_size])
        for t, prob in tf.items():
            if t in voca2idx:
                v[voca2idx[t]] = prob
        v[-1] = datapoint['num_mention']
        return v, int(y)
    x, y = zip(*lmap(featurize, train))
    val_x, val_y = zip(*lmap(featurize, val))
    model = LogisticRegression()
    model.fit(x, y)
    x_a = np.array(x)
    print(x_a.shape)
    # NOTE(review): named avg_x but computed as a sum, not a mean — confirm intent.
    avg_x = np.sum(x_a, axis=0)
    def acc(y, pred_y):
        return np.average(np.equal(y, pred_y))
    pred_y = model.predict(x)
    print("train acc", acc(y, pred_y))
    print("val acc", acc(val_y, model.predict(val_x)))
    # Contribution per feature = summed feature value * learned coefficient.
    t = np.multiply(avg_x, model.coef_)
    contrib = t[0]
    ranked_idx = np.argsort(contrib)  # ascending: smallest contributions first
    def print_feature_at(idx):
        if idx == feature_size - 1:
            print("[NUM_MENTION]", contrib[idx])
        else:
            print(idx2voca[idx], contrib[idx])
    # NOTE(review): argsort is ascending, so this "(POS)" loop prints the
    # smallest contributions and "(NEG)" the largest — labels may be swapped; confirm.
    print("Top k features (POS)")
    for i in range(30):
        idx = ranked_idx[i]
        print_feature_at(idx)
    print("Top k features (NEG)")
    for i in range(30):
        j = len(voca) - 1 - i
        idx = ranked_idx[j]
        print_feature_at(idx)
    print("In training data")
    print("pred\tgold\tterms")
    for i in range(100):
        terms = left(train[i]['feature'].most_common(50))
        terms = list(terms[25:])  # show only ranks 26-50 of each datapoint's terms
        print(pred_y[i], y[i], terms)
    #
# NOTE(review): the following `yield` is the tail of a generator (presumably
# build_lm) whose `def` lies above this chunk and is not visible here.
    yield RelevanceModel(query.id.id, query.text, counter)


if __name__ == "__main__":
    # Build per-query relevance LMs for the training split and, for the first
    # 10, print top LM terms plus top/bottom log-odds terms against the
    # averaged background LM.
    split = "training"
    lms: List[Tuple[str, Counter]] = list(build_lm(split))
    alpha = 0.1  # smoothing weight toward the background LM
    bg_lm = average_counters(lmap(lambda x: x.lm, lms))

    def show(r: RelevanceModel):
        print('----')
        print(r.text)
        log_topic_lm = get_lm_log(smooth(r.lm, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        for k, v in r.lm.most_common(50):
            print(k, v)
        s = "\t".join(left(r.lm.most_common(10)))
        print("LM freq: ", s)
        print(s)
        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)
        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)

    foreach(show, lms[:10])