def enum_all_argument(split) -> Iterable[Passage]:
    """Yield every extracted-argument Passage for a data split.

    Walks extracted_arguments/<split>/<topic>/<debate-dir>/{_con,pro}
    and yields one Passage per file found.

    :param split: split name; must be one of header.splits
    """
    assert split in header.splits
    all_topic_dir = pjoin(extracted_arguments, split)
    for topic in header.topics:
        per_topic_dir = pjoin(all_topic_dir, topic)
        for maybe_dir_obj in os.scandir(per_topic_dir):
            if not maybe_dir_obj.is_dir():
                continue
            dir_path = maybe_dir_obj.path
            con_dir = pjoin(dir_path, "_con")
            pro_dir = pjoin(dir_path, "pro")

            def load_files_in_dir(target_dir_path):
                # Only the two stance sub-directories are expected here.
                assert os.path.basename(target_dir_path) in ["_con", "pro"]
                for file_path in get_dir_files(target_dir_path):
                    # BUGFIX: use a context manager so the handle is closed;
                    # the original leaked one open file per passage.
                    with open(file_path, "r", encoding='utf-8') as f:
                        content = f.read()
                    rel_path = get_rel_path(file_path, extracted_arguments)
                    yield Passage(content, ArguDataID.from_rel_path(rel_path))

            yield from load_files_in_dir(con_dir)
            yield from load_files_in_dir(pro_dir)
def show_bert_nli_diff():
    """Show embedding differences between the vanilla BERT and NLI-tuned checkpoints."""
    runs_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli_ckpt = os.path.join(runs_dir, FileName("nli"), FileName("model.ckpt-75000"))
    bert_ckpt = os.path.join(
        runs_dir,
        FileName("uncased_L-12_H-768_A-12"),
        FileName("bert_model.ckpt"),
    )
    show_embedding_difference(bert_ckpt, nli_ckpt)
def alt_from_clueweb12_13A():
    """Merge the NLI checkpoint with the alt_emb_H (20K-step) embedding checkpoint."""
    models_dir = pjoin(output_path, "models")
    nli_ckpt = pjoin(pjoin(models_dir, "nli_bert_300_K"), "model.ckpt-73150")
    alt_ckpt = pjoin(pjoin(models_dir, "alt_emb_H"), "model.ckpt-20000")
    out_ckpt = os.path.join(output_path, "models", "nli_alt_emb_H20K", "model.ckpt-73150")
    combine(nli_ckpt, alt_ckpt, out_ckpt)
def get_nli_and_bert_embeddings():
    """Load the word-embedding tables of the BERT and NLI checkpoints.

    Returns:
        Tuple (bert_emb, nli_emb) of embedding tables.
    """
    runs_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli_ckpt = os.path.join(runs_dir, FileName("nli"), FileName("model.ckpt-75000"))
    bert_ckpt = os.path.join(
        runs_dir,
        FileName("uncased_L-12_H-768_A-12"),
        FileName("bert_model.ckpt"),
    )
    return get_embedding_table(bert_ckpt), get_embedding_table(nli_ckpt)
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    """Convert every file under in_dir_name to 2-way format, mirroring names into out_dir_name."""
    src_dir = pjoin(sydney_working_dir, in_dir_name)
    dst_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(dst_dir)
    for src_path in get_dir_files(src_dir):
        base_name = FileName(os.path.basename(src_path))
        convert_to_2way(src_path, pjoin(dst_dir, base_name))
def count_tf():
    """Count word term-frequencies in the NLI tfrecords and pickle the results."""
    cont_ids = get_continuation_token_ids()
    tfrecord_dir = pjoin(output_path, FileName("nli_tfrecord_cls_300"))
    train_tf = build_word_tf(cont_ids, pjoin(tfrecord_dir, FileName("train")))
    dev_tf = build_word_tf(cont_ids, pjoin(tfrecord_dir, FileName("dev_mis")))
    save_to_pickle(dev_tf, "nli_tf_dev_mis")
    save_to_pickle(train_tf, "nli_tf_train")
def count_tf():
    """Count word term-frequencies for the eHealth tfrecords and pickle them.

    BUGFIX: the original saved tf_dev under "eHealth_tf_train" and tf_train
    under "eHealth_tf_dev" (swapped); each pickle name now matches its content,
    consistent with the analogous clef1_tf_* saver in this file.
    """
    continuation_tokens = get_continuation_token_ids()
    out_dir = pjoin(output_path, "eHealth")
    train_save_path = pjoin(out_dir, "tfrecord_train")
    test_save_path = pjoin(out_dir, "tfrecord_test")
    tf_train = build_word_tf(continuation_tokens, train_save_path)
    tf_dev = build_word_tf(continuation_tokens, test_save_path)
    save_to_pickle(tf_train, "eHealth_tf_train")
    save_to_pickle(tf_dev, "eHealth_tf_dev")
def main():
    """Convert the eHealth test tfrecord into the ehealth_alt layout."""
    src_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(src_dir)
    input_path_train = pjoin(src_dir, "tfrecord_train")
    input_path_test = pjoin(src_dir, "tfrecord_test")
    dst_dir = os.path.join(output_path, "ehealth_alt")
    exist_or_mkdir(dst_dir)
    output_file_path_train = os.path.join(dst_dir, "train")
    output_file_path_test = os.path.join(dst_dir, "test")
    # NOTE(review): the train paths are built but never converted — only the
    # test split is processed. Confirm whether convert(input_path_train,
    # output_file_path_train) was intentionally omitted.
    convert(input_path_test, output_file_path_test)
def count_tf():
    """Count word term-frequencies for the CLEF eHealth tfrecords and pickle them."""
    cont_ids = get_continuation_token_ids()
    ehealth_dir = pjoin(output_path, FileName("eHealth"))
    train_tf = build_word_tf(cont_ids, pjoin(ehealth_dir, FileName("tfrecord_train")))
    test_tf = build_word_tf(cont_ids, pjoin(ehealth_dir, FileName("tfrecord_test")))
    save_to_pickle(train_tf, "clef1_tf_train")
    save_to_pickle(test_tf, "clef1_tf_test")
def count_tf():
    """Per UKP topic, count term frequencies for train/dev data and pickle them."""
    cont_ids = get_continuation_token_ids()
    dataset_dir = pjoin(data_path, "ukp_300")
    for topic in all_topics:
        train_path = pjoin(dataset_dir, "train_{}".format(topic))
        dev_path = pjoin(dataset_dir, "dev_{}".format(topic))
        train_tf = build_word_tf(cont_ids, train_path)
        dev_tf = build_word_tf(cont_ids, dev_path)
        save_to_pickle(train_tf, "tf_train_{}".format(topic))
        save_to_pickle(dev_tf, "tf_dev_{}".format(topic))
def combine_nli_alt_emb():
    """Merge the NLI checkpoint with the alt_emb_G (100K-step) embedding checkpoint."""
    models_dir = pjoin(output_path, "models")
    nli_ckpt = pjoin(pjoin(models_dir, "nli_bert_300_K"), "model.ckpt-73150")
    # Earlier candidates tried: alt_emb_F/model.ckpt-10000, alt_emb_G/model.ckpt-0
    alt_emb_checkpoint = pjoin(pjoin(models_dir, "alt_emb_G"), "model.ckpt-100000")
    save_path = os.path.join(output_path, "models", "nli_alt_emb_100KF", "model.ckpt-73150")
    combine(nli_ckpt, alt_emb_checkpoint, save_path)
def load_label(split, topic) -> Iterator:
    """Load label rows for one split/topic.

    Only "05-theme-counters.tsv" (index 4) is read; the other file names are
    kept for reference.
    """
    topic_dir = pjoin(pjoin(pair_best_counter, split), topic)
    file_list = [
        "01-debate-opposing-counters.tsv",
        "02-debate-counters.tsv",
        "03-debate-opposing-arguments.tsv",
        "04-debate-arguments.tsv",
        "05-theme-counters.tsv",
        "06-theme-arguments.tsv",
    ]
    return load_tsv_or_from_zip(topic_dir, file_list[4])
def main():
    """Convert per-topic UKP train/dev tfrecords into the ukp_alt layout."""
    src_dir = pjoin(data_path, "ukp_300")
    for topic in all_topics:
        train_src = pjoin(src_dir, "train_{}".format(topic))
        dev_src = pjoin(src_dir, "dev_{}".format(topic))
        dst_dir = os.path.join(output_path, "ukp_alt")
        exist_or_mkdir(dst_dir)
        train_dst = os.path.join(dst_dir, "train_{}".format(topic))
        dev_dst = os.path.join(dst_dir, "dev_{}".format(topic))
        convert(dev_src, dev_dst)
        convert(train_src, train_dst)
def save_concat_dev():
    """Sum pipeline-2 scores per CPID for the dev set and pickle the result."""
    # Earlier run used: pjoin(output_path, "pc_long_seq11")
    prediction_path = pjoin(output_path, "pc_long_focus_1")
    scores: Dict[CPID, List[float]] = collect_pipeline2_score(
        prediction_path, "pc_rel_dev_info_all")
    summed: Dict[CPID, float] = dict_value_map(sum, scores)
    save_to_pickle(summed, "pc_concat_dev_score")
def bert_baseline_repeat():
    """Write ranked lists for repeated eHealth BERT-freeze runs 3..5."""
    info = load_from_pickle("eHealth_test_info")
    for run_idx in (3, 4, 5):
        prediction_name = "eHealth_bert_freeze_{}".format(run_idx)
        viewer = EstimatorPredictionViewerGosford(prediction_name)
        ranked_list_path = pjoin(subdir_root, "bert_baseline_{}.txt".format(run_idx))
        prediction_to_ranked_list(viewer, info, ranked_list_path)
def save_for_train():
    """Join pc_rel predictions with CPIDs and pickle the combined mapping."""
    info = load_from_pickle("pc_rel_info_all")
    pred_path = pjoin(output_path, "pc_rel")
    rel_info: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = \
        combine_pc_rel_with_cpid(pred_path, info)
    save_to_pickle(rel_info, "pc_rel_with_cpid")
def pc_predict_by_bert_next_sent(bm25_module: BM25, claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives per claim by mixing BM25 with a remote BERT score.

    The BERT model is reached over XML-RPC (hard-coded host/port below);
    ns_score is the negated first element of the remote prediction and is
    added to the BM25 score with a fixed weight of 10.

    :param bm25_module: BM25 scorer providing .score(claim_text, persp_text)
    :param claims: claims to rank perspectives for
    :param top_k: number of results kept per claim by predict_interface
    :return: ranked predictions as produced by predict_interface
    """
    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # Example usage :
    proxy = xmlrpc.client.ServerProxy(
        'http://ingham.cs.umass.edu:{}'.format(port))
    voca_path = pjoin(data_path, "bert_voca.txt")
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        # query_id is "<claim_id>_<perspective_id>".
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        # Remote call per query pair; r[0] is the model's first logit/score.
        r = proxy.predict(payload)
        ns_score = -float(r[0])
        #ns_score = 0
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        # Keep the BM25 explanation string and append the ns component.
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
def compare_before_after():
    """Inspect how the alternate embedding table drifts during training.

    Loads the step-0 and step-10000 checkpoints of alt_emb_heavy_metal_D,
    prints every variable whose summed change exceeds 0.01, then reports
    per-token embedding distances for the word-piece ids of "heavy metal",
    a few arbitrary token ids, and finally every token (ids 0..29999)
    whose alt embedding moved at all.
    """
    tokenizer = get_tokenizer()
    # Word-piece ids of the target phrase.
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("heavy metal"))
    dir_name = pjoin(pjoin(output_path, FileName("model")), FileName("alt_emb_heavy_metal_D"))
    before = pjoin(dir_name, FileName("model.ckpt-0"))
    after = pjoin(dir_name, FileName("model.ckpt-10000"))
    v1_d = load_checkpoint_vars(before)
    v2_d = load_checkpoint_vars(after)
    # Print any variable whose summed element-wise difference is non-negligible.
    for key in v1_d:
        if key in v2_d:
            s = np.sum(v1_d[key] - v2_d[key])
            if np.abs(s) > 0.01:
                print(key, s)
    ori_emb = v2_d['bert/embeddings/word_embeddings']
    alt_emb_before = v1_d['bert/embeddings/word_embeddings_alt']
    alt_emb_after = v2_d['bert/embeddings/word_embeddings_alt']

    def show_diff_from_ori(token_id):
        # L1 distance between the trained alt embedding and the original table.
        diff = np.sum(np.abs(ori_emb[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    def show_diff_from_step0(token_id):
        # L1 distance between the alt embedding before and after training.
        # NOTE(review): defined but never called; the final loop below
        # inlines this same computation.
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    print("Diff against original embedding")
    print("Target words")
    for token_id in ids:
        show_diff_from_ori(token_id)
    print("Random words")
    for token_id in [321, 598, 5854]:
        show_diff_from_ori(token_id)
    print("Diff against step0 random init embedding")
    print("Target words")
    for token_id in range(0, 30000):
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        if diff > 0.001:
            print(token_id, diff)
def main():
    """Build eHealth train/test tfrecords from a filtered BM25 ranked list."""
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    train_save_path = pjoin(out_dir, "tfrecord_train")
    test_save_path = pjoin(out_dir, "tfrecord_test")
    ranked_list_path = FilePath(
        os.path.join(output_path, "eHealth", "bm25_filtered.list"))
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()
    # Write each split and pickle its info dict under a matching name.
    for queries, save_path, info_name in [
            (train_queries, train_save_path, "eHealth_train_info"),
            (test_queries, test_save_path, "eHealth_test_info")]:
        info = write_tfrecord(ranked_list, queries, qrels, save_path)
        save_to_pickle(info, info_name)
def print_features():
    """Render the pickled UKP paragraph features of job 0 as an HTML page."""
    job_dir = "ukp_paragraph_feature_2"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    # BUGFIX: use a context manager so the handle is closed (original leaked
    # it); the single-argument os.path.join(file_path) wrapper was a no-op
    # and is dropped.
    with open(file_path, "rb") as f:
        features: List[ParagraphFeature] = pickle.load(f)
    out_path = pjoin(output_path, FileName("ukp_paragraph_feature_2.html"))
    print_paragraph_feature(features, out_path)
def run_write_claims_as_plain_query():
    """Write train/dev claims as plain-text queries, one per line.

    BUGFIX: the output file is now closed via a context manager (the
    original leaked the handle, risking unflushed output).
    """
    for claim_ids, out_name in [
            (load_train_claim_ids(), "train_claim_query_raw.txt"),
            (load_dev_claim_ids(), "dev_claim_query_raw.txt")]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
def load_tsv_or_from_zip(dir_path, file_name) -> Iterator:
    """Yield rows of a TSV file, extracting <file_name>.zip first if needed.

    Fixes: the file handle is now closed via a context manager (closed when
    the generator is exhausted or closed), and the unused enumerate() index
    is dropped.

    :param dir_path: directory containing the TSV (and possibly its .zip)
    :param file_name: TSV file name within dir_path
    :return: iterator of row lists as produced by csv.reader
    """
    file_path = pjoin(dir_path, file_name)
    if not os.path.exists(file_path):
        print("extracting from zip...")
        zip_file_path = file_path + ".zip"
        extract_zip_file_at(zip_file_path, dir_path)
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            yield row
def __init__(self, word_list: List[str], out_path):
    """Record paths and tokenize each word into a word-piece id sequence."""
    self.out_dir = out_path
    tokenizer = get_tokenizer()
    self.seq_set: List[List[int]] = []
    self.input_dir = pjoin(sydney_working_dir, FileName("alt_emb_heavy_metal"))
    for word in word_list:
        pieces = tokenizer.tokenize(word)
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
        print(pieces, piece_ids)
        self.seq_set.append(piece_ids)
def print_features():
    """Render the pickled perspective paragraph features of job 0 as HTML."""
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    # BUGFIX: use a context manager so the handle is closed (original leaked
    # it); the single-argument os.path.join(file_path) wrapper was a no-op
    # and is dropped.
    with open(file_path, "rb") as f:
        raw_features: List[ParagraphClaimPersFeature] = pickle.load(f)
    features: List[ParagraphFeature] = lmap(to_paragraph_feature, raw_features)
    out_path = pjoin(output_path, FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(features, out_path)
def predict_by_para_scorer(score_pred_file_name: FileName,
                           cpid_resolute_file: FileName,
                           claims, top_k) -> List[Tuple[str, List[Dict]]]:
    """Rank perspectives per claim by mixing paragraph-classifier and Lucene scores.

    The combined score is 0.9 * cls_score + 0.1 * lucene_score / 20, where
    cls_score falls back to 0.5 for query ids missing from the score table.
    Per-claim hit/miss statistics are printed after prediction.

    :param score_pred_file_name: prediction file name under output_path
    :param cpid_resolute_file: file mapping raw ids to CPIDs
    :param claims: claims to rank perspectives for
    :param top_k: number of results kept per claim by predict_interface
    :return: ranked predictions as produced by predict_interface
    """
    suc_count = SuccessCounter()
    suc_count.reset()
    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    print("Loading cpid_resolute")
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)
    print("Loading paragraph triple scores")
    score_d: Dict[CPID, float] = get_cpid_score_from_cache_or_raw(pred_path, cpid_resolute, "avg")
    # Per-claim bookkeeping: hit/miss counts and high/low score tallies.
    per_claim_suc = {}
    per_claim_counter = {}

    def scorer(lucene_score, query_id):
        # query_id is "<claim_id>_<perspective_id>".
        claim_id, p_id = query_id.split("_")
        if claim_id not in per_claim_suc:
            per_claim_counter[claim_id] = Counter()
            per_claim_suc[claim_id] = SuccessCounter()
        if query_id in score_d:
            cls_score = score_d[query_id]
            per_claim_suc[claim_id].suc()
            # Tally confidently-positive (>0.8) and confidently-negative
            # (<0.3) classifier scores for the summary printed below.
            if cls_score > 0.8:
                per_claim_counter[claim_id][1] += 1
            elif cls_score < 0.3:
                per_claim_counter[claim_id][0] += 1
            suc_count.suc()
        else:
            # Neutral fallback when no classifier score is available.
            cls_score = 0.5
            per_claim_suc[claim_id].fail()
            suc_count.fail()
        score = 0.9 * cls_score + 0.1 * lucene_score / 20
        return score

    r = predict_interface(claims, top_k, scorer)
    for claim in per_claim_suc:
        suc_counter = per_claim_suc[claim]
        print("{} suc/total={}/{} True/False={}/{}".format(
            claim,
            suc_counter.get_suc(),
            suc_counter.get_total(),
            per_claim_counter[claim][1],
            per_claim_counter[claim][0]
        ))
    print("{} found of {}".format(suc_count.get_suc(), suc_count.get_total()))
    return r
def main():
    """Write the baseline ranked list for the eHealth test queries."""
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    ranked_list_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/CLEF_eHealth_working/ranked_list_filtered"
    )
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()
    # NOTE(review): only the test split is exported; train_queries and qrels
    # are loaded but not used beyond this point.
    test_subset = {query.qid: ranked_list[query.qid] for query in test_queries}
    save_path = os.path.join(out_dir, 'test_baseline.list')
    write_ranked_list_from_d(test_subset, save_path)
def eval(
        score_pred_file_name: FileName,
        cpid_resolute_file: FileName,
        n_way=3,
):
    """Evaluate predictions for the "abortion" topic against dev labels.

    With n_way=2 the gold labels 1 and 2 are collapsed into one positive
    class. Prints per-label results and the macro-averaged F1.
    NOTE: the name shadows the builtin eval(); kept for caller compatibility.
    """
    topic = "abortion"
    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    dpid_resolute: Dict[str, DPID] = load_dpid_resolute(cpid_resolute_file)
    score_d: Dict[DPID, np.ndarray] = get_datapoint_score(pred_path, dpid_resolute, "avg")
    # Predicted class per datapoint = argmax over its score vector.
    pred_d: Dict[DPID, int] = dict_value_map(lambda arr: arr.argmax(), score_d)
    dev_labels = get_dev_labels(topic)
    if n_way == 2:
        def merge_label(e):
            # Collapse classes 1 and 2 into a single positive class.
            dpid, label = e
            return dpid, {0: 0, 1: 1, 2: 1}[label]
        dev_labels = lmap(merge_label, dev_labels)

    def fetch_pred(e: Tuple[DPID, int]):
        dpid, _ = e
        return pred_d[dpid]

    gold_list: List[int] = right(dev_labels)
    pred_list: List[int] = lmap(fetch_pred, dev_labels)
    if n_way == 3:
        all_result = eval_3label(gold_list, pred_list)
    elif n_way == 2:
        all_result = eval_2label(gold_list, pred_list)
    else:
        assert False
    print(all_result)
    f1 = sum([result['f1'] for result in all_result]) / n_way
    print("Avg F1 : ", f1)
def collect_save_relevance_score():
    """Collect per-CPID-pair relevance logits from pc_rel predictions and pickle them."""
    prediction_file = pjoin(output_path, "pc_rel")
    info_d = load_from_pickle("pc_rel_info_all")
    print("Building twostepdict")
    # (A TwoStepDict-based ordering sanity-check of info_d was previously
    # stubbed out here.)
    print("Collect pc_rel")
    relevance_scores: Dict[CPIDPair, List[Tuple[Logits, Logits]]] = \
        collect_pc_rel_score(prediction_file, info_d)
    save_to_pickle(relevance_scores, "pc_relevance_score")
def main():
    """Generate CORD tfrecords, using title + abstract as each document's text."""
    queries = load_queries()
    bm25_path = pjoin(cord_working_dir, "youngwoo_bm25_query")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(bm25_path)
    out_path = os.path.join(cord_working_dir, "tfrecord_2_4")
    max_seq_length = 512
    # (local name fixed: original had the typo "meat_data")
    meta_rows: List[Dict] = read_csv_as_dict(meta_data_path)
    text_dict = {row[str_cord_uid]: row[str_title] + " " + row[str_abstract]
                 for row in meta_rows}

    def get_text_from_doc_id(doc_id: str) -> str:
        return text_dict[doc_id]

    data_info_save_name = "data_info_save"
    tf_record_gen(ranked_list, queries, get_text_from_doc_id, out_path,
                  max_seq_length, data_info_save_name)
def main():
    """Interactive viewer: type "<cid> <pid>" to print matching pc_rel_dev docs."""
    info = load_from_pickle("pc_rel_dev_info_all")
    prediction_path = pjoin(output_path, "pc_rel_dev")
    rel_info: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = \
        load_from_pickle("pc_rel_dev_with_cpid")
    # Alternative: recompute via combine_pc_rel_with_cpid(prediction_path, info)
    doc_index = reverse_index(rel_info)
    tokenizer = get_tokenizer()
    while True:
        line = input()
        os.system('cls')  # clears the console ('cls' is Windows-specific)
        cid_str, pid_str = line.split()
        cpid = CPIDPair((int(cid_str), int(pid_str)))
        do_print(cpid, doc_index, tokenizer)