def run():
    # Convert the abortion-topic dev and train tfrecords into 2-way label versions.
    in_dir_name = FileName("ukp_paragraph_tfrecord_dev_abortion")
    out_dir_name = FileName("ukp_paragraph_tfrecord_dev_abortion_2way")
    run_dir(in_dir_name, out_dir_name)

    in_dir_name = FileName("ukp_paragraph_tfrecord_train_abortion")
    out_dir_name = FileName("ukp_paragraph_tfrecord_train_abortion_2way")
    run_dir(in_dir_name, out_dir_name)
def count_tf():
    continuation_tokens = get_continuation_token_ids()
    dir_path = pjoin(output_path, FileName("nli_tfrecord_cls_300"))
    tf_train = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("train")))
    tf_dev = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("dev_mis")))
    save_to_pickle(tf_dev, "nli_tf_dev_mis")
    save_to_pickle(tf_train, "nli_tf_train")
def run_para_scorer():
    claims, val = train_split()
    top_k = 6
    target = filter_avail(val)
    print("targets", len(target))
    score_pred_file: FileName = FileName("pc_para_D_pred")
    cpid_resolute_file: FileName = FileName("resolute_dict_580_606")
    pred = predict_by_para_scorer(score_pred_file, cpid_resolute_file, target, top_k)
    print(evaluate(pred))
def count_tf():
    continuation_tokens = get_continuation_token_ids()
    dir_path = pjoin(output_path, FileName("eHealth"))
    tf_train = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("tfrecord_train")))
    tf_dev = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("tfrecord_test")))
    save_to_pickle(tf_train, "clef1_tf_train")
    save_to_pickle(tf_dev, "clef1_tf_test")
def run_baseline():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 5
    score_pred_file: FileName = FileName("pc_para_D_pred_dev_11")
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    # score_pred_file: FileName = FileName("pc_para_D_pred_dev")
    # cpid_resolute_file: FileName = FileName("resolute_dict_dev")
    pred = predict_by_para_scorer(score_pred_file, cpid_resolute_file, claims, top_k)
    print(evaluate(pred))
def filter_avail(claims):
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(FileName("resolute_dict_580_606"))
    cid_list: List[int] = lmap(lambda x: int(x.split("_")[0]), cpid_resolute.values())
    cid_set: Set[int] = set(cid_list)
    return lfilter(lambda x: x['cId'] in cid_set, claims)
def compare_before_after():
    tokenizer = get_tokenizer()
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("heavy metal"))
    dir_name = pjoin(pjoin(output_path, FileName("model")), FileName("alt_emb_heavy_metal_D"))
    before = pjoin(dir_name, FileName("model.ckpt-0"))
    after = pjoin(dir_name, FileName("model.ckpt-10000"))
    v1_d = load_checkpoint_vars(before)
    v2_d = load_checkpoint_vars(after)

    # Report every variable whose values changed noticeably between the two checkpoints.
    for key in v1_d:
        if key in v2_d:
            s = np.sum(v1_d[key] - v2_d[key])
            if np.abs(s) > 0.01:
                print(key, s)

    ori_emb = v2_d['bert/embeddings/word_embeddings']
    alt_emb_before = v1_d['bert/embeddings/word_embeddings_alt']
    alt_emb_after = v2_d['bert/embeddings/word_embeddings_alt']

    def show_diff_from_ori(token_id):
        diff = np.sum(np.abs(ori_emb[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    def show_diff_from_step0(token_id):
        diff = np.sum(np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    print("Diff against original embedding")
    print("Target words")
    for token_id in ids:
        show_diff_from_ori(token_id)
    print("Random words")
    for token_id in [321, 598, 5854]:
        show_diff_from_ori(token_id)

    # Scan the whole vocabulary for alternate-embedding rows that moved from their step-0 values.
    print("Diff against step0 random init embedding")
    print("Target words")
    for token_id in range(0, 30000):
        diff = np.sum(np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        if diff > 0.001:
            print(token_id, diff)
def print_features():
    job_dir = "ukp_paragraph_feature_2"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    features: List[ParagraphFeature] = pickle.load(open(file_path, "rb"))
    out_path = pjoin(output_path, FileName("ukp_paragraph_feature_2.html"))
    print_paragraph_feature(features, out_path)
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)
    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
def eval_from_prediction(prediction_path):
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    top_k = 5
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)
    print("cpid_resolute has {}".format(len(cpid_resolute)))
    strategy = "avg"
    score_d: Dict[CPID, float] = get_cpid_score(prediction_path, cpid_resolute, strategy)
    return eval_from_score_d(score_d, top_k)
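# Hedged usage sketch, not part of the original file: eval_from_prediction only needs a
# path to a prediction file. The file name "pc_para_D_pred_dev_11" is reused from
# run_baseline above as an illustrative argument; whether it is the intended input for
# this function is an assumption.
if __name__ == "__main__":
    prediction_path = pjoin(output_path, FileName("pc_para_D_pred_dev_11"))
    print(eval_from_prediction(prediction_path))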
def __init__(self, word_list: List[str], out_path):
    self.out_dir = out_path
    tokenizer = get_tokenizer()
    self.seq_set: List[List[int]] = []
    self.input_dir = pjoin(sydney_working_dir, FileName("alt_emb_heavy_metal"))
    for word in word_list:
        subwords = tokenizer.tokenize(word)
        ids = tokenizer.convert_tokens_to_ids(subwords)
        print(subwords, ids)
        self.seq_set.append(ids)
def print_features():
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    features: List[ParagraphClaimPersFeature] = pickle.load(open(file_path, "rb"))
    features: List[ParagraphFeature] = lmap(to_paragraph_feature, features)
    out_path = pjoin(output_path, FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(features, out_path)
def show_bert_nli_diff():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"), FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"), FileName("bert_model.ckpt"))
    show_embedding_difference(bert, nli)
def get_nli_and_bert_embeddings():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"), FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"), FileName("bert_model.ckpt"))
    nli_emb = get_embedding_table(nli)
    bert_emb = get_embedding_table(bert)
    return bert_emb, nli_emb
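# Hedged usage sketch, not part of the original file: list which vocabulary rows differ
# between the BERT and NLI embedding tables, assuming get_embedding_table returns numpy
# arrays of shape [vocab_size, hidden]. The 0.001 threshold mirrors compare_before_after
# above and is only illustrative.
def show_moved_embedding_rows(threshold=0.001):
    bert_emb, nli_emb = get_nli_and_bert_embeddings()
    diff_per_token = np.sum(np.abs(bert_emb - nli_emb), axis=1)
    for token_id in np.nonzero(diff_per_token > threshold)[0]:
        print(token_id, diff_per_token[token_id])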
def work():
    q_config_id = Q_CONFIG_ID_BM25_UKP
    ranked_list_save_root = get_ranked_list_save_dir(q_config_id)

    # Collect the unique doc ids appearing in the top-100 of every ranked list.
    doc_ids = set()
    ticker = TimeEstimator(num_query_file)
    for i in range(num_query_file):
        file_name = FileName("{}_{}.txt".format(index_name_list[0], str(i)))
        ranked_list_path = pjoin(ranked_list_save_root, file_name)
        rl: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(ranked_list_path)
        for key, value in rl.items():
            for entry in value[:100]:
                doc_ids.add(entry.doc_id)
        ticker.tick()

    # Write the collected doc ids, one per line.
    f = open("{}_uniq_100".format(q_config_id), "w")
    for doc_id in doc_ids:
        f.write("{}\n".format(doc_id))
    f.close()
def get_query_file(query_collection_id, i) -> FilePath:
    return pjoin(get_query_dir(query_collection_id), FileName("{}.json".format(i)))
from arg.ukp.eval import eval
from base_type import FileName

if __name__ == "__main__":
    pred_file = FileName("ukp_para_pred")
    resolute_file = FileName("ukp_resolute_dict")
    eval(pred_file, resolute_file)
from arg.ukp.eval import eval
from base_type import FileName

if __name__ == "__main__":
    pred_file = FileName("ukp_pred_para_E_2way")
    resolute_file = FileName("ukp_resolute_dict")
    eval(pred_file, resolute_file, n_way=2)
def get_query_dir(query_collection_id) -> FilePath:
    out_dir = pjoin(output_path, FileName("ukp_query_{}".format(query_collection_id)))
    return out_dir
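# Hedged example, not part of the original file: an illustrative invocation only.
# query_collection_id 3 is a made-up value; it resolves to a path like
# <output_path>/ukp_query_3.
if __name__ == "__main__":
    print(get_query_dir(3))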
        html = html_pos if is_true else html_neg
        claim, perspective, paragraph = split_3segments(tokens)
        highlight_terms = set(claim + perspective)

        if is_true:
            html.write_paragraph("{} of {}".format(idx_true, cnt_true))
            idx_true += 1
        else:
            html.write_paragraph("{} of {}".format(idx_false, cnt_false))
            idx_false += 1

        html.write_paragraph("claim : " + pretty_tokens(claim))
        html.write_paragraph("perspective : " + pretty_tokens(perspective))

        def make_cell(subword: Subword):
            if subword in highlight_terms:
                return Cell(subword, highlight_score=100)
            else:
                return Cell(subword)

        cells = lmap(make_cell, paragraph)
        html.multirow_print(cells)
        if item_cnt > 100:
            break


if __name__ == "__main__":
    pred_file = FileName("pc_para_D_pred_dev")
    pred_path: FilePath = pjoin(output_path, pred_file)
    print_file(pred_path)
        pc_tokens: List[str] = (nltk.word_tokenize(f.claim_pers.claim_text)
                                + nltk.word_tokenize(f.claim_pers.p_text))
        pc_tokens_set = set([t.lower() for t in pc_tokens])
        print(pc_tokens_set)

        def get_cell(token) -> Cell:
            if token.lower() in pc_tokens_set:
                score = 100
            else:
                score = 0
            return Cell(token, score)

        html_visualizer.write_paragraph("Label : {}".format(f.claim_pers.label))
        for score_paragraph in f.feature:
            paragraph = score_paragraph.paragraph
            cells = [get_cell(t) for t in paragraph.tokens]
            html_visualizer.write_paragraph("---")
            html_visualizer.multirow_print(cells, width=20)


if __name__ == "__main__":
    input_job_name: FileName = FileName("perspective_paragraph_feature_dev")
    input_dir = pjoin(output_path, input_job_name)
    job_id = 0
    features: List[ParagraphClaimPersFeature] = pickle.load(
        open(pjoin(input_dir, FileName(str(job_id))), "rb"))
    html = HtmlVisualizer("pers_dev_para_features.html")
    show(html, features)
def get_ranked_list_save_dir(q_config_id):
    return pjoin(subproject_hub, FileName("{}_q_res".format(q_config_id)))