def save_to_csv():
    """Write (claim, perspective) pairs with gold labels as tab-separated
    files for the train/dev/test splits.

    A row gets gold_label 1 when the perspective id appears in any gold
    cluster for the claim, else 0.
    """
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        # Retrieve top-50 perspective candidates per claim.
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            # Gold clusters are lists of pid lists; flatten for membership test.
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                rows.append([c_text, p_text, y, cid, p_entry['pid']])
        # Fix: the original leaked the file handle; use a context manager.
        # newline='' is the documented requirement for csv writer targets.
        with open(out_path, "w", encoding="utf-8", newline='') as f:
            writer = csv.writer(f, dialect='excel-tab')
            writer.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))

    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))

    d_ids = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))
def run_reweight():
    """Run the re-weighting predictor (k1=0.5) on dev claims and print
    the evaluation result."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    param = {'k1': 0.5}
    predictions = predict_by_reweighter(get_bm25_module(), dev_claims, 7, param)
    print(evaluate(predictions))
def run_bm25_rm():
    """Evaluate BM25 + relevance-model prediction on the dev split."""
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    predictions = predict_by_bm25_rm(get_bm25_module(), rm_info, dev_claims, 7)
    print(evaluate(predictions))
def pc_new_init_prob():
    """Build and pickle per-claim initial word probability distributions.

    Each stemmed claim term contributes base_p per occurrence; global
    bias scores are added on top, then the distribution is normalized
    to sum to 1.
    """
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()
    # base_p: the largest bias score, used as the per-occurrence weight
    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        stemmed = tokenizer.tokenize_stem(claim_d[cid])
        # base_p per occurrence of each claim term
        score_for_cid = Counter(
            {term: n * base_p for term, n in Counter(stemmed).items()})
        # Counter.update ADDS counts, matching per-key += of bias scores
        score_for_cid.update(bias_plus_word)
        init_p_score_d[cid] = normalize_counter_to_sum1(score_for_cid)
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
def claim_language_model_property():
    """Print per-perspective language-model odds for each dev claim's
    candidate perspectives."""
    dev_claim_ids = load_dev_claim_ids()
    claims = get_claims_from_ids(dev_claim_ids)
    all_ranked_list = ClaimRankedList()
    all_voca = set()
    candidate_k = 50
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, candidate_k)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        ranked_list = all_ranked_list.get(str(claim['cId']))
        doc_ids = list(map(lambda t: t[0], ranked_list))
        print("Loading documents")
        preload_tf(doc_ids)
        docs = lmap(load_and_format_doc, doc_ids)
        for doc in docs:
            all_voca.update(doc['tokens_set'])
        # check hypothesis
        # check_hypothesis(all_voca, cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf, ctf_cont,
        #                  ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont, unigrams)
        print("counting terms stat")
        lm_classifier = build_lm(docs, unigrams)
        for p_entry in perspectives:
            _text, _pid, _score = p_entry
            tokens = nltk.word_tokenize(_text)
            score = sum(lmap(lm_classifier.per_token_odd, tokens))
            print(_text, score)
def save_dev():
    """Collect relevant candidate passages for dev claims and save them
    under the name "pc_dev_a_passages"."""
    save_name = "pc_dev_a_passages"
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/dev_claim/q_res_100")
    dev_claims: List[Dict] = get_claims_from_ids(list(load_dev_claim_ids()))
    a_relevant_candidate(save_name, q_res_path, dev_claims)
def write_claim_perspective_pair_as_query():
    """Write claim+perspective texts as BM25 queries for the chosen split,
    50 queries per output file."""
    split = "dev"
    assert split in ["train", "dev", "test"]
    id_loaders = {
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }
    d_ids = list(id_loaders[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0  # passed through to format_query_bm25

    def to_query(x: PerspectiveCandidate) -> DocQuery:
        # Query text is the concatenation of claim and perspective text.
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(to_query, all_data_points)
    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    write_queries_to_files(50, out_dir, queries)
def main():
    """Select paragraphs from dev-claim documents and pickle the result."""
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    selected = select_paragraph(docs, clue12_13_df, dev_claims, "topk")
    save_to_pickle(selected, "dev_claim_paras")
def run_eval_with_dict(pickle_name):
    """Evaluate dev predictions backed by the pickled score dict named
    by pickle_name (top-8 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    print("targets", len(dev_claims))
    score_d = load_from_pickle(pickle_name)
    predictions = predict_from_dict(score_d, dev_claims, 8)
    print(evaluate(predictions))
def run_eval_with_two_dict():
    """Evaluate dev predictions that combine the BERT baseline scores
    with the random-walk based scores (top-7 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    print("targets", len(dev_claims))
    score_d1 = load_from_pickle("pc_bert_baseline_score_d")
    score_d2 = load_from_pickle("pc_random_walk_based_score_d")
    predictions = predict_from_two_dict(score_d1, score_d2, dev_claims, 7)
    print(evaluate(predictions))
def save_random_walk_pred():
    """Score dev claims with the random-walk query vectors (top-50) and
    pickle the resulting score dictionary."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    predictions = pc_predict_from_vector_query(
        get_bm25_module(), q_tf_replace, dev_claims, 50)
    save_to_pickle(prediction_to_dict(predictions),
                   "pc_random_walk_based_score_d")
def run_lm():
    """Evaluate LM-based prediction on the dev split using the
    random-walk query term distribution (top-5 per claim)."""
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # Fix: the original loaded "pc_dev_par_tf" and immediately overwrote
    # it with the next line, leaving dead code. Kept as a toggle comment,
    # matching the style used in run_random_walk_score.
    # q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))
def run_random_walk_score_with_weight():
    """Evaluate vector-query prediction with re-weighting (k1=0.5,
    top-7 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    raw_scores = dict(load_from_pickle("random_walk_score_100"))
    # normalize claim-id keys to int for the predictor
    q_tf_replace = dict_key_map(int, raw_scores)
    predictions = pc_predict_vector_query_and_reweight(
        get_bm25_module(), q_tf_replace, dev_claims, 7, {'k1': 0.5})
    print(evaluate(predictions))
def generate_classification_payload():
    """Build and pickle top-50 elastic-search candidate sets for the
    train and dev splits."""
    top_k = 50
    train_claims, _val = train_split()
    save_to_pickle(predict_by_elastic_search(train_claims, top_k),
                   "perspective_cls_train_X")
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    save_to_pickle(predict_by_elastic_search(dev_claims, top_k),
                   "perspective_cls_dev_X")
def run_write_claims_as_plain_query():
    """Write train/dev claims as plain-text queries, one per line."""
    for claim_ids, out_name in [
            (load_train_claim_ids(), "train_claim_query_raw.txt"),
            (load_dev_claim_ids(), "dev_claim_query_raw.txt")]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        # Fix: the original never closed the file handle; use a context
        # manager so each file is flushed and closed deterministically.
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
def main():
    """Compare random-walk vector-query predictions against the BERT
    baseline on the dev split (top-50 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred2 = pc_predict_from_vector_query(bm25, q_tf_replace, dev_claims, top_k)
    baseline_score_d = load_from_pickle("pc_bert_baseline_score_d")
    pred1 = predict_from_dict(baseline_score_d, dev_claims, top_k)
    compare_two_runs(pred1, pred2)
def run_random_walk_score():
    """Evaluate vector-query prediction from random-walk term scores
    (top-7 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    raw_scores = dict(load_from_pickle("random_walk_score_100"))
    # normalize claim-id keys to int for the predictor
    q_tf_replace = dict_key_map(lambda x: int(x), raw_scores)
    # q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    # q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    predictions = pc_predict_from_vector_query(
        get_bm25_module(), q_tf_replace, dev_claims, 7)
    print(evaluate(predictions))
def run_random_walk_score():
    """Inspect predictions from two random-walk score variants side by side.

    NOTE(review): this redefines run_random_walk_score; if both versions
    live in the same module, this later definition shadows the earlier one.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    top_k = 20
    bm25 = get_bm25_module()
    # df, N = get_idf()
    # bm25.df = df
    # bm25.N = N
    q_tf_replace_0 = dict(load_from_pickle("random_walk_score_100"))
    debug_scores = dict(load_from_pickle("dev_claim_random_walk_debug2"))
    q_tf_replace = dict_key_map(lambda x: int(x), debug_scores)
    pc_predict_to_inspect(bm25, q_tf_replace, q_tf_replace_0, dev_claims, top_k)
def show_random_walk_score():
    """Print the 100 most common random-walk terms for every dev claim."""
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    # Fix: removed unused local `top_k = 7` left over from a sibling function.
    q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    for claim_id, qtf in q_tf_replace.items():
        print(claim_d[claim_id])
        print(qtf.most_common(100))
        print("")
def run_baseline():
    """Evaluate the paragraph-scorer baseline on the dev split
    (top-5 per claim)."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    print("targets", len(dev_claims))
    score_pred_file: FileName = FileName("pc_para_D_pred_dev_11")
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    # score_pred_file: FileName = FileName("pc_para_D_pred_dev")
    # cpid_resolute_file: FileName = FileName("resolute_dict_dev")
    predictions = predict_by_para_scorer(score_pred_file, cpid_resolute_file,
                                         dev_claims, 5)
    print(evaluate(predictions))
def run_lm2():
    """Evaluate LM-based prediction using raw claim-text term frequencies
    (top-5 per claim)."""
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # Fix: dropped an unused PCTokenizer() local; tokenization here is
    # done with nltk directly.
    tf_d = {c['cId']: Counter(nltk.tokenize.word_tokenize(c['text']))
            for c in claims}
    bm25 = get_bm25_module()
    ctf = get_perspective_tf()
    pred = predict_by_lm(tf_d, ctf, bm25, claims, top_k)
    print(evaluate(pred))
def write_claim_queries_k0():
    """Write k0 claim queries for the train and dev splits as JSON files."""
    def write(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims, True)
        out_path = os.path.join(
            output_path,
            "perspective_{}_claim_query_k0.json".format(split_name))
        save_queries_to_file(queries, out_path)

    write(load_train_claim_ids(), "train")
    write(load_dev_claim_ids(), "dev")
def load_cppnc_score_and_baseline_and_group(save_name):
    """Load grouped cppnc predictions for save_name plus the fixed
    "baseline_cppnc" run, along with the dev claim-id -> text mapping.

    Returns (baseline_cid_grouped, cid_grouped, claim_d).
    """
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)

    def paths_for(name):
        # .info holds metadata, .score holds the prediction scores
        return (os.path.join(out_dir, name + ".info"),
                os.path.join(out_dir, name + ".score"))

    claims: List[Dict] = get_claims_from_ids(list(load_dev_claim_ids()))
    claim_d = {c['cId']: c['text'] for c in claims}

    info_path, pred_path = paths_for(save_name)
    cid_grouped = load_and_group_predictions(info_path, pred_path)

    baseline_info_path, baseline_pred_path = paths_for("baseline_cppnc")
    baseline_cid_grouped = load_and_group_predictions(baseline_info_path,
                                                      baseline_pred_path)
    return baseline_cid_grouped, cid_grouped, claim_d
def main():
    """Build cppnc problems from a prediction file given via CLI args
    (prediction_path, config_path, save_name)."""
    args = parser.parse_args(sys.argv[1:])
    prediction_path = args.prediction_path
    data_id_info: Dict = load_from_pickle("pc_dev_passage_payload_info")
    save_name = args.save_name
    d_ids = list(load_dev_claim_ids())
    dev_claims: List[Dict] = get_claims_from_ids(d_ids)
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("dev"))
    # Fix: the original did json.load(open(...)) and never closed the
    # config file; use a context manager.
    with open(args.config_path, "r") as f:
        config = json.load(f)
    print(config)
    make_cppnc_problem(prediction_path, data_id_info, dev_claims,
                       candidate_perspectives, config, save_name,
                       encode_inner)
def work():
    """Print the train-claim count; the RM job submission below is
    currently disabled.

    NOTE(review): exit() fires before submit_jobs_inner is ever called,
    so everything after it is dead code — looks like debug leftover;
    confirm intent before re-enabling.
    """
    claim_ids, split_name = (load_train_claim_ids(), "train")
    print("Num claims in train : ", len(list(claim_ids)))
    exit()

    def submit_jobs_inner(claim_ids, split_name):
        # Build queries from claims and submit RM3 jobs per split.
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims)
        out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(
            split_name)
        exist_or_mkdir(out_root)
        submit_rm_jobs(queries, out_root)

    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    submit_jobs_inner(claim_ids, split_name)
def start_generate_jobs_for_dev(generator: InstanceGenerator, name_prefix):
    """Launch QCK worker jobs over QK candidates restricted to dev claims."""
    # claim ids split to train/val
    print("Loading data ....")
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    cids = {str(t['cId']) for t in dev_claims}
    qk_candidate: List[QKUnit] = load_qk_candidate_dev()
    # keep only candidates whose query id belongs to a dev claim
    qk_candidate_val = [qk for qk in qk_candidate if qk[0].query_id in cids]
    print("Generate instances : dev")

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunnerS(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
def perspective_lm_correlation():
    """Print LM scores of gold vs non-gold perspectives per dev claim and
    t-test the per-claim average scores of the two groups."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    gold = get_claim_perspective_id_dict()
    predictions = predict_with_lm(dev_claims, 20)
    avg_pos_list = []
    avg_neg_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        pos_list = []
        neg_list = []
        print("Claim {}: ".format(c_Id), claim_text)
        for prediction in prediction_list:
            pid = prediction['pid']
            # gold if the pid appears in any gold cluster
            valid = any(pid in cluster for cluster in gold_pids)
            print("{0} {1:.2f} {2}".format(valid, prediction['lm_score'],
                                           prediction['perspective_text']))
            (pos_list if valid else neg_list).append(prediction['lm_score'])
        # only claims with both gold and non-gold candidates contribute
        if pos_list and neg_list:
            avg_pos_list.append(average(pos_list))
            avg_neg_list.append(average(neg_list))
    diff, p = ttest_ind(avg_pos_list, avg_neg_list)
    print("pos", average(avg_pos_list), "neg", average(avg_neg_list))
    print("pos", avg_pos_list)
    print("neg", avg_neg_list)
    print(diff, p)
def start_generate_jobs_for_dev(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface],
        writer,
        name_prefix):
    """Launch CPPNC worker jobs over dev claims and their passages.

    NOTE(review): another start_generate_jobs_for_dev exists; if both
    definitions share a module the later one wins.
    """
    # claim ids split to train/val
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    entries, all_passages = load_from_pickle("pc_dev_a_passages")
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: p for claim, p in entries
    }
    generator = generator_functor(cid_to_passages)
    print("Generate instances : dev")

    def worker_factory(out_dir):
        return CPPNCWorker(dev_claims, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
def work():
    """Compute per-claim word-probability statistics for the chosen split.

    Saves three pickles:
      - per_claim_class_word_tf_{split}: per-claim (pos_tf, neg_tf) averages
      - pc_pos_word_prob_{split}: normalized positive-leaning word scores
      - pc_neg_word_prob_{split}: normalized negative-leaning word scores
    """
    split = "train"
    assert split in ["train", "dev", "test"]
    tokenizer = PCTokenizer()
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(
        claims, do_balance)
    # group candidates by claim id
    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        # Average relative term frequency per class (label "1" vs "0").
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            # relative term frequency within this perspective text
            tf_rel = {k: v / dl for k, v in tf.items()}
            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            # Mean of the relative-tf dicts, key by key.
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n
            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str, Tuple[Counter, Counter]] = dict_value_map(
        get_frequency_per_class, grouped)
    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        # Scale scores so they sum to 1.
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}
    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())
        # (word, pos_tf - neg_tf) for every word seen in either class
        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))
        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            # Normalize the score component, keeping word pairing intact.
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)
    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
def save_dev_candidate():
    """Pickle all (claim, candidate perspectives) pairs for dev claims."""
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    candidates: List[Tuple[Dict, List[Dict]]] = get_all_candidate(dev_claims)
    save_to_pickle(candidates, "pc_dev_candidate")