def save_to_csv():
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                rows.append([c_text, p_text, y, cid, p_entry['pid']])

        # Use a context manager so the output file is flushed and closed.
        with open(out_path, "w", encoding="utf-8") as f:
            f_out = csv.writer(f, dialect='excel-tab')
            f_out.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))

    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))

    d_ids = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))

def run_bm25_rm():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    top_k = 7
    pred = predict_by_bm25_rm(get_bm25_module(), rm_info, claims, top_k)
    print(evaluate(pred))

def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]], CPPNCGeneratorInterface],
        writer,
        name_prefix):
    # Split claim ids into train/val.
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)

    data = load_from_pickle("pc_train_a_passages")
    entries, all_passages = data
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: p for claim, p in entries
    }
    generator = generator_functor(cid_to_passages)

    print("Generate instances : train")

    def worker_factory(out_dir):
        return CPPNCWorker(train, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 378, name_prefix + "_train", worker_factory)
    runner.start()

    print("Generate instances : val")

    def worker_factory(out_dir):  # intentionally redefined for the val split
        return CPPNCWorker(val, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 162, name_prefix + "_val", worker_factory)
    runner.start()

def get_perspective_candidates(claim_id):
    from arg.perspectives import es_helper
    claims = get_claims_from_ids([claim_id])
    claim_text = claims[0]['text']
    lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
    for _text, _pid, _score in lucene_results:
        yield _text, _pid

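# A minimal usage sketch for get_perspective_candidates: it is a generator
# yielding (perspective_text, perspective_id) pairs. The claim id 1 below is a
# hypothetical example value, not an id known to exist in the dataset.
def demo_perspective_candidates():
    for p_text, p_id in get_perspective_candidates(1):
        print(p_id, p_text)
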
def claim_language_model_property():
    dev_claim_ids = load_dev_claim_ids()
    claims = get_claims_from_ids(dev_claim_ids)
    all_ranked_list = ClaimRankedList()
    all_voca = set()
    candidate_k = 50
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, candidate_k)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        ranked_list = all_ranked_list.get(str(claim['cId']))
        doc_ids = [t[0] for t in ranked_list]
        print("Loading documents")
        preload_tf(doc_ids)
        docs = lmap(load_and_format_doc, doc_ids)
        foreach(lambda doc: all_voca.update(doc['tokens_set']), docs)
        # check hypothesis
        # check_hypothesis(all_voca, cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf,
        #                  clueweb_df, clueweb_tf, ctf_cont, ctf_ncont, df_cont,
        #                  df_ncont, tf_cont, tf_ncont, unigrams)
        print("counting terms stat")
        lm_classifier = build_lm(docs, unigrams)
        for p_entry in perspectives:
            _text, _pid, _score = p_entry
            tokens = nltk.word_tokenize(_text)
            score = sum(lmap(lm_classifier.per_token_odd, tokens))
            print(_text, score)

def main():
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    r = select_paragraph(docs, clue12_13_df, claims, "topk")
    save_to_pickle(r, "dev_claim_paras")

def write(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims, True)
    out_path = os.path.join(output_path,
                            "perspective_{}_claim_query_k0.json".format(split_name))
    save_queries_to_file(queries, out_path)

def start_generate_jobs_for_train_val(generator: InstanceGenerator, name_prefix):
    # Split claim ids into train/val.
    print("Loading data ....")
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)
    train_cids = {str(t['cId']) for t in train}
    val_cids = {str(t['cId']) for t in val}
    qk_candidate: List[QKUnit] = load_qk_candidate_train()

    print("Generate instances : train")
    qk_candidate_train: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in train_cids]
    qk_candidate_val: List[QKUnit] = [qk for qk in qk_candidate if qk[0].query_id in val_cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    runner = JobRunner(job_man_dir, 378, name_prefix + "_train", worker_factory)
    runner.start()

    print("Generate instances : val")

    def worker_factory(out_dir):  # intentionally redefined for the val split
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunner(job_man_dir, 162, name_prefix + "_val", worker_factory)
    runner.start()

def submit_jobs_inner(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims)
    out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(split_name)
    exist_or_mkdir(out_root)
    submit_rm_jobs(queries, out_root)

def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    # split -> claims
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    tokenizer = PCTokenizer()

    def get_candidates(c: Dict) -> Tuple[int, List[Dict]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            rationale = "es_rank={} , es_score={}".format(rank, _score)
            p_entry = {
                'cid': cid,
                'pid': _pid,
                'claim_text': claim_text,
                'perspective_text': _text,
                'p_tokens': tokenizer.tokenize_stem(_text),
                'rationale': rationale,
            }
            candidate_list.append(p_entry)
        return cid, candidate_list

    candidates: List[Tuple[int, List[Dict]]] = lmap(get_candidates, claims)
    return candidates

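# A minimal usage sketch for get_eval_candidates, assuming the "dev" split is
# available: print the top-ranked perspective entry for each claim. The entry
# keys used here (pid, rationale) come from the function above.
def demo_eval_candidates():
    for cid, candidate_list in get_eval_candidates("dev", top_k=5):
        if candidate_list:
            top = candidate_list[0]
            print(cid, top['pid'], top['rationale'])
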
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()

    # Accumulate the initial word probabilities, skipping stopwords.
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v

    # Accumulate the random-walk scores.
    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v

    # Normalize both distributions, then keep the random-walk gain over the
    # initial probability for terms longer than two characters.
    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)
    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_counter[k] = v - acc_counter_prob_init[k]
    return new_counter

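# A minimal usage sketch: inspect which terms gained the most mass from the
# random walk. The return value is a Counter, so most_common applies directly.
# The name_class value "pro" is a hypothetical example, not a value confirmed
# by this file.
def demo_random_walk_gain():
    gain = sum_random_walk_score("pro")
    for term, delta in gain.most_common(20):
        print(term, delta)
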
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    base_p = max(bias_plus_word.values())
    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)
        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            score_for_cid[t] = cnt * base_p
        for t, score in bias_plus_word.items():
            score_for_cid[t] += score
        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")

def run_reweight():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    param = {'k1': 0.5}
    pred = predict_by_reweighter(get_bm25_module(), claims, top_k, param)
    print(evaluate(pred))

def save_dev():
    save_name = "pc_dev_a_passages"
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/dev_claim/q_res_100")
    d_ids = list(load_dev_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    a_relevant_candidate(save_name, q_res_path, claims)

def work():
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    is_train = True
    all_data_points = get_candidates(claims, is_train)
    all_data_points = all_data_points[:10]
    binary_feature_demo(all_data_points)

def write_claim_perspective_pair_as_query():
    split = "dev"
    assert split in ["train", "dev", "test"]
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)
    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50
    write_queries_to_files(n_query_per_file, out_dir, queries)

def run_eval_with_dict(pickle_name):
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 8
    pc_score_d = load_from_pickle(pickle_name)
    pred = predict_from_dict(pc_score_d, claims, top_k)
    print(evaluate(pred))

def get_qck_queries_from_cids(d_ids: List[int]):
    claims: List[Dict] = get_claims_from_ids(d_ids)

    def claim_to_query(claim: Dict):
        return QCKQuery(str(claim['cId']), claim['text'])

    queries: List[QCKQuery] = lmap(claim_to_query, claims)
    return queries

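# A minimal usage sketch: build QCK queries for the dev split. This assumes
# QCKQuery exposes query_id and text attributes (query_id is used that way on
# the query objects elsewhere in this file; text is an assumption here).
def demo_qck_queries():
    dev_ids: List[int] = list(load_dev_claim_ids())
    for q in get_qck_queries_from_cids(dev_ids):
        print(q.query_id, q.text)
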
def get_claim_lms() -> Dict[str, Counter]:
    split = "train"
    qids = list(get_qids_for_split(split_name2, split))
    cids = lmap(int, qids)
    claims = get_claims_from_ids(cids)
    claim_lms = build_gold_lms(claims)
    claim_lms_dict: Dict[str, Counter] = {str(claim_lm.cid): claim_lm.LM
                                          for claim_lm in claim_lms}
    return claim_lms_dict

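# A minimal usage sketch: each value is a Counter-based unigram LM keyed by
# claim id, so the most frequent terms of one claim's LM can be inspected
# directly.
def demo_claim_lms():
    claim_lms = get_claim_lms()
    for cid, lm in claim_lms.items():
        print(cid, lm.most_common(10))
        break
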
def main():
    split = sys.argv[1]
    ids = load_claim_ids_for_split(split)
    claims = get_claims_from_ids(ids)
    for c in claims:
        print("Claim {} :\t{}".format(c['cId'], c['text']))

def write_simple_claim_queries():
    for split in splits:
        claim_ids = load_claim_ids_for_split(split)
        claims = get_claims_from_ids(claim_ids)
        queries = get_simple_claim_query(claims, True)
        out_path = os.path.join(output_path, "perspective_query",
                                "simple_query_{}.json".format(split))
        save_queries_to_file(queries, out_path)

def main():
    args = parser.parse_args(sys.argv[1:])
    save_name = args.save_name
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("train"))
    make_cppnc_dummy_problem(claims, candidate_perspectives, save_name, encode_two_inputs)

def run_eval_with_two_dict():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 7
    pc_score_d = load_from_pickle("pc_bert_baseline_score_d")
    pc_score_d2 = load_from_pickle("pc_random_walk_based_score_d")
    pred = predict_from_two_dict(pc_score_d, pc_score_d2, claims, top_k)
    print(evaluate(pred))

def save_random_walk_pred():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    score_d = prediction_to_dict(pred)
    save_to_pickle(score_d, "pc_random_walk_based_score_d")

def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            # Average per-token log-odds; stopwords contribute zero.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                docs.append(load_doc(q_res[i].doc_id))
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)

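# The passage scoring above averages per-token log-odds between the smoothed
# claim LM and the background LM. Below is a self-contained sketch of that
# formula, assuming smooth() linearly interpolates with weight alpha on the
# claim LM; the counters here are hypothetical toy inputs, not project data.
def log_odds_passage_score_sketch():
    import math
    from collections import Counter

    topic_lm = Counter({'vaccine': 0.3, 'mandate': 0.2})              # toy claim LM
    bg_lm = Counter({'vaccine': 0.01, 'mandate': 0.005, 'the': 0.05})  # toy background LM
    alpha = 0.7

    def token_score(t):
        # log P_smoothed(t | claim) - log P(t | background)
        p_topic = alpha * topic_lm[t] + (1 - alpha) * bg_lm[t]
        if p_topic <= 0 or bg_lm[t] <= 0:
            return 0.0  # token unseen in either LM contributes nothing
        return math.log(p_topic) - math.log(bg_lm[t])

    passage = ['vaccine', 'mandate', 'the']
    print(sum(token_score(t) for t in passage) / len(passage))
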
def run_random_walk_score_with_weight():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(int, q_tf_replace)
    bm25 = get_bm25_module()
    pred = pc_predict_vector_query_and_reweight(bm25, q_tf_replace, claims, top_k,
                                                {'k1': 0.5})
    print(evaluate(pred))

def run_lm():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))  # dead load: was immediately overwritten below
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))

def run_write_claims_as_plain_query():
    for claim_ids, out_name in [
            (load_train_claim_ids(), "train_claim_query_raw.txt"),
            (load_dev_claim_ids(), "dev_claim_query_raw.txt")]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        # Use a context manager so each output file is flushed and closed.
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")

def generate_classification_payload():
    top_k = 50
    claims, val = train_split()
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_train_X")

    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_dev_X")

def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()

    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        # Average log-odds of the claim's own tokens (currently unused).
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")

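# Note: the "pc_train_a_passages" pickle saved here is the same
# (entries, all_passages) pair that start_generate_jobs_for_train_val above
# unpacks to build its cid -> passages mapping.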