def main():
    """Build QK candidates for the evidence queries and pickle them.

    Reads a fixed Galago query-result file, pairs it with all QCK queries,
    and stores the resulting candidates under "pc_evidence_qk".
    """
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/qck/evidence/q_res_10.txt")
    qck_queries = get_qck_queries_all()
    candidate = get_qk_candidate(config1(), q_res_path, qck_queries)
    # BUG FIX: print() does not interpolate "{}" like str.format; the old call
    # printed the literal placeholder followed by the count.
    print("Num candidate : {}".format(len(candidate)))
    save_to_pickle(candidate, "pc_evidence_qk")
def nli_attribution_predict(hparam, nli_setting, data_loader, explain_tag,
                            method_name, data_id, sub_range, model_path):
    """Run gradient-based attribution over NLI inputs and pickle the results.

    Loads plain text for data_id, builds an embedding-in NLI transformer,
    restores weights from model_path, explains predictions with the chosen
    gradient method, and saves translated predictions as
    "pred_{method_name}_{data_id}".

    :param explain_tag: target tag passed through to explain_by_gradient.
    :param method_name: attribution method name (e.g. gradient variant).
    :param sub_range: must be None; partial ranges are not supported.
    :raises Exception: if sub_range is not None.
    """
    enc_payload, plain_payload = data_loader.load_plain_text(data_id)
    if sub_range is not None:
        raise Exception("Sub_range is not supported")
    # Imported lazily: DeepExplain / gradient tooling is only needed here.
    from attribution.gradient import explain_by_gradient
    from attribution.deepexplain.tensorflow import DeepExplain
    sess = init_session()
    with DeepExplain(session=sess, graph=sess.graph) as de:
        # Embedding-in variant exposes embedding tensors for attribution.
        task = transformer_nli_pooled_embedding_in(hparam, nli_setting.vocab_size, False)
        softmax_out = tf.nn.softmax(task.logits, axis=-1)
        sess.run(tf.global_variables_initializer())
        load_model(sess, model_path)
        # Tensor pairs bridging the embedding output back into the model input.
        emb_outputs = task.encoded_embedding_out, task.attention_mask_out
        emb_input = task.encoded_embedding_in, task.attention_mask_in

        def feed_end_input(batch):
            # batch is (input_ids, input_mask, segment_ids) triplet.
            x0, x1, x2 = batch
            return {task.x_list[0]: x0, task.x_list[1]: x1, task.x_list[2]: x2, }

        explains = explain_by_gradient(enc_payload, method_name, explain_tag, sess,
                                       de, feed_end_input, emb_outputs, emb_input,
                                       softmax_out)
        pred_list = predict_translate(explains, data_loader, enc_payload, plain_payload)
        save_to_pickle(pred_list, "pred_{}_{}".format(method_name, data_id))
def main():
    """Compute subword-level document frequencies over a tfrecord directory.

    Walks every tfrecord file under sys.argv[1], extracts the token span
    between the first and second [SEP], converts it to word-as-subtoken
    tuples, and accumulates document frequency and document-length stats.
    Saves the DF counter as "subword_df_robust_train".
    """
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    doc_freq = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for record in tf.compat.v1.python_io.tf_record_iterator(file_path):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            # Document tokens sit between the first and second [SEP].
            first_sep = tokens.index("[SEP]")
            second_sep = tokens.index("[SEP]", first_sep + 1)
            doc_tokens = tokens[first_sep:second_sep]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            doc_len = len(words)
            collection_size += doc_len
            averager.append(doc_len)
            for word in set(words):
                doc_freq[word] += 1
            ticker.tick()
    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(doc_freq, "subword_df_robust_train")
def main(config):
    """Filter each QKUnit's kdp list down to docs scoring above threshold k.

    Scores come from load_qk_score(config); a doc survives when its logit
    score (per config['score_type']) exceeds config['k'].
    """
    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    qk_out_entries: List[QKOutEntry] = load_qk_score(config)
    score_type = config['score_type']
    k = config['k']
    # Per-query sets of doc ids whose score cleared the threshold.
    good_doc_list_d = {q.query_id: set() for q in left(qk_candidate)}
    for entry in qk_out_entries:
        if get_score_from_logit(score_type, entry.logits) > k:
            good_doc_list_d[entry.query.query_id].add(entry.kdp.doc_id)

    stat_count = Counter()

    def filter_map(qk_unit: QKUnit):
        query, kdp_list = qk_unit
        good_docs = good_doc_list_d[query.query_id]
        kept = lfilter(lambda kdp: kdp.doc_id in good_docs, kdp_list)
        print("{} -> {}".format(len(kdp_list), len(kept)))
        if not kept:
            stat_count["no kdp"] += 1
        return query, kept

    new_qk_candidate = lmap(filter_map, qk_candidate)
    print(stat_count)
    save_to_pickle(new_qk_candidate, "robust_on_clueweb_qk_candidate_filtered")
def work():
    """Merge the 122 per-job accuracy counters into one and pickle it as "acc_count"."""
    merged = Counter()
    for job_id in range(122):
        merged.update(load_from_pickle("acc_count_{}".format(job_id)))
    save_to_pickle(merged, "acc_count")
def save_concat_dev():
    """Sum the per-CPID score lists from pc_long_focus_1 predictions and pickle them."""
    prediction_path = pjoin(output_path, "pc_long_focus_1")
    scores: Dict[CPID, List[float]] = collect_pipeline2_score(
        prediction_path, "pc_rel_dev_info_all")
    summed: Dict[CPID, float] = dict_value_map(sum, scores)
    save_to_pickle(summed, "pc_concat_dev_score")
def pc_new_init_prob():
    """Build per-claim initial term probability distributions and pickle them.

    Each claim term gets count * base_p (base_p = max bias score), the global
    bias scores are added on top, and the counter is normalized to sum to 1.
    Saved as "pc_dev_new_init_prob" keyed by claim id.
    """
    dev_ids: List[int] = list(load_dev_claim_ids())
    claim_d = claims_to_dict(get_claims_from_ids(dev_ids))
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()
    # Use the largest bias score as the per-occurrence base probability.
    base_p = max(bias_plus_word.values())
    init_p_score_d = {}
    for cid in dev_ids:
        term_scores = Counter()
        for term, cnt in Counter(tokenizer.tokenize_stem(claim_d[cid])).items():
            term_scores[term] = cnt * base_p
        for term, bias in bias_plus_word.items():
            term_scores[term] += bias
        init_p_score_d[cid] = normalize_counter_to_sum1(term_scores)
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
def a_relevant_candidate(save_name, q_res_path, claims):
    """Collect top-10 passages per claim (unscored) and pickle them.

    Passages are extracted from the top-10 ranked docs of each claim's query
    result; every passage gets a placeholder score of 0. The pickled object
    is (entries, all_passages) where entries pairs each claim with its
    passages.

    :param save_name: pickle name for the output.
    :param q_res_path: Galago ranked-list file.
    :param claims: iterable of claim dicts with 'cId' keys.
    """
    top_n = 10
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    def get_passage_score(_passage):
        # Candidates are collected without scoring; constant placeholder.
        return 0

    all_passages = []
    entries = []
    num_passages = 0
    for c in claims:
        # FIX: removed dead local `claim_text` (assigned but never used).
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        passages: List[Tuple[List[str], float]] = iterate_passages(
            q_res, top_n, get_passage_score)
        num_passages += len(passages)
        all_passages.extend(passages)
        entries.append((c, passages))
    print("{} claims. {} docs ".format(len(claims), num_passages))
    data = entries, all_passages
    save_to_pickle(data, save_name)
def run(args):
    """Predict labels for args.data_name using HPGenEx hyperparams and pickle them."""
    hp = hyperparams.HPGenEx()
    data = load_as_simple_format(args.data_name)
    labels = label_predict(hp, data, args.model_path)
    save_to_pickle(labels, "{}_labels".format(args.data_name))
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    """Encode every (query, ranked doc) pair into a tfrecord file.

    Each record gets a dummy label of 0 and a sequential data_id; the mapping
    data_id -> (query_id, doc_id) is pickled under data_info_save_name so
    predictions can be traced back later.
    """
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0
    data_id_info = {}
    next_data_id = 0
    for query_id_str, doc_entries in ranked_list.items():
        query_str = queries[query_id_str]['query']
        for entry in doc_entries:
            data_id = next_data_id
            next_data_id += 1
            data_id_info[data_id] = (query_id_str, entry.doc_id)
            tokens, segment_ids = encode_query_and_text(
                tokenizer, query_str, text_reader(entry.doc_id), max_seq_length)
            features = get_basic_input_feature(tokenizer, max_seq_length,
                                               tokens, segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)
    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
def write_topic_sentence_as_query():
    """Build BM25-UKP doc queries from topic+sentence pairs; pickle the id map.

    Iterates every UKP data point (train and validation), builds a DocQuery
    from its topic and sentence tokens, and records the mapping
    dp_id -> "{qid}_{query_collection_id}". Only the mapping is pickled here
    ("ukp_10_dp_id_to_q_res_id"); building the queries is what populates it.
    """
    query_collection_id = Q_CONFIG_ID_BM25_UKP
    dp_id_to_q_res_id = {}

    def dp_to_query(dp: UkpDataPoint) -> DocQuery:
        topic_tokens = clean_tokenize_str_to_tokens(dp.topic)
        sent_tokens = clean_tokenize_str_to_tokens(dp.sentence)
        qid = str(dp.id)
        # Side effect: record where this data point's query result will live.
        dp_id_to_q_res_id[str(dp.id)] = "{}_{}".format(qid, query_collection_id)
        return format_query(topic_tokens, sent_tokens, qid, 3)

    train_data, val_data = load_all_data()

    def all_data_iterator() -> Iterator[UkpDataPoint]:
        for data_list in chain(train_data.values(), val_data.values()):
            for dp in data_list:
                yield dp

    # The lmap drives dp_to_query's side effect of filling dp_id_to_q_res_id.
    all_queries: List[DocQuery] = lmap(dp_to_query, all_data_iterator())
    out_dir = get_query_dir(query_collection_id)
    exist_or_mkdir(out_dir)
    # FIX: removed dead local `n_query_per_file = 50` (never used).
    save_to_pickle(dp_id_to_q_res_id, "ukp_10_dp_id_to_q_res_id")
def majority(build_lm_from_tokens_list, save_name):
    """Accumulate term frequencies per majority document stance and pickle them.

    For each document, sentence-level predictions are softmaxed; a sentence
    votes stance 1 when probs[1] > 0.5, stance 2 when probs[2] > 0.5. The
    document's majority stance selects which counter receives its term
    frequencies (ties / no votes are skipped). Saves (tf0, tf1, tf2) and
    displays the favor/against counters.
    """
    tf_by_stance = [Counter(), Counter(), Counter()]
    for doc, preds in enum_docs_and_stance():
        assert len(preds) == len(doc)
        n_favor = 0
        n_against = 0
        for _sent, pred in zip(doc, preds):
            probs = softmax(pred)
            if probs[1] > 0.5:
                n_favor += 1
            elif probs[2] > 0.5:
                n_against += 1
        if n_favor > n_against:
            stance = 1
        elif n_against > n_favor:
            stance = 2
        else:
            stance = 0
        if stance > 0:
            tf_by_stance[stance].update(build_lm_from_tokens_list(doc))
    tf0, tf1, tf2 = tf_by_stance
    result = tf0, tf1, tf2
    save_to_pickle(result, save_name)
    display(tf1, tf2, "favor", "against")
def main():
    """Launch the robust_qck_10 job: generate QCK instances judged against qrels."""
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # A candidate doc is correct iff qrels mark it relevant (> 0).
        qid = query.query_id
        if qid not in judgement:
            return 0
        per_query = judgement[qid]
        doc_id = candidate.id
        return 1 if doc_id in per_query and per_query[doc_id] > 0 else 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")
    candidate_dict = load_cache("candidate_for_robust_qck_7")
    if candidate_dict is None:
        candidate_dict: \
            Dict[str, List[QCKCandidateI]] = get_candidate_all_passage_w_samping()
        save_to_pickle(candidate_dict, "candidate_for_robust_qck_7")
    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    num_jobs = 250
    job_name = "robust_qck_10"
    runner = JobRunner(job_man_dir, num_jobs, job_name, worker_factory)
    runner.start()
def predict_for_view(hparam, nli_setting, data_loader, data_id, model_path,
                     run_name, modeling_option, tags):
    """Run NLI + explanation prediction over a dataset and pickle per-example outputs.

    Restores a pooled NLI transformer plus an ExplainPredictor head from
    model_path, runs batched inference, and saves tuples of
    (input_ids, logits, selected explain logits) under
    "save_view_{run_name}_{data_id}".

    :param tags: explanation tags; only len(tags) is used (output width).
    """
    print("predict_nli_ex")
    print("Modeling option: ", modeling_option)
    # plain_payload is unused here; only the encoded payload feeds the model.
    enc_payload, plain_payload = data_loader.load_plain_text(data_id)
    batches = get_batches_ex(enc_payload, hparam.batch_size, 3)
    task = transformer_nli_pooled(hparam, nli_setting.vocab_size)
    explain_predictor = ExplainPredictor(len(tags),
                                         task.model.get_sequence_output(),
                                         modeling_option)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    load_model(sess, model_path)
    out_entries = []
    for batch in batches:
        # batch is (input_ids, input_mask, segment_ids).
        x0, x1, x2 = batch
        logits, ex_logits, = sess.run(
            [task.logits, explain_predictor.get_score()],
            feed_dict={
                task.x_list[0]: x0,
                task.x_list[1]: x1,
                task.x_list[2]: x2,
            })
        for i in range(len(x0)):
            # Keep raw input ids alongside logits and the i-th explain scores.
            e = x0[i], logits[i], tuple_list_select(ex_logits, i)
            out_entries.append(e)
    save_to_pickle(out_entries, "save_view_{}_{}".format(run_name, data_id))
def main():
    """Select paragraphs from dev-claim documents using ClueWeb12-B13 DF stats."""
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    claim_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    selected = select_paragraph(docs, clue12_13_df, claims, "topk")
    save_to_pickle(selected, "dev_claim_paras")
def do_predict(
        bert_hp,
        train_config,
        data,
        lms_config,
        modeling_option,
        init_fn,
):
    """Run LMS prediction over train batches in 100 chunks of 100 and pickle each.

    Chunk i covers train_batches[i*100:(i+1)*100]; its outputs are pickled to
    a file named str(i) inside the "lms_scores" output directory.
    """
    train_batches, dev_batches = data
    lms_model = LMSModel(modeling_option, bert_hp, lms_config,
                         train_config.num_gpu)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)
    step_size = 100
    for chunk_idx in range(100):
        st = chunk_idx * step_size
        ed = st + step_size
        tprint(st, ed)
        output_d = predict_fn(sess,
                              train_batches[st:ed],
                              lms_model.logits,
                              lms_model.loss_tensor,
                              lms_model.ex_score_tensor,
                              lms_model.per_layer_logit_tensor,
                              lms_model.batch2feed_dict)
        save_to_pickle(output_d, at_output_dir("lms_scores", str(chunk_idx)))
def main(config):
    """Filter QK units against a ranked-list file and pickle the survivors."""
    filtered = filter_with_ranked_list_path(config['qk_name'],
                                            config['ranked_list_path'],
                                            config['threshold'],
                                            config['top_k'])
    save_to_pickle(filtered, config['save_name'])
def main():
    """Filter train-split QK candidates by relevance against query LMs (top 50)."""
    split = "train"
    candidates = get_qk_candidate(split)
    lms: Dict[str, Counter] = get_query_lms(split)
    print(len(candidates), len(lms))
    filtered = filter_qk_rel(candidates, lms, 50)
    save_to_pickle(filtered, "pc_qk2_filtered_rel_{}".format(split))
def save_for_train():
    """Join pc_rel predictions with CPID info and pickle as "pc_rel_with_cpid"."""
    info = load_from_pickle("pc_rel_info_all")
    rel_info: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = \
        combine_pc_rel_with_cpid(pjoin(output_path, "pc_rel"), info)
    save_to_pickle(rel_info, "pc_rel_with_cpid")
def main_hp09():
    """Filter train-split QK candidates with smoothing alpha = 0.9 and pickle."""
    split = "train"
    candidates = get_qk_candidate(split)
    lms: Dict[str, Counter] = get_query_lms(split)
    print(len(candidates), len(lms))
    alpha = 0.9
    filtered = filter_qk(candidates, lms, alpha)
    save_to_pickle(filtered, "pc_qk2_09_filtered_{}".format(split))
def gen_overlap(config):
    """Generate QK candidates from ranked-list and doc-score paths, then pickle."""
    candidate: List[QKUnit] = qk_candidate_gen(config['q_res_path'],
                                               config['doc_score_path'],
                                               config['split'],
                                               config2())
    save_to_pickle(candidate, config['save_name'])
def main():
    """Encode dev claim-passage payloads to tfrecord and pickle the data-id map."""
    raw_payload: List[ClaimPassages] = load_dev_payload()
    save_path = os.path.join(output_path, "pc_dev_passage_payload")
    encode = get_encode_fn(512)
    id_manager = DataIDManager()
    insts = list(generate_instances(raw_payload, id_manager))
    write_records_w_encode_fn(save_path, encode, insts, len(insts))
    save_to_pickle(id_manager.id_to_info, "pc_dev_passage_payload_info")
def save_to_cache():
    """Pickle the base (candidate_dict, correct_d) resources for every split."""
    for split in splits:
        job_name = "argu_qck_datagen_{}".format(split)
        candidate_dict, correct_d = load_base_resource(
            EvalCondition.EntirePortalCounters, split)
        save_to_pickle((candidate_dict, correct_d), job_name + "_base_resource")
def main():
    """Build per-split QK candidates from ClueWeb ranked lists and pickle them."""
    for split in splits:
        q_res_path = os.path.join(output_path, "perspective_experiments",
                                  "clueweb_qres", "{}.txt".format(split))
        qck_queries = get_qck_queries(split)
        candidate = get_qk_candidate(config1(), q_res_path, qck_queries)
        # BUG FIX: print() does not interpolate "{}" like str.format; the old
        # call printed the literal placeholder followed by the count.
        print("Num candidate : {}".format(len(candidate)))
        save_to_pickle(candidate, "pc_qk2_{}".format(split))
def main():
    """Filter the train-split subset of all QK units by claim-LM relevance."""
    split = "train"
    all_qk = load_all_qk()
    qids = list(get_qids_for_split(split_name2, split))
    split_qks = [qk for qk in all_qk if qk[0].query_id in qids]
    claim_lms: Dict[str, Counter] = get_claim_lms()
    print(len(split_qks), len(claim_lms))
    filtered = filter_qk_rel(split_qks, claim_lms, 50)
    save_to_pickle(filtered, "pc_qk3_filtered_rel_{}".format(split))
def select_word_from_dev():
    """Select common words from the NLI dev TF counter and pickle them."""
    tokenizer = get_tokenizer()
    tf_dev = load_from_pickle("nli_tf_dev_mis")
    selected = select_common(tf_dev, tokenizer)
    # Show the 100th most common entry as a sanity check on the cutoff.
    print(list(tf_dev.most_common(100))[-1])
    save_to_pickle(selected, "nli_dev_selected_words")
def work(self, job_id):
    """Tokenize one jsonl shard into a payload saver and pickle it under the shard's basename.

    :param job_id: index substituted into self.jsonl_path_format to locate the shard.
    """
    jsonl_path = self.jsonl_path_format.format(job_id)
    buffered_saver = datastore.tool.PayloadSaver()
    # BUG FIX: the file handle was previously opened and never closed
    # (resource leak); a context manager guarantees closure.
    with open(jsonl_path, "r") as line_itr:
        payload_saver = process_jsonl(line_itr, self.tokenize_fn, buffered_saver)
    save_name = os.path.basename(jsonl_path)
    save_to_pickle(payload_saver, save_name)
def get_cpid_score_from_cache_or_raw(pred_path, cpid_resolute, strategy):
    """Return CPID scores for a prediction file, computing and caching on a miss.

    The cache key combines the prediction file's basename with the strategy.
    """
    cache_name = os.path.basename(pred_path) + "_" + strategy
    scores = load_cache(cache_name)
    if scores is not None:
        return scores
    scores = get_cpid_score(pred_path, cpid_resolute, strategy)
    save_to_pickle(scores, cache_name)
    return scores
def main():
    """Summarize cppnc scores from info/prediction files and pickle as "score_d".

    argv[1] names the run (prediction file "<run>.score"); argv[2] names the
    info file, both under output_path/cppnc.
    """
    run_name = sys.argv[1]
    cppnc_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(cppnc_dir)
    info_path = os.path.join(cppnc_dir, sys.argv[2])
    pred_path = os.path.join(cppnc_dir, run_name + ".score")
    save_to_pickle(summarize_score(info_path, pred_path), "score_d")
    print("Saved as 'score_d'")
def a_relevant(save_name, q_res_path, claims):
    """Score top-10 passages per claim by claim-LM log-odds and pickle the results.

    Builds a per-claim language model and a background LM over all claims,
    scores each passage token by the smoothed log-odds of the claim LM over
    the background LM (stopwords score 0), and averages over the passage.
    Pickles (entries, all_passages) under save_name, where entries pairs each
    claim with its scored passages.

    :param save_name: pickle name for the output.
    :param q_res_path: Galago ranked-list file.
    :param claims: iterable of claim dicts with 'cId' keys.
    """
    top_n = 10
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5  # smoothing weight between claim LM and background LM
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        # FIX: removed dead code that tokenized the claim text and built a
        # `scores` list of claim-token log-odds which was never read.

        def get_passage_score(p):
            # Average log-odds over passage tokens; stopwords contribute 0.
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1
        all_passages.extend(passages)
        entries.append((c, passages))
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)