def augment(short_records, long_records, target_len, save_dir, start_record_idx=0):
    exist_or_mkdir(save_dir)
    record_idx = start_record_idx
    print("record_idx", record_idx)

    def get_next_writer():
        return RecordWriterWrap(os.path.join(save_dir, str(record_idx)))

    writer = get_next_writer()
    cnt = 0
    while cnt < target_len:
        first_inst = next(short_records)
        second_inst = next(long_records)
        first_inst = feature_to_ordered_dict(first_inst)
        first_inst["next_sentence_labels"] = create_int_feature([1])
        second_inst = feature_to_ordered_dict(second_inst)
        second_inst["next_sentence_labels"] = create_int_feature([1])
        writer.write_feature(first_inst)
        writer.write_feature(second_inst)
        cnt += 2  # was commented out; without it the loop ignores target_len and only stops on StopIteration
        if writer.total_written >= 100000:
            # Roll over to a new shard every 100,000 written features.
            record_idx += 1
            print("Wrote {} data".format(cnt))
            writer.close()
            writer = get_next_writer()
    writer.close()  # close the final, partially filled shard (the original returned without closing it)

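# Usage sketch for augment() above: a hedged illustration, not taken from the source.
# It assumes short_iter / long_iter are iterators over parsed tf.train.Example-style
# feature objects (whatever feature_to_ordered_dict expects), and the path below is
# a placeholder.
def augment_example_usage(short_iter, long_iter):
    augment(short_iter, long_iter,
            target_len=1000000,           # total instances to emit across all shards
            save_dir="/tmp/augmented",    # placeholder output directory
            start_record_idx=0)
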
def simple():
    tf_logging.setLevel(logging.INFO)
    out_path = os.path.join(working_path, "dict_reader3")
    exist_or_mkdir(out_path)
    worker = DGenWorker(out_path)
    worker.gen.f_hide_word = False
    worker.work(1)

def split_train_to_tdev():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    file = data_loader.train_file
    dir_path = os.path.join(output_path, "nli_tfrecord_t_{}".format(sequence_length))
    exist_or_mkdir(dir_path)
    itr = data_loader.example_generator(file)
    all_inst = []
    for e in itr:
        f = entry_to_feature_dict(e)
        all_inst.append(f)
    random.shuffle(all_inst)
    tdev_size = 9000
    train_t = all_inst[:-tdev_size]
    dev_t = all_inst[-tdev_size:]
    assert len(train_t) + len(dev_t) == len(all_inst)

    def save(name, data):
        output_file = os.path.join(dir_path, name)
        writer = write_features_to_file(data, output_file)
        print("%s: Wrote %d total instances" % (name, writer.total_written))

    save("train_t", train_t)
    save("dev_t", dev_t)

def add_jobs(sh_format_path, model_sub_path, save_dir, job_group_name, job_list):
    save_path_list = []
    exist_or_mkdir(save_dir)
    job_id_list = []
    job_info_list: List[Dict] = []
    for i in job_list:
        save_path = os.path.join(save_dir, str(i))
        run_name = "{}-{}".format(job_group_name, i)
        d = {
            "$model_subpath": model_sub_path,
            "$run_name": run_name,
            "$i": str(i),
            "$save_path": save_path,
        }
        job_id = run_job(sh_format_path, d)
        job_id_list.append(job_id)
        save_path_list.append(save_path)
        job_info = {
            'job_id': job_id,
            'save_path': save_path,
            'data_no': i,
        }
        job_info_list.append(job_info)
    return job_info_list

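# Usage sketch for add_jobs() above: a hedged illustration with placeholder arguments
# (the template path, model sub-path, and directories are assumptions, not from the source).
def add_jobs_example_usage():
    job_infos = add_jobs(sh_format_path="run_template.sh",
                         model_sub_path="model_best/model.ckpt-10000",
                         save_dir="/tmp/job_scores",
                         job_group_name="eval",
                         job_list=list(range(10)))
    for info in job_infos:
        print(info['job_id'], info['save_path'], info['data_no'])
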
def work():
    ranked_list_save_root = get_ranked_list_save_dir(Q_CONFIG_ID_BM25_UKP)
    exist_or_mkdir(ranked_list_save_root)
    query_files = get_all_query_file_names(Q_CONFIG_ID_BM25_UKP)
    query_to_all_clueweb_disk.send(query_files,
                                   index_name_list[:1],
                                   "ukp_{}".format(Q_CONFIG_ID_BM25_UKP),
                                   ranked_list_save_root)

def make_cppnc_dummy_problem(claims: List[Dict],
                             candidate_perspectives,
                             save_name: str,
                             encode_inner_fn) -> None:
    empty_passage = {'passage': []}

    def get_payload() -> Iterable[Tuple[int, int, List[Dict]]]:
        for cid, candidates in candidate_perspectives.items():
            for candi in candidates:
                yield cid, candi['pid'], [empty_passage]

    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()
    payloads: Iterable[PayloadAsTokens] = put_texts(get_payload(), claims, tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)

def __init__(self):
    print("AAA")
    logger.debug("Executer init")
    # Save/log current jobs so that the executer can restart.
    self.task_info_dir = os.path.join(root_dir, "task_info")
    self.root_info_dir = os.path.join(root_dir, "root_info")
    exist_or_mkdir(self.task_info_dir)
    exist_or_mkdir(self.root_info_dir)
    # Load task info for all active / queued tasks.
    self.active_task_list = TaskList(
        os.path.join(self.root_info_dir, "active_task.json"), self.task_info_dir)
    self.queued_task_list = TaskList(
        os.path.join(self.root_info_dir, "queued_task.json"), self.task_info_dir)
    self.info_dict = JsonTiedDict(os.path.join(self.root_info_dir, "info.json"))
    tpu_info_path = os.path.join(self.root_info_dir, "tpu_info.json")
    self.tpu_resource = ResourceList(tpu_info_path, ["v2-tf2", "v2-tf2-2"])
    self.current_task_handles = {}  # task_id -> process object
    # A task_id being in current_task_handles does NOT imply the task is active; handles are never deleted.
    self.task_cache = {}  # task_id -> TaskObj
    self._init_info()

def gen_tfrecord_w_tdev():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_tdev_{}".format(max_sequence))
    exist_or_mkdir(dir_path)
    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt", "only_topic_word"))
        todo = [("dev", data_loader.get_dev_data())]
        train_data = list(data_loader.get_train_data())
        random.shuffle(train_data)
        validation_size = int(len(train_data) * 0.1)
        train_train_data = train_data[:-validation_size]
        train_dev_data = train_data[-validation_size:]  # take the held-out tail; the original [validation_size:] overlapped with train_train_data
        todo.append(("ttrain", train_train_data))
        todo.append(("tdev", train_dev_data))
        for name, data in todo[::-1]:
            features = lmap(entry_to_feature_dict, data)
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)
            write_features_to_file(features, out_path)

def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]), ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path, "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)
    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)
        if name == "dev":
            # Pad the dev split with copies of the last example, marked as not real,
            # so the record count is a multiple of the batch size.
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)
        writer.close()
        print("Wrote %d total instances" % writer.total_written)

def write_topic_sentence_as_query():
    query_collection_id = Q_CONFIG_ID_BM25_UKP
    dp_id_to_q_res_id = {}

    def dp_to_query(dp: UkpDataPoint) -> DocQuery:
        topic_tokens = clean_tokenize_str_to_tokens(dp.topic)
        sent_tokens = clean_tokenize_str_to_tokens(dp.sentence)
        qid = str(dp.id)
        dp_id_to_q_res_id[str(dp.id)] = "{}_{}".format(qid, query_collection_id)
        return format_query(topic_tokens, sent_tokens, qid, 3)

    train_data, val_data = load_all_data()

    def all_data_iterator() -> Iterator[UkpDataPoint]:
        for data_list in chain(train_data.values(), val_data.values()):
            for dp in data_list:
                yield dp

    all_queries: List[DocQuery] = lmap(dp_to_query, all_data_iterator())
    out_dir = get_query_dir(query_collection_id)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50
    # Assumed: the queries are meant to be written out here, mirroring
    # write_claim_perspective_pair_as_query; the original never used all_queries,
    # out_dir, or n_query_per_file after computing them.
    write_queries_to_files(n_query_per_file, out_dir, all_queries)
    save_to_pickle(dp_id_to_q_res_id, "ukp_10_dp_id_to_q_res_id")

def gen_pairwise():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_pairwise_{}".format(max_sequence))
    exist_or_mkdir(dir_path)
    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt", "only_topic_word"))
        todo = [("train", data_loader.get_train_data()), ("dev", data_loader.get_dev_data())]
        for name, data in todo[::-1]:
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)
            grouped = [[], [], []]
            for e in data:
                input_ids, input_mask, segment_ids, label = e
                grouped[label].append(e)
            combs = []
            combs.extend(generate_pairwise_combinations(grouped[0], grouped[1]))
            combs.extend(generate_pairwise_combinations(grouped[1], grouped[2]))
            combs.extend(generate_pairwise_combinations(grouped[2], grouped[0]))
            features = lmap(pairwise_entry_to_feature_dict, combs)
            write_features_to_file(features, out_path)

def write_claim_perspective_pair_as_query():
    split = "dev"
    assert split in ["train", "dev", "test"]
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)
    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50
    write_queries_to_files(n_query_per_file, out_dir, queries)

def run(in_dir_path, out_dir_path, keyword):
    exist_or_mkdir(out_dir_path)
    tokenizer = get_tokenizer()
    ids = tokenizer.convert_tokens_to_ids([keyword])
    assert len(ids) == 1
    id_keyword = ids[0]

    def condition_fn(features):
        return id_keyword in take(features['input_ids'])

    inst_cnt = 0

    def debug_call_back(features):
        nonlocal inst_cnt
        if inst_cnt < 4:
            input_tokens = tokenizer.convert_ids_to_tokens(take(features['input_ids']))
            print(pretty_tokens(input_tokens))
        inst_cnt += 1

    for file_path in get_dir_files(in_dir_path):
        inst_cnt = 0
        name = os.path.basename(file_path)
        out_path = os.path.join(out_dir_path, name)
        do_filtering(file_path, out_path, condition_fn)

def run(args):
    data_name = args.data_name
    method_name = args.method_name
    score_name = "{}_{}".format(data_name, method_name)
    config = DropStop
    # The original wrapped the remainder in a bare "try: ... except: raise",
    # which only re-raises; it is dropped here since behavior is identical.
    save_name = "{}_{}.txt".format(score_name, config.name)
    save_dir = os.path.join(output_path, "genex", "runs")
    exist_or_mkdir(os.path.join(output_path, "genex"))
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, save_name)
    data: List[PackedInstance] = load_packed(data_name)
    if method_name == "random":
        config = RandomConfig
        scores: List[np.array] = [np.random.random([512])] * len(data)
    else:
        scores: List[np.array] = load_from_pickle(score_name)
    if "term_" in method_name:
        save_score_to_file_term_level(data, config, save_path, scores)
    else:
        save_score_to_file(data, config, save_path, scores)

def submit_jobs_inner(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims)
    out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(split_name)
    exist_or_mkdir(out_root)
    submit_rm_jobs(queries, out_root)

def make_cppnc_problem(passage_score_path: FilePath,
                       data_id_to_info: Dict,
                       claims: List[Dict],
                       candidate_perspectives,
                       config,
                       save_name: str,
                       encode_inner_fn) -> None:
    output: List[Tuple[int, List[Dict]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)
    joined_payloads: List = list(join_perspective(output, candidate_perspectives))
    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()
    payloads: Iterable[PayloadAsTokens] = put_texts(joined_payloads, claims, tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)

def generate_qc3():
    is_correct_fn = get_is_correct_fn()
    save_dir = os.path.join(output_path, "pc_evidence_qc3")
    exist_or_mkdir(save_dir)
    for split in splits:
        candidate_dict: Dict[str, List[QCKCandidateI]] = get_candidate(split)
        do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split)

def __init__(self, max_seq_length, out_dir):
    self.query_group: List[List[QueryID]] = load_query_group("train")
    self.seg_resource_loader = SegmentResourceLoader(job_man_dir, "train")
    self.max_seq_length = max_seq_length
    self.out_dir = out_dir
    self.info_dir = self.out_dir + "_info"
    exist_or_mkdir(self.info_dir)

def init_worker():
    out_path1 = os.path.join(working_path, "entry_prediction_tf")
    out_path2 = os.path.join(working_path, "entry_prediction_n")
    exist_or_mkdir(out_path1)
    exist_or_mkdir(out_path2)
    worker = Worker(out_path1, out_path2)
    return worker

def main():
    save_dir = os.path.join(output_path, "pc_qc")
    exist_or_mkdir(save_dir)
    for split in splits:
        queries = get_qck_queries(split)
        eval_candidate = get_eval_candidates_as_qck(split)
        save_path = os.path.join(save_dir, split)
        make_pc_qc(queries, eval_candidate, is_correct_factory(), save_path)

def main():
    exist_or_mkdir(os.path.join(output_path, "aawd_tfrecord"))
    train, dev, test = load_aawd_splits()
    todo = [(train, "train"), (dev, "dev"), (test, "test")]
    encode_fn = get_encode_fn(256)
    for data, split in todo:
        save_path = at_output_dir("aawd_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)

def main():
    job_name = "MMD_train_single_seg"
    out_dir = os.path.join(job_man_dir, job_name)
    exist_or_mkdir(out_dir)
    worker = SingleSegTrainGen(512, out_dir)
    for job_id in range(178, train_query_group_len):
        print("job_id:", job_id)
        worker.work(job_id)

def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, sys.argv[2])
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    score_d = summarize_score(info_file_path, pred_file_path)
    save_to_pickle(score_d, "score_d")
    print("Saved as 'score_d'")

def do_nli():
    model_1_path = "gs://clovertpu/training/model/nli_bert_freeze_D/model.ckpt-73615"
    model_2_path = "gs://clover_eu4/model/alt_emb_L/model.ckpt-20000"
    save_dir = os.path.join(output_path, "nli_from_alt_emb_L")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clover_eu4/model/nli_from_alt_emb_L"
    upload_to_gs(save_path, upload_gs_dir)

def save_per_cid():
    print("Loading scores...")
    cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score_wrap()
    save_root = os.path.join(output_path, "cppnc", "cid_grouped")
    exist_or_mkdir(save_root)
    for cid, entries in cid_grouped.items():
        save_path = os.path.join(save_root, cid)
        pickle.dump(entries, open(save_path, "wb"))

def ukp():
    model_1_path = "gs://clovertpu/training/model/ukp_runs/ukp_8_bert_/model.ckpt-1939"
    model_2_path = "gs://clover_eu4/model/alt_emb_O_ukp/model.ckpt-20000"
    save_dir = os.path.join(output_path, "ukp_from_alt_emb_O")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clovertpu/training/model/ukp_from_alt_emb_O"
    upload_to_gs(save_path, upload_gs_dir)

def ehealth_K():
    model_1_path = "gs://clovertpu/training/model/ehealth_bert_freeze/model.ckpt-10000"
    model_2_path = "gs://clover_eu4/model/alt_emb_K/model.ckpt-20000"
    save_dir = os.path.join(output_path, "ehealth_K")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clover_eu4/model/ehealth_combined_K"
    upload_to_gs(save_path, upload_gs_dir)

def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, save_name + ".info")
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    score_d = summarize_score(info_file_path, pred_file_path)
    map_score = eval_map("dev", score_d, False)
    print(map_score)

def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)
    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)

def generate_qc_bert4():
    is_correct_fn = get_is_correct_fn()
    save_dir = os.path.join(output_path, "pc_evidence_qc4")
    exist_or_mkdir(save_dir)
    for split in splits:
        candidate_dict: Dict[str, List[QCKCandidateI]] = get_ex_candidate_for_training(split, False)
        do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split)