def qck_gen(job_name, qk_candidate_name, candidate_ranked_list_path,
            kdp_ranked_list_path, split):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    kdp_ranked_list: Dict[str, List[TrecRankedListEntry]] = \
        load_ranked_list_grouped(kdp_ranked_list_path)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstGenWScore(
        get_qck_candidate_from_ranked_list_path(candidate_ranked_list_path),
        is_correct_factory(),
        kdp_ranked_list)
    # Keep only the QK units whose query belongs to this split.
    qk_candidate_train: List[QKUnit] = \
        [qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
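# Hedged usage sketch, not from the original source: one way qck_gen above
# might be invoked. The job name, pickle name, paths, and split value below
# are placeholders, not values taken from the repository.
if __name__ == "__main__":
    qck_gen("qck_w_score",                        # job_name (hypothetical)
            "qk_candidate_train",                 # pickled List[QKUnit] (hypothetical)
            "path/to/candidate_ranked_list.txt",  # candidate ranked list (hypothetical)
            "path/to/kdp_ranked_list.txt",        # KDP ranked list (hypothetical)
            "train")                              # split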
def main():
    working_path = "d:\\job_dir"
    max_job = 3
    job_name = "test_job"
    job_runner = JobRunnerS(working_path, max_job, job_name, DummyWorker)
    job_id = job_runner.pool_job()
    print(job_id)
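# Hedged sketch, not from the original source: a minimal worker compatible
# with the factory pattern used throughout these scripts. It assumes, based
# on the surrounding code, that JobRunnerS constructs the worker with an
# output directory and invokes a work(job_id) method once per job; the class
# name and output format here are illustrative.
import os

class EchoWorker:
    def __init__(self, out_dir):
        self.out_dir = out_dir

    def work(self, job_id):
        # Mirror the convention used elsewhere: one output file per job id.
        with open(os.path.join(self.out_dir, str(job_id)), "w") as out_f:
            out_f.write("job {} done\n".format(job_id))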
def run_tokenize_jobs_for_prediction_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return TokenizeDocTitleBodyWorker(split, query_group, candidate_docs,
                                          out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_title_body_tokens".format(split), factory)
    runner.start()
def run_tokenize_jobs_for_train_split(split):
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_list_10(split)

    def factory(out_dir):
        return SentLevelTokenizeWorker(split, query_group, candidate_docs,
                                       out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_sent_tokens".format(split), factory)
    runner.start()
def run_tokenize_jobs_for_pred_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)
    max_sent_length = 64 * 5
    max_title_length = 64 * 5

    def factory(out_dir):
        return MultipleTokenizeWorker(split, query_group, candidate_docs,
                                      max_sent_length, max_title_length,
                                      out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_multiple_tokenize".format(split), factory)
    runner.start()
def qck_gen_w_ranked_list(job_name,
                          qk_candidates: List[QKUnit],
                          qck_candidates_dict: Dict[str, List[QCKCandidate]],
                          split):
    qids = list(get_qids_for_split(split_name2, split))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(qck_candidates_dict, is_correct_factory())
    qk_candidates_for_split: List[QKUnit] = \
        [qk for qk in qk_candidates if qk[0].query_id in qids]
    print("{} of {} qk are used".format(len(qk_candidates_for_split),
                                        len(qk_candidates)))

    def worker_factory(out_dir):
        return QCKWorker(qk_candidates_for_split, generator, out_dir)

    num_jobs = len(qids)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
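# Hedged usage sketch, not from the original source: qck_gen_w_ranked_list
# expects pre-loaded QK units and a candidate dict. The pickle name and the
# ranked-list path below are placeholders.
if __name__ == "__main__":
    qk_candidates: List[QKUnit] = load_from_pickle("qk_candidate_dev")
    qck_candidates_dict = get_qck_candidate_from_ranked_list_path(
        "path/to/ranked_list.txt")
    qck_gen_w_ranked_list("qck_rl", qk_candidates, qck_candidates_dict, "dev")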
def start_generate_jobs_for_dev(generator: InstanceGenerator, name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_qk_candidate_dev()
    qk_candidate_val = [qk for qk in qk_candidate if qk[0].query_id in cids]
    print("Generate instances : dev")

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunnerS(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
def main():
    is_correct_fn = get_is_correct_fn()
    split = "train"
    qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
    tprint("Loading candidates..")
    candidate_dict = load_bal_candidate(split)
    tprint("{} dict keys".format(len(candidate_dict)))
    tprint("Initializing generator..")
    generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
    num_jobs = d_n_pc_per_split[split]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    job_name = "pc_evi_qck2_s_{}".format(split)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
    runner.start()
def multi_qck_gen(job_name, qk_candidate_name, ranked_list_path, split,
                  k_group_size):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKGeneratorGrouped(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory(),
        False,
        k_group_size)
    qk_candidate_train: List[QKUnit] = \
        [qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
def main():
    is_correct_fn = get_is_correct_fn()
    for split in splits[:2]:
        qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
        qk_candidate = sample_kdps(qk_candidate)
        tprint("Loading candidates..")
        candidate_dict = load_bal_candidate(split)
        tprint("{} dict keys".format(len(candidate_dict)))
        tprint("Initializing generator..")
        generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
        n_qk_per_job = 10
        num_jobs = ceil_divide(d_n_pc_per_split[split], n_qk_per_job)

        def worker_factory(out_dir):
            return QCKWorkerMultiple(qk_candidate, generator, n_qk_per_job,
                                     out_dir)

        job_name = "pc_evi_qck3_{}".format(split)
        runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
        runner.start()
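# Hedged sketch, not from the original source: ceil_divide is imported from
# elsewhere in the repository. From its use above it is presumably integer
# ceiling division, i.e. the number of jobs needed to cover all items at
# n_qk_per_job each; a minimal equivalent would be:
def ceil_divide_sketch(numerator, denominator):
    # ceil_divide_sketch(25, 10) == 3: 25 QK units at 10 per job -> 3 jobs
    return (numerator + denominator - 1) // denominator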
def start_generate_jobs(generator: InstanceGenerator, subsplit,
                        qk_candidate_name, name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    claims = load_claims_for_sub_split(subsplit)
    valid_cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    qk_candidate_val = [qk for qk in qk_candidate
                        if qk[0].query_id in valid_cids]
    print("Generate instances :")
    print("split: ", subsplit)
    print("qk_candidate_name: ", qk_candidate_name)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    num_job = d_n_claims_per_subsplit[subsplit]
    runner = JobRunnerS(job_man_dir, num_job, name_prefix + "_" + subsplit,
                        worker_factory)
    runner.auto_runner()
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name,
                                   ranked_list_path, split, n_qk_per_job):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    qk_candidate_train: List[QKUnit] = \
        [qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorkerMultiple(qk_candidate_train, generator, n_qk_per_job,
                                 out_dir)

    num_qks = d_n_claims_per_split2[split]
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
# Excerpt: tail of CorpusTokenizeWorker's per-job method. line_start and
# job_size are computed from job_id earlier in the method (not shown here).
line_end = line_start + job_size
# Seek to the first line of this job's slice of the corpus file.
self.doc_f.seek(self.line_offset_d[line_start])
lines = []
for i in range(job_size):
    line = self.doc_f.readline()
    lines.append(line)

# Sanity check: after reading job_size lines, the file position should match
# the recorded offset of line_end.
cur_offset = self.doc_f.tell()
try:
    expected_offset = self.line_offset_d[line_end]
    if cur_offset != expected_offset:
        print("cur_offset != expected_offset : {} != {}".format(
            cur_offset, expected_offset))
except IndexError as exception_e:
    # line_end may be past the last recorded line for the final job.
    print(exception_e)

tokenized_lines = tokenize_lines(self.tokenizer, lines)
with open(os.path.join(self.out_dir, str(job_id)), "w") as out_f:
    for row in tokenized_lines:
        out_f.write("\t".join(row) + "\n")


if __name__ == "__main__":
    num_job = 3213 + 1
    runner = JobRunnerS(job_man_dir, num_job, "MSMARCO_tokens",
                        CorpusTokenizeWorker)
    runner.start()
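# Hedged sketch, not from the original source: tokenize_lines is defined
# elsewhere in the repository. Given that each output row is re-joined with
# tabs above, a plausible shape, assuming a BERT-style tokenizer exposing a
# tokenize() method, is to tokenize each tab-separated field of each line:
def tokenize_lines_sketch(tokenizer, lines):
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        yield [" ".join(tokenizer.tokenize(field)) for field in fields]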
from dataset_specific.msmarco.common import load_query_group, \
    load_candidate_doc_top50
from dataset_specific.msmarco.tokenize_worker import TokenizeWorker
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS

if __name__ == "__main__":
    split = "train"
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_top50(split)

    def factory(out_dir):
        return TokenizeWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group) - 1,
                        "MSMARCO_{}_top50_tokens".format(split), factory)
    runner.start()
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS
from tlm.data_gen.adhoc_sent_tokenize import SeroFromTextEncoder
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker
from tlm.data_gen.msmarco_doc_gen.gen_worker_sent_level import PointwiseGenFromText
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource10doc

if __name__ == "__main__":
    split = "train"
    resource = ProcessedResource10doc(split)
    max_segments = 4
    total_sequence_length = 512 * 4
    src_window_size = 512
    basic_encoder = SeroFromTextEncoder(src_window_size,
                                        total_sequence_length,
                                        random_short=True,
                                        max_seg_per_doc=max_segments)
    generator = PointwiseGenFromText(resource, basic_encoder,
                                     total_sequence_length)

    def factory(out_dir):
        return MMDWorker(resource.query_group, generator, out_dir)

    runner = JobRunnerS(job_man_dir, len(resource.query_group) - 1,
                        "MMD_sero_train_B", factory)
    runner.start()
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS
from tlm.data_gen.adhoc_sent_tokenize import FromTextEncoder
from tlm.data_gen.doc_encode_common import seg_selection_by_geo_sampling
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker
from tlm.data_gen.msmarco_doc_gen.gen_worker_sent_level import PairwiseGenFromText
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource10doc

if __name__ == "__main__":
    split = "train"
    resource = ProcessedResource10doc(split)
    max_seq_length = 512
    document_encoder = FromTextEncoder(max_seq_length, True,
                                       seg_selection_by_geo_sampling())
    generator = PairwiseGenFromText(resource, document_encoder, max_seq_length)

    def factory(out_dir):
        return MMDWorker(resource.query_group, generator, out_dir)

    runner = JobRunnerS(job_man_dir, len(resource.query_group) - 1,
                        "MMD_{}_ss".format(split), factory)
    runner.start()