def main():
    """Launch train-data generation jobs using 256-token passage sampling."""
    passage_len = 256
    seq_len = 512
    sampler = PassageSampling(passage_len)
    train_gen = RobustPointwiseTrainGenEx(sampler, seq_len)
    factory = partial(RobustWorker, train_gen)
    runner = JobRunner(job_man_dir, 4, "robust_train_sampling_256", factory)
    runner.start()
def generate_robust_all_seg_for_predict():
    """Launch prediction jobs over overlapping 128-token segments (stride 64)."""
    seg_len = 128
    stride = 64
    segmenter = OverlappingSegments(seg_len, stride)
    predict_gen = RobustPredictGen(segmenter, seg_len, 100, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_predict_desc_128_overlap",
                       partial(RobustWorker, predict_gen))
    runner.start()
def generate_robust_sero_for_train():
    """Launch Sero train-data jobs: 32 overlapping 128-token windows per doc."""
    window_len = 128
    total_len = window_len * 32
    window_encoder = MultiWindowOverlap(window_len, total_len)
    train_gen = RobustPointwiseTrainGenEx(window_encoder, total_len, "desc")
    runner = JobRunner(job_man_dir, 4, "RobustSero_128_32_overlap",
                       partial(RobustWorker, train_gen))
    runner.start()
def main():
    """Launch QCK instance-generation jobs labeled from the rob04 qrels."""
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Candidate ids look like "<doc_id>_<passage_idx>"; drop the passage
        # suffix to recover the judged document id.
        doc_id = "_".join(candidate.id.split("_")[:-1])
        per_query = judgement.get(query.query_id)
        if per_query is None:
            return 0
        return per_query.get(doc_id, 0)

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    candidate_dict: Dict[str, List[QCKCandidateI]] = \
        load_candidate_all_passage_from_qrel(256)
    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    runner = JobRunner(job_man_dir, 250, "robust_qck_6", worker_factory)
    runner.start()
def main():
    """Launch train-data jobs that keep per-instance data ids (512 tokens)."""
    seq_len = 512
    segmenter = AllSegmentAsDoc(seq_len)
    factory = partial(RobustWorkerWDataID,
                      RobustTrainGenWDataID(segmenter, seq_len))
    JobRunner(job_man_dir, 4, "robust_w_data_id", factory).start()
def generate_robust_sero_for_train():
    """Launch Sero train-data jobs: four 512-token windows per document."""
    window_len = 512
    total_len = window_len * 4
    window_encoder = MultiWindow(window_len, total_len)
    train_gen = RobustPointwiseTrainGenEx(window_encoder, total_len, "desc")
    runner = JobRunner(sydney_working_dir, 4, "RobustSero5",
                       partial(RobustWorker, train_gen))
    runner.start()
def main():
    """Launch QCK jobs with binary qrel labels and cached sampled candidates."""
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Binarize graded relevance: any judged value > 0 counts as relevant.
        per_query = judgement.get(query.query_id)
        if per_query is None:
            return 0
        return 1 if per_query.get(candidate.id, 0) > 0 else 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")

    # Candidate sampling is expensive; reuse a pickled copy when available.
    candidate_dict = load_cache("candidate_for_robust_qck_7")
    if candidate_dict is None:
        candidate_dict: Dict[str, List[QCKCandidateI]] = \
            get_candidate_all_passage_w_samping()
        save_to_pickle(candidate_dict, "candidate_for_robust_qck_7")

    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    runner = JobRunner(job_man_dir, 250, "robust_qck_10", worker_factory)
    runner.start()
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface],
        writer,
        name_prefix):
    """Generate CPPNC instances for the train and val claim splits."""
    # Claim ids are split 70/30 into train/val.
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train_claims, val_claims = split_7_3(claims)

    entries, all_passages = load_from_pickle("pc_train_a_passages")
    passages_per_cid: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: passages for claim, passages in entries
    }
    generator = generator_functor(passages_per_cid)

    print("Generate instances : train")

    def make_train_worker(out_dir):
        return CPPNCWorker(train_claims, generator, writer, out_dir)

    JobRunner(job_man_dir, 378, name_prefix + "_train", make_train_worker).start()

    print("Generate instances : val")

    def make_val_worker(out_dir):
        return CPPNCWorker(val_claims, generator, writer, out_dir)

    JobRunner(job_man_dir, 162, name_prefix + "_val", make_val_worker).start()
def main():
    """Launch per-query train jobs with the two-piece segment composer."""
    seq_len = 128
    composer = TwoPieceSegmentComposer(seq_len, True)
    factory = partial(RobustPerQueryWorker,
                      RobustTrainGenLight(composer, seq_len))
    job_count = 250
    runner = JobRunner(job_man_dir, job_count - 1, "robust_two_piece2", factory)
    runner.start()
def start_generate_jobs_for_train_val(generator: InstanceGenerator,
                                      name_prefix):
    """Generate QCK instances for the train and val claim splits."""
    print("Loading data ....")
    # Claim ids are split 70/30 into train/val.
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train_claims, val_claims = split_7_3(claims)
    train_cids = {str(c['cId']) for c in train_claims}
    val_cids = {str(c['cId']) for c in val_claims}

    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    # Partition the QK candidates by which split their query belongs to.
    qk_for_train: List[QKUnit] = [qk for qk in qk_candidate
                                  if qk[0].query_id in train_cids]
    qk_for_val = [qk for qk in qk_candidate if qk[0].query_id in val_cids]

    def make_train_worker(out_dir):
        return QCKWorker(qk_for_train, generator, out_dir)

    JobRunner(job_man_dir, 378, name_prefix + "_train", make_train_worker).start()

    print("Generate instances : val")

    def make_val_worker(out_dir):
        return QCKWorker(qk_for_val, generator, out_dir)

    JobRunner(job_man_dir, 162, name_prefix + "_val", make_val_worker).start()
def main():
    """Launch train jobs using only the leading 128-token segment per doc."""
    passage_len = 128
    leading = LeadingN(passage_len, 1)
    train_gen = RobustPointwiseTrainGenEx(leading, passage_len, "desc")
    runner = JobRunner(job_man_dir, 4, "first_128_desc",
                       partial(RobustWorker, train_gen))
    runner.start()
def main():
    """Launch train jobs with geometric passage sampling (g=0.5, 128 tokens)."""
    passage_len = 128
    geo_param = 0.5
    sampler = GeoSampler(passage_len, geo_param)
    train_gen = RobustPointwiseTrainGenEx(sampler, passage_len, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_geo05",
                       partial(RobustWorker, train_gen))
    runner.start()
def generate_robust_first_for_prediction():
    """Launch prediction jobs using the first 512-token segment per doc."""
    seq_len = 512
    first_seg = FirstSegmentAsDoc(seq_len)
    factory = partial(RobustWorker, RobustPredictGenOld(first_seg, seq_len))
    JobRunner(sydney_working_dir, 4, "RobustFirstPred3", factory).start()
def generate_robust_all_seg_for_train():
    """Launch jobs over all 512-token segments.

    NOTE(review): despite the name this wires up RobustPredictGen and a
    "predict" job name — presumably generates prediction data; confirm.
    """
    seq_len = 512
    segmenter = AllSegmentAsDoc(seq_len)
    predict_gen = RobustPredictGen(segmenter, seq_len, 100, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_predict_desc_query",
                       partial(RobustWorker, predict_gen))
    runner.start()
def generate_robust_all_seg_for_train():
    """Launch jobs over all 256-token segments.

    NOTE(review): despite the name this uses RobustPredictGen and a
    "predict" job name — presumably prediction data; confirm.
    """
    seq_len = 256
    segmenter = AllSegmentAsDoc(seq_len)
    factory = partial(RobustWorker, RobustPredictGen(segmenter, seq_len))
    runner = JobRunner(job_man_dir, 4, "robust_all_passage_predict_256",
                       factory)
    runner.start()
def generate_robust_first_for_pred():
    """Launch prediction jobs: first doc segment of 259 tokens in a 512 seq."""
    # 256 content tokens + 3 — presumably room for special tokens; confirm.
    doc_len = 256 + 3
    seq_len = 512
    first_seg = FirstSegmentAsDoc(doc_len)
    factory = partial(RobustWorker, RobustPredictGenOld(first_seg, seq_len))
    JobRunner(job_man_dir, 4, "robust_first_256_pred", factory).start()
def generate_robust_first_for_train():
    """Launch pairwise train jobs: first doc segment of 259 tokens in 512 seq."""
    # 256 content tokens + 3 — presumably room for special tokens; confirm.
    doc_len = 256 + 3
    seq_len = 512
    first_seg = FirstSegmentAsDoc(doc_len)
    factory = partial(RobustWorker, RobustPairwiseTrainGen(first_seg, seq_len))
    JobRunner(job_man_dir, 4, "RobustFirst256", factory).start()
def main():
    """Launch pointwise train jobs over all 512-token segments."""
    seq_len = 512
    segmenter = AllSegmentAsDoc(seq_len)
    train_gen = RobustPointwiseTrainGenEx(segmenter, seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_all_passage_pointwise_ex",
                       partial(RobustWorker, train_gen))
    runner.start()
def generate_robust_first_for_train():
    """Launch pairwise train jobs using the first 512-token segment per doc."""
    seq_len = 512
    first_seg = FirstSegmentAsDoc(seq_len)
    factory = partial(RobustWorker, RobustPairwiseTrainGen(first_seg, seq_len))
    JobRunner(sydney_working_dir, 4, "RobustFirstClean", factory).start()
def main():
    """Launch MS MARCO triplet-conversion jobs over the sharded triple files."""
    shard_path_fmt = \
        "/mnt/nfs/work3/youngwookim/data/msmarco/triple_pieces/x{0:04}"

    def make_worker(out_dir):
        return TripletWorker(shard_path_fmt, out_dir)

    shard_count = 360
    runner = JobRunner(job_man_dir, shard_count - 1, "MMD_pair_triplet",
                       make_worker)
    runner.start()
def main():
    """Launch train jobs that select segments by precomputed scores."""
    seq_len = 512
    scores = load_from_pickle("robust_score_d2")
    segmenter = AllSegmentAsDoc(seq_len)
    selected_gen = RobustTrainGenSelected(segmenter, seq_len, scores)
    runner = JobRunner(job_man_dir, 4, "robust_selected2",
                       partial(RobustWorkerWDataID, selected_gen))
    runner.start()
def main():
    """Launch train jobs with the first-equi-sero encoder (512 tokens)."""
    passage_len = 512
    equi_sero = FirstEquiSero(passage_len, 128, 4)
    train_gen = RobustPointwiseTrainGenEx(equi_sero, passage_len, "desc")
    runner = JobRunner(job_man_dir, 4, "first_512_equi_sero",
                       partial(RobustWorker, train_gen))
    runner.start()
def main():
    """Launch best-segment prediction jobs for the train query groups."""
    split = "train"

    def make_worker(out_dir):
        return BestSegmentPredictionGen(512, split, True, False, out_dir)

    runner = JobRunner(job_man_dir, train_query_group_len - 1,
                       "MMD_best_seg_prediction_{}".format(split), make_worker)
    runner.start()
def generate_robust_all_seg_for_train():
    """Launch pairwise train jobs: 256-token segments inside a 512 sequence."""
    segment_len = 256
    seq_len = 512
    segmenter = AllSegmentAsDoc(segment_len)
    factory = partial(RobustWorker, RobustPairwiseTrainGen(segmenter, seq_len))
    JobRunner(job_man_dir, 4, "robust_all_passage_256", factory).start()
def main():
    """Launch score-selected train jobs, one run per target-selection policy."""
    seq_len = 512
    scores = load_score_set1()
    segmenter = AllSegmentAsDoc(seq_len)
    policies = ["random_over_09", "best", "first_and_best", "best_or_over_09"]
    for policy in policies:
        selected_gen = RobustTrainGenSelected(segmenter, seq_len, scores,
                                              "desc", policy)
        runner = JobRunner(job_man_dir, 3,
                           "robust_selected2_{}".format(policy),
                           partial(RobustWorkerWDataID, selected_gen))
        runner.start()
def generate_robust_all_seg_for_predict():
    """Launch per-query jobs encoding query and doc separately (512 tokens)."""
    doc_len = 512
    encoder = RobustSeparateEncoder(doc_len, "desc", 1000, False)
    factory = partial(RobustPerQueryWorker, encoder)
    job_count = 250
    runner = JobRunner(job_man_dir, job_count - 1, "robust_query_doc", factory)
    runner.start()
def generate_robust_sero_for_prediction():
    """Launch Sero prediction jobs: four 510-token windows per document."""
    total_len = 512 * 4
    # Window is 2 short of 512 — presumably room for special tokens; confirm.
    window_len = 512 - 2
    window_encoder = MultiWindow(window_len, total_len)
    factory = partial(RobustWorker,
                      RobustPredictGenOld(window_encoder, total_len))
    JobRunner(sydney_working_dir, 4, "RobustSeroPred4", factory).start()
def generate_robust_sero_for_train():
    """Launch Sero jobs with four 128-token windows.

    NOTE(review): despite the name this uses RobustPredictGen and a "pred"
    job name — presumably generates prediction data; confirm.
    """
    window_len = 128
    total_len = window_len * 4
    window_encoder = MultiWindow(window_len, total_len)
    predict_gen = RobustPredictGen(window_encoder, total_len, 100, "desc")
    runner = JobRunner(job_man_dir, 4, "RobustSero5_128_pred",
                       partial(RobustWorker, predict_gen))
    runner.start()
def generate_robust_all_seg_for_predict():
    """Launch per-query prediction jobs with the many-two-piece composer."""
    seq_len = 128
    composer = ManyTwoPieceSegmentComposer(seq_len)
    factory = partial(RobustPerQueryWorker,
                      RobustPredictGenLight(composer, seq_len))
    job_count = 250
    runner = JobRunner(job_man_dir, job_count - 1, "robust_two_piece_pred_m",
                       factory)
    runner.start()
def main():
    """Launch train jobs with first-and-random segment selection (4 x 128)."""
    passage_len = 128
    segment_count = 4
    selector = FirstAndRandom(passage_len, segment_count)
    train_gen = RobustPointwiseTrainGenEx(selector, passage_len)
    runner = JobRunner(job_man_dir, 4, "leading_segments",
                       partial(RobustWorker, train_gen))
    runner.start()