def generate_robust_sero_for_train():
    """Start the "RobustSero_128_32_overlap" job.

    A ``MultiWindowOverlap`` encoder is built with a source window size of
    128 and a total sequence length of 128 * 32, then handed to a
    ``RobustPointwiseTrainGenEx`` generator together with the "desc"
    argument. The generator is bound to ``RobustWorker`` as the worker
    factory for the ``JobRunner``.
    """
    window_len = 128
    full_len = window_len * 32
    doc_encoder = MultiWindowOverlap(window_len, full_len)
    generator = RobustPointwiseTrainGenEx(doc_encoder, full_len, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "RobustSero_128_32_overlap", make_worker).start()
def generate_robust_all_seg_for_predict():
    """Start the "robust_predict_desc_128_overlap" job.

    Uses an ``OverlappingSegments`` encoder (max sequence length 128,
    step size 64) inside a ``RobustPredictGen`` generator that also
    receives ``100`` and ``"desc"``. The generator is bound to
    ``RobustWorker`` as the worker factory.
    """
    seq_len = 128
    stride = 64
    segmenter = OverlappingSegments(seq_len, stride)
    generator = RobustPredictGen(segmenter, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, generator)
    job = JobRunner(job_man_dir, 4, "robust_predict_desc_128_overlap", make_worker)
    job.start()
def main():
    """Start the "robust_qck_10" job over the filtered QK candidates.

    Relevance labels come from the Robust04 qrels file. The candidate
    dictionary is read from the "candidate_for_robust_qck_7" cache and,
    when the cache yields ``None``, rebuilt with
    ``get_candidate_all_passage_w_samping`` and pickled under the same name.
    """
    judgement = load_qrels_structured(
        "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt")

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        """Return 1 if the candidate is judged with a positive label, else 0."""
        qid = query.query_id
        doc_id = candidate.id
        # Queries without any judgement are treated as having no positives.
        if qid not in judgement:
            return 0
        per_query = judgement[qid]
        if doc_id in per_query and per_query[doc_id] > 0:
            return 1
        return 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")

    cache_name = "candidate_for_robust_qck_7"
    candidates = load_cache(cache_name)
    if candidates is None:
        candidates: Dict[str, List[QCKCandidateI]] = \
            get_candidate_all_passage_w_samping()
        save_to_pickle(candidates, cache_name)

    generator = QCKInstanceGenerator(candidates, is_correct)

    def worker_factory(out_dir):
        """Build the worker that writes into ``out_dir``."""
        return QCKWorker(qk_candidate, generator, out_dir)

    num_jobs = 250
    job_name = "robust_qck_10"
    JobRunner(job_man_dir, num_jobs, job_name, worker_factory).start()
def main():
    """Start the "robust_train_sampling_256" job.

    A ``PassageSampling`` encoder with max passage length 256 feeds a
    ``RobustPointwiseTrainGenEx`` generator whose max sequence length is
    512. The generator is bound to ``RobustWorker`` as the worker factory.
    """
    passage_len = 256
    seq_len = 512
    sampler = PassageSampling(passage_len)
    generator = RobustPointwiseTrainGenEx(sampler, seq_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "robust_train_sampling_256", make_worker).start()
def main():
    """Start the "robust_two_piece2" job.

    A ``TwoPieceSegmentComposer(128, True)`` encoder feeds a
    ``RobustTrainGenLight`` generator, which is bound to
    ``RobustPerQueryWorker``. The ``JobRunner`` receives ``num_jobs - 1``
    (249) as its second argument.
    """
    seq_len = 128
    num_jobs = 250
    composer = TwoPieceSegmentComposer(seq_len, True)
    generator = RobustTrainGenLight(composer, seq_len)
    make_worker = partial(RobustPerQueryWorker, generator)
    job = JobRunner(job_man_dir, num_jobs - 1, "robust_two_piece2", make_worker)
    job.start()
def main():
    """Start the "robust_w_data_id" job.

    An ``AllSegmentAsDoc`` encoder with max sequence length 512 feeds a
    ``RobustTrainGenWDataID`` generator, which is bound to
    ``RobustWorkerWDataID`` as the worker factory.
    """
    seq_len = 512
    generator = RobustTrainGenWDataID(AllSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorkerWDataID, generator)
    JobRunner(job_man_dir, 4, "robust_w_data_id", make_worker).start()
def main():
    """Start the "robust_qck_6" job over the QK candidates.

    Relevance labels come from the Robust04 qrels file. Candidates are
    loaded with ``load_candidate_all_passage_from_qrel(256)``.
    """
    judgement = load_qrels_structured(
        "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt")

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        """Return the qrels label stored for the candidate's document, or 0."""
        qid = query.query_id
        # The candidate id is "<doc_id>_<suffix>"; drop the last "_" field
        # to recover the document id used as the judgement key.
        pieces = candidate.id.split("_")
        doc_id = "_".join(pieces[:-1])
        if qid not in judgement:
            return 0
        per_query = judgement[qid]
        return per_query[doc_id] if doc_id in per_query else 0

    qk_candidate: List[QKUnit] = load_from_pickle("robust_on_clueweb_qk_candidate")
    candidates: Dict[str, List[QCKCandidateI]] = \
        load_candidate_all_passage_from_qrel(256)
    generator = QCKInstanceGenerator(candidates, is_correct)

    def worker_factory(out_dir):
        """Build the worker that writes into ``out_dir``."""
        return QCKWorker(qk_candidate, generator, out_dir)

    num_jobs = 250
    job_name = "robust_qck_6"
    JobRunner(job_man_dir, num_jobs, job_name, worker_factory).start()
def generate_robust_sero_for_train():
    """Start the "RobustSero5" job under ``sydney_working_dir``.

    A ``MultiWindow`` encoder (source window size 512, total sequence
    length 512 * 4) feeds a ``RobustPointwiseTrainGenEx`` generator that
    also receives ``"desc"``. The generator is bound to ``RobustWorker``.
    """
    window_len = 512
    full_len = window_len * 4
    windows = MultiWindow(window_len, full_len)
    generator = RobustPointwiseTrainGenEx(windows, full_len, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(sydney_working_dir, 4, "RobustSero5", make_worker).start()
def main():
    """Start the "robust_geo05" job.

    A ``GeoSampler(128, 0.5)`` encoder feeds a
    ``RobustPointwiseTrainGenEx`` generator; the max sequence length equals
    the max passage length (128) and ``"desc"`` is passed through. The
    generator is bound to ``RobustWorker``.
    """
    passage_len = 128
    geo_param = 0.5
    sampler = GeoSampler(passage_len, geo_param)
    # The sequence length is deliberately the same as the passage length.
    generator = RobustPointwiseTrainGenEx(sampler, passage_len, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "robust_geo05", make_worker).start()
def generate_robust_first_for_prediction():
    """Start the "RobustFirstPred3" job under ``sydney_working_dir``.

    A ``FirstSegmentAsDoc`` encoder with max sequence length 512 feeds a
    ``RobustPredictGenOld`` generator, which is bound to ``RobustWorker``.
    """
    seq_len = 512
    generator = RobustPredictGenOld(FirstSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorker, generator)
    job = JobRunner(sydney_working_dir, 4, "RobustFirstPred3", make_worker)
    job.start()
def main():
    """Start the "robust_all_passage_pointwise_ex" job.

    An ``AllSegmentAsDoc`` encoder with max sequence length 512 feeds a
    ``RobustPointwiseTrainGenEx`` generator, which is bound to
    ``RobustWorker``.
    """
    seq_len = 512
    segmenter = AllSegmentAsDoc(seq_len)
    generator = RobustPointwiseTrainGenEx(segmenter, seq_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(
        job_man_dir, 4, "robust_all_passage_pointwise_ex", make_worker
    ).start()
def generate_robust_all_seg_for_train():
    """Start the "robust_all_passage_predict_256" job.

    An ``AllSegmentAsDoc`` encoder with max sequence length 256 feeds a
    ``RobustPredictGen`` generator, which is bound to ``RobustWorker``.

    NOTE(review): the function name says "train", but the generator and the
    job name are prediction-oriented; confirm which is intended.
    """
    seq_len = 256
    generator = RobustPredictGen(AllSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorker, generator)
    job = JobRunner(job_man_dir, 4, "robust_all_passage_predict_256", make_worker)
    job.start()
def generate_robust_first_for_pred():
    """Start the "robust_first_256_pred" job.

    A ``FirstSegmentAsDoc`` encoder is built with a document length of
    256 + 3, while the ``RobustPredictGenOld`` generator uses a max
    sequence length of 512. The generator is bound to ``RobustWorker``.
    """
    seq_len = 512
    first_segment = FirstSegmentAsDoc(256 + 3)
    generator = RobustPredictGenOld(first_segment, seq_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "robust_first_256_pred", make_worker).start()
def generate_robust_first_for_train():
    """Start the "RobustFirst256" job.

    A ``FirstSegmentAsDoc`` encoder is built with a document length of
    256 + 3, while the ``RobustPairwiseTrainGen`` generator uses a max
    sequence length of 512. The generator is bound to ``RobustWorker``.
    """
    seq_len = 512
    first_segment = FirstSegmentAsDoc(256 + 3)
    generator = RobustPairwiseTrainGen(first_segment, seq_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "RobustFirst256", make_worker).start()
def generate_robust_all_seg_for_train():
    """Start the "robust_predict_desc_query" job.

    An ``AllSegmentAsDoc`` encoder with max sequence length 512 feeds a
    ``RobustPredictGen`` generator that also receives ``100`` and
    ``"desc"``. The generator is bound to ``RobustWorker``.

    NOTE(review): the function name says "train", but the generator and the
    job name are prediction-oriented; confirm which is intended.
    """
    seq_len = 512
    segmenter = AllSegmentAsDoc(seq_len)
    generator = RobustPredictGen(segmenter, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "robust_predict_desc_query", make_worker).start()
def generate_robust_first_for_train():
    """Start the "RobustFirstClean" job under ``sydney_working_dir``.

    A ``FirstSegmentAsDoc`` encoder with max sequence length 512 feeds a
    ``RobustPairwiseTrainGen`` generator, which is bound to
    ``RobustWorker``.
    """
    seq_len = 512
    generator = RobustPairwiseTrainGen(FirstSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorker, generator)
    job = JobRunner(sydney_working_dir, 4, "RobustFirstClean", make_worker)
    job.start()
def main():
    """Start the "first_128_desc" job.

    A ``LeadingN(128, 1)`` encoder feeds a ``RobustPointwiseTrainGenEx``
    generator; the max sequence length equals the max passage length (128)
    and ``"desc"`` is passed through. The generator is bound to
    ``RobustWorker``.
    """
    passage_len = 128
    segment_count = 1
    leading = LeadingN(passage_len, segment_count)
    # The sequence length is deliberately the same as the passage length.
    generator = RobustPointwiseTrainGenEx(leading, passage_len, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "first_128_desc", make_worker).start()
def generate_robust_sero_for_prediction():
    """Start the "RobustSeroPred4" job under ``sydney_working_dir``.

    A ``MultiWindow`` encoder (source window size 512 - 2, total sequence
    length 512 * 4) feeds a ``RobustPredictGenOld`` generator, which is
    bound to ``RobustWorker``.
    """
    full_len = 512 * 4
    window_len = 512 - 2
    windows = MultiWindow(window_len, full_len)
    generator = RobustPredictGenOld(windows, full_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(sydney_working_dir, 4, "RobustSeroPred4", make_worker).start()
def main():
    """Start the "first_512_equi_sero" job.

    A ``FirstEquiSero(512, 128, 4)`` encoder feeds a
    ``RobustPointwiseTrainGenEx`` generator; the max sequence length equals
    the max passage length (512) and ``"desc"`` is passed through. The
    generator is bound to ``RobustWorker``.
    """
    passage_len = 512
    equi_encoder = FirstEquiSero(passage_len, 128, 4)
    # The sequence length is deliberately the same as the passage length.
    generator = RobustPointwiseTrainGenEx(equi_encoder, passage_len, "desc")
    make_worker = partial(RobustWorker, generator)
    job = JobRunner(job_man_dir, 4, "first_512_equi_sero", make_worker)
    job.start()
def main():
    """Start the "MMD_best_seg_prediction_<split>" job for the train split.

    Each worker is a ``BestSegmentPredictionGen(512, split, True, False,
    out_dir)``. The ``JobRunner`` receives ``train_query_group_len - 1`` as
    its second argument.
    """
    split = "train"

    def factory(out_dir):
        """Build the prediction generator that writes into ``out_dir``."""
        return BestSegmentPredictionGen(512, split, True, False, out_dir)

    last_job = train_query_group_len - 1
    job_name = "MMD_best_seg_prediction_{}".format(split)
    JobRunner(job_man_dir, last_job, job_name, factory).start()
def main():
    """Start the "robust_selected2" job.

    Scores are loaded from the "robust_score_d2" pickle and passed, with an
    ``AllSegmentAsDoc`` encoder (max sequence length 512), to a
    ``RobustTrainGenSelected`` generator. The generator is bound to
    ``RobustWorkerWDataID``.
    """
    seq_len = 512
    scores = load_from_pickle("robust_score_d2")
    segmenter = AllSegmentAsDoc(seq_len)
    generator = RobustTrainGenSelected(segmenter, seq_len, scores)
    make_worker = partial(RobustWorkerWDataID, generator)
    JobRunner(job_man_dir, 4, "robust_selected2", make_worker).start()
def generate_robust_sero_for_train():
    """Start the "RobustSero5_128_pred" job.

    A ``MultiWindow`` encoder (source window size 128, total sequence
    length 128 * 4) feeds a ``RobustPredictGen`` generator that also
    receives ``100`` and ``"desc"``. The generator is bound to
    ``RobustWorker``.

    NOTE(review): the function name says "train", but the generator and the
    job name are prediction-oriented; confirm which is intended.
    """
    window_len = 128
    full_len = window_len * 4
    windows = MultiWindow(window_len, full_len)
    generator = RobustPredictGen(windows, full_len, 100, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "RobustSero5_128_pred", make_worker).start()
def main():
    """Start the "MMD_pair_triplet" job.

    Each worker is a ``TripletWorker`` given the MS MARCO triple-piece path
    template and its output directory. The ``JobRunner`` receives
    ``num_jobs - 1`` (359) as its second argument.
    """
    # "{0:04}" is filled in by the worker; presumably with a piece index.
    input_path_format = "/mnt/nfs/work3/youngwookim/data/msmarco/triple_pieces/x{0:04}"
    num_jobs = 360

    def factory(out_dir):
        """Build the triplet worker that writes into ``out_dir``."""
        return TripletWorker(input_path_format, out_dir)

    job = JobRunner(job_man_dir, num_jobs - 1, "MMD_pair_triplet", factory)
    job.start()
def main():
    """Start one "robust_selected2_<target_selection>" job per strategy.

    Scores come from ``load_score_set1`` and are shared, along with one
    ``AllSegmentAsDoc`` encoder (max sequence length 512), by every
    ``RobustTrainGenSelected`` generator. The jobs are started one after
    another in the listed order.
    """
    seq_len = 512
    scores = load_score_set1()
    segmenter = AllSegmentAsDoc(seq_len)
    selections = ["random_over_09", "best", "first_and_best", "best_or_over_09"]
    for target_selection in selections:
        generator = RobustTrainGenSelected(
            segmenter, seq_len, scores, "desc", target_selection)
        make_worker = partial(RobustWorkerWDataID, generator)
        job_name = "robust_selected2_{}".format(target_selection)
        JobRunner(job_man_dir, 3, job_name, make_worker).start()
def main():
    """Start the "leading_segments" job.

    A ``FirstAndRandom(128, 4)`` encoder feeds a
    ``RobustPointwiseTrainGenEx`` generator whose max sequence length
    equals the max passage length (128). The generator is bound to
    ``RobustWorker``.
    """
    passage_len = 128
    segment_count = 4
    picker = FirstAndRandom(passage_len, segment_count)
    # The sequence length is deliberately the same as the passage length.
    generator = RobustPointwiseTrainGenEx(picker, passage_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "leading_segments", make_worker).start()
def generate_robust_all_seg_for_predict():
    """Start the "robust_query_doc" job.

    A ``RobustSeparateEncoder(512, "desc", 1000, False)`` is bound to
    ``RobustPerQueryWorker`` as the worker factory. The ``JobRunner``
    receives ``num_jobs - 1`` (249) as its second argument.
    """
    doc_len = 512
    num_jobs = 250
    separate_encoder = RobustSeparateEncoder(doc_len, "desc", 1000, False)
    make_worker = partial(RobustPerQueryWorker, separate_encoder)
    JobRunner(job_man_dir, num_jobs - 1, "robust_query_doc", make_worker).start()
def generate_robust_all_seg_for_train():
    """Start the "robust_all_passage_256" job.

    The ``AllSegmentAsDoc`` encoder is limited to length 256, while the
    ``RobustPairwiseTrainGen`` generator uses a max sequence length of 512.
    The generator is bound to ``RobustWorker``.
    """
    segment_len = 256
    seq_len = 512
    segmenter = AllSegmentAsDoc(segment_len)
    generator = RobustPairwiseTrainGen(segmenter, seq_len)
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, 4, "robust_all_passage_256", make_worker).start()
def generate_robust_all_seg_for_predict():
    """Run the "robust_predict_desc_<max_seq_length>" job via ``auto_runner``.

    The max sequence length is read from the first command-line argument.
    It sizes the ``AllSegmentAsDoc`` encoder and the ``RobustPredictGen``
    generator (which also receives ``100`` and ``"desc"``), and is embedded
    in the job name. The generator is bound to ``RobustWorker``.

    Raises:
        SystemExit: if the argument is missing or is not an integer. The
            message explains what was expected instead of surfacing a bare
            ``IndexError`` / ``ValueError``.
    """
    # Validate the CLI input before building anything else.
    if len(sys.argv) < 2:
        raise SystemExit(
            "missing argument: max_seq_length must be given as the first "
            "command-line argument")
    raw_length = sys.argv[1]
    try:
        max_seq_length = int(raw_length)
    except ValueError:
        raise SystemExit(
            "invalid max_seq_length {!r}: expected an integer".format(raw_length)
        ) from None

    encoder = AllSegmentAsDoc(max_seq_length)
    generator = RobustPredictGen(encoder, max_seq_length, 100, "desc")
    worker_factory = partial(RobustWorker, generator)
    job_name = "robust_predict_desc_{}".format(max_seq_length)
    runner = JobRunner(job_man_dir, 4, job_name, worker_factory)
    runner.auto_runner()
def generate_robust_all_seg_for_predict():
    """Start the "robust_two_piece_pred_m" job.

    A ``ManyTwoPieceSegmentComposer`` encoder with max sequence length 128
    feeds a ``RobustPredictGenLight`` generator, which is bound to
    ``RobustPerQueryWorker``. The ``JobRunner`` receives ``num_jobs - 1``
    (249) as its second argument.
    """
    seq_len = 128
    num_jobs = 250
    composer = ManyTwoPieceSegmentComposer(seq_len)
    generator = RobustPredictGenLight(composer, seq_len)
    make_worker = partial(RobustPerQueryWorker, generator)
    job = JobRunner(job_man_dir, num_jobs - 1, "robust_two_piece_pred_m", make_worker)
    job.start()
def generate_robust_all_seg_for_predict():
    """Start the "robust_pos_only_128" job.

    An ``AllSegmentAsDoc`` encoder with max sequence length 128 feeds a
    ``RobustPosOnlyGen`` generator that also receives ``"desc"``. The
    generator is bound to ``RobustWorker``. The ``JobRunner`` receives
    ``num_jobs - 1`` (4) as its second argument.
    """
    seq_len = 128
    num_jobs = 5
    segmenter = AllSegmentAsDoc(seq_len)
    generator = RobustPosOnlyGen(segmenter, seq_len, "desc")
    make_worker = partial(RobustWorker, generator)
    JobRunner(job_man_dir, num_jobs - 1, "robust_pos_only_128", make_worker).start()