Esempio n. 1
0
def generate_robust_sero_for_train():
    """Build and start the "RobustSero_128_32_overlap" job.

    A ``MultiWindowOverlap`` encoder is created from a window size of 128
    and a total sequence length of 128 * 32, and handed to
    ``RobustPointwiseTrainGenEx`` together with that total length and
    "desc" (presumably the query field to use -- confirm against the
    generator).  ``RobustWorker`` is bound to the generator and run through
    a ``JobRunner`` whose second argument is 4.
    """
    window = 128
    seq_len = 128 * 32
    overlap_encoder = MultiWindowOverlap(window, seq_len)
    train_gen = RobustPointwiseTrainGenEx(overlap_encoder, seq_len, "desc")
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(job_man_dir, 4, "RobustSero_128_32_overlap", make_worker).start()
Esempio n. 2
0
def generate_robust_all_seg_for_predict():
    """Build and start the "robust_predict_desc_128_overlap" job.

    ``OverlappingSegments`` is constructed from a length of 128 and a step
    of 64; the resulting encoder is passed to ``RobustPredictGen`` along
    with 128, 100 and "desc".  ``RobustWorker`` is bound to that generator
    and run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 128
    stride = 64
    segmenter = OverlappingSegments(seq_len, stride)
    predict_gen = RobustPredictGen(segmenter, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, predict_gen)
    job = JobRunner(
        job_man_dir, 4, "robust_predict_desc_128_overlap", make_worker)
    job.start()
Esempio n. 3
0
def main():
    """Build and start the "robust_qck_10" job.

    Loads the structured qrels and the pickled QK candidates, obtains the
    candidate dictionary from the cache (computing and pickling it when the
    cache lookup returns ``None``), and runs ``QCKWorker`` instances through
    a ``JobRunner`` that receives 250 as its job-count argument.
    """
    judgement = load_qrels_structured(
        "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt")

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Binary label: 1 only when the qrels contain a judgement greater
        # than zero for this (query id, candidate id) pair, otherwise 0.
        qid = query.query_id
        doc_id = candidate.id
        if qid not in judgement:
            return 0
        per_query = judgement[qid]
        if doc_id not in per_query:
            return 0
        return 1 if per_query[doc_id] > 0 else 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")

    cache_name = "candidate_for_robust_qck_7"
    candidate_dict: Dict[str, List[QCKCandidateI]] = load_cache(cache_name)
    if candidate_dict is None:
        # Cache miss: compute the candidates once and persist them.
        candidate_dict = get_candidate_all_passage_w_samping()
        save_to_pickle(candidate_dict, cache_name)

    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    job_count = 250
    JobRunner(job_man_dir, job_count, "robust_qck_10", worker_factory).start()
Esempio n. 4
0
def main():
    """Build and start the "robust_train_sampling_256" job.

    ``PassageSampling`` is created with a passage length of 256, while the
    ``RobustPointwiseTrainGenEx`` generator that wraps it is given a
    sequence length of 512.  ``RobustWorker`` is bound to the generator and
    run through a ``JobRunner`` whose second argument is 4.
    """
    passage_len = 256
    seq_len = 512
    sampler = PassageSampling(passage_len)
    train_gen = RobustPointwiseTrainGenEx(sampler, seq_len)
    JobRunner(
        job_man_dir,
        4,
        "robust_train_sampling_256",
        partial(RobustWorker, train_gen),
    ).start()
Esempio n. 5
0
def main():
    """Build and start the "robust_two_piece2" job.

    ``TwoPieceSegmentComposer`` is created with 128 and ``True`` and wrapped
    in ``RobustTrainGenLight`` (sequence length 128).
    ``RobustPerQueryWorker`` is bound to that generator, and the
    ``JobRunner`` receives ``250 - 1`` as its second argument.
    """
    seq_len = 128
    composer = TwoPieceSegmentComposer(seq_len, True)
    light_gen = RobustTrainGenLight(composer, seq_len)
    make_worker = partial(RobustPerQueryWorker, light_gen)
    job_count = 250
    job = JobRunner(
        job_man_dir, job_count - 1, "robust_two_piece2", make_worker)
    job.start()
Esempio n. 6
0
def main():
    """Build and start the "robust_w_data_id" job.

    An ``AllSegmentAsDoc`` encoder with length 512 feeds
    ``RobustTrainGenWDataID``; ``RobustWorkerWDataID`` is bound to that
    generator and run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 512
    segment_encoder = AllSegmentAsDoc(seq_len)
    generator = RobustTrainGenWDataID(segment_encoder, seq_len)
    make_worker = partial(RobustWorkerWDataID, generator)
    JobRunner(job_man_dir, 4, "robust_w_data_id", make_worker).start()
Esempio n. 7
0
def main():
    """Build and start the "robust_qck_6" job.

    Loads the structured qrels, the pickled QK candidates and the candidate
    dictionary produced by ``load_candidate_all_passage_from_qrel(256)``,
    then runs ``QCKWorker`` instances through a ``JobRunner`` that receives
    250 as its job-count argument.
    """
    judgement = load_qrels_structured(
        "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt")

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Returns the raw qrels value for the pair, or 0 when either the
        # query or the document is absent from the qrels.
        qid = query.query_id
        # The candidate id is cut at its last underscore before the qrels
        # lookup (presumably it has the form "<doc_id>_<part>" -- confirm).
        id_pieces = candidate.id.split("_")
        doc_id = "_".join(id_pieces[:-1])
        if qid not in judgement:
            return 0
        per_query = judgement[qid]
        return per_query[doc_id] if doc_id in per_query else 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    candidate_dict: Dict[str, List[QCKCandidateI]] = \
        load_candidate_all_passage_from_qrel(256)
    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    job_count = 250
    JobRunner(job_man_dir, job_count, "robust_qck_6", worker_factory).start()
Esempio n. 8
0
def generate_robust_sero_for_train():
    """Build and start the "RobustSero5" job under ``sydney_working_dir``.

    A ``MultiWindow`` encoder is created from a window size of 512 and a
    total sequence length of 512 * 4, then passed to
    ``RobustPointwiseTrainGenEx`` with that total length and "desc".
    ``RobustWorker`` is bound to the generator and run through a
    ``JobRunner`` whose second argument is 4.
    """
    window = 512
    seq_len = 512 * 4
    window_encoder = MultiWindow(window, seq_len)
    train_gen = RobustPointwiseTrainGenEx(window_encoder, seq_len, "desc")
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(sydney_working_dir, 4, "RobustSero5", make_worker).start()
Esempio n. 9
0
def main():
    """Build and start the "robust_geo05" job.

    ``GeoSampler`` is created with a passage length of 128 and a second
    argument of 0.5; the same length is reused as the sequence length given
    to ``RobustPointwiseTrainGenEx`` together with "desc".  ``RobustWorker``
    is bound to the generator and run through a ``JobRunner`` whose second
    argument is 4.
    """
    passage_len = 128
    sampler = GeoSampler(passage_len, 0.5)
    # The sequence length equals the passage length for this job.
    train_gen = RobustPointwiseTrainGenEx(sampler, passage_len, "desc")
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(job_man_dir, 4, "robust_geo05", make_worker).start()
Esempio n. 10
0
def generate_robust_first_for_prediction():
    """Build and start the "RobustFirstPred3" job under ``sydney_working_dir``.

    A ``FirstSegmentAsDoc`` encoder with length 512 is wrapped in
    ``RobustPredictGenOld``; ``RobustWorker`` is bound to that generator and
    run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 512
    first_segment = FirstSegmentAsDoc(seq_len)
    predict_gen = RobustPredictGenOld(first_segment, seq_len)
    make_worker = partial(RobustWorker, predict_gen)
    JobRunner(sydney_working_dir, 4, "RobustFirstPred3", make_worker).start()
Esempio n. 11
0
def main():
    """Build and start the "robust_all_passage_pointwise_ex" job.

    An ``AllSegmentAsDoc`` encoder with length 512 is wrapped in
    ``RobustPointwiseTrainGenEx``; ``RobustWorker`` is bound to that
    generator and run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 512
    train_gen = RobustPointwiseTrainGenEx(AllSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorker, train_gen)
    job = JobRunner(
        job_man_dir, 4, "robust_all_passage_pointwise_ex", make_worker)
    job.start()
Esempio n. 12
0
def generate_robust_all_seg_for_train():
    """Build and start the "robust_all_passage_predict_256" job.

    An ``AllSegmentAsDoc`` encoder with length 256 is wrapped in
    ``RobustPredictGen``; ``RobustWorker`` is bound to that generator and
    run through a ``JobRunner`` whose second argument is 4.

    NOTE(review): the function name says "train" but the generator and the
    job name are prediction-oriented -- confirm which is intended.
    """
    seq_len = 256
    segment_encoder = AllSegmentAsDoc(seq_len)
    predict_gen = RobustPredictGen(segment_encoder, seq_len)
    make_worker = partial(RobustWorker, predict_gen)
    JobRunner(
        job_man_dir, 4, "robust_all_passage_predict_256", make_worker,
    ).start()
Esempio n. 13
0
def generate_robust_first_for_pred():
    """Build and start the "robust_first_256_pred" job.

    ``FirstSegmentAsDoc`` is created with a document length of 256 + 3
    (the extra 3 presumably accounts for special tokens -- confirm), while
    ``RobustPredictGenOld`` is given a sequence length of 512.
    ``RobustWorker`` is bound to the generator and run through a
    ``JobRunner`` whose second argument is 4.
    """
    segment_len = 256 + 3
    seq_len = 512
    first_segment = FirstSegmentAsDoc(segment_len)
    predict_gen = RobustPredictGenOld(first_segment, seq_len)
    JobRunner(
        job_man_dir,
        4,
        "robust_first_256_pred",
        partial(RobustWorker, predict_gen),
    ).start()
Esempio n. 14
0
def generate_robust_first_for_train():
    """Build and start the "RobustFirst256" job.

    ``FirstSegmentAsDoc`` is created with a document length of 256 + 3
    (the extra 3 presumably accounts for special tokens -- confirm), while
    ``RobustPairwiseTrainGen`` is given a sequence length of 512.
    ``RobustWorker`` is bound to the generator and run through a
    ``JobRunner`` whose second argument is 4.
    """
    segment_len = 256 + 3
    seq_len = 512
    first_segment = FirstSegmentAsDoc(segment_len)
    pairwise_gen = RobustPairwiseTrainGen(first_segment, seq_len)
    make_worker = partial(RobustWorker, pairwise_gen)
    JobRunner(job_man_dir, 4, "RobustFirst256", make_worker).start()
Esempio n. 15
0
def generate_robust_all_seg_for_train():
    """Build and start the "robust_predict_desc_query" job.

    An ``AllSegmentAsDoc`` encoder with length 512 is passed to
    ``RobustPredictGen`` together with 512, 100 and "desc".
    ``RobustWorker`` is bound to that generator and run through a
    ``JobRunner`` whose second argument is 4.

    NOTE(review): the function name says "train" but the generator and the
    job name are prediction-oriented -- confirm which is intended.
    """
    seq_len = 512
    segment_encoder = AllSegmentAsDoc(seq_len)
    predict_gen = RobustPredictGen(segment_encoder, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, predict_gen)
    JobRunner(job_man_dir, 4, "robust_predict_desc_query", make_worker).start()
Esempio n. 16
0
def generate_robust_first_for_train():
    """Build and start the "RobustFirstClean" job under ``sydney_working_dir``.

    A ``FirstSegmentAsDoc`` encoder with length 512 is wrapped in
    ``RobustPairwiseTrainGen``; ``RobustWorker`` is bound to that generator
    and run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 512
    pairwise_gen = RobustPairwiseTrainGen(FirstSegmentAsDoc(seq_len), seq_len)
    make_worker = partial(RobustWorker, pairwise_gen)
    job = JobRunner(sydney_working_dir, 4, "RobustFirstClean", make_worker)
    job.start()
Esempio n. 17
0
def main():
    """Build and start the "first_128_desc" job.

    ``LeadingN`` is created with a passage length of 128 and a segment
    count of 1; the same length is reused as the sequence length given to
    ``RobustPointwiseTrainGenEx`` together with "desc".  ``RobustWorker`` is
    bound to the generator and run through a ``JobRunner`` whose second
    argument is 4.
    """
    passage_len = 128
    segment_count = 1
    leading = LeadingN(passage_len, segment_count)
    # The sequence length equals the passage length for this job.
    train_gen = RobustPointwiseTrainGenEx(leading, passage_len, "desc")
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(job_man_dir, 4, "first_128_desc", make_worker).start()
Esempio n. 18
0
def generate_robust_sero_for_prediction():
    """Build and start the "RobustSeroPred4" job under ``sydney_working_dir``.

    A ``MultiWindow`` encoder is created from a window size of 512 - 2 and
    a total sequence length of 512 * 4, then wrapped in
    ``RobustPredictGenOld`` with that total length.  ``RobustWorker`` is
    bound to the generator and run through a ``JobRunner`` whose second
    argument is 4.
    """
    seq_len = 512 * 4
    window = 512 - 2
    window_encoder = MultiWindow(window, seq_len)
    predict_gen = RobustPredictGenOld(window_encoder, seq_len)
    make_worker = partial(RobustWorker, predict_gen)
    JobRunner(sydney_working_dir, 4, "RobustSeroPred4", make_worker).start()
Esempio n. 19
0
def main():
    """Build and start the "first_512_equi_sero" job.

    ``FirstEquiSero`` is created with 512, 128 and 4; 512 is reused as the
    sequence length given to ``RobustPointwiseTrainGenEx`` together with
    "desc".  ``RobustWorker`` is bound to the generator and run through a
    ``JobRunner`` whose second argument is 4.
    """
    passage_len = 512
    equi_encoder = FirstEquiSero(passage_len, 128, 4)
    # The sequence length equals the passage length for this job.
    train_gen = RobustPointwiseTrainGenEx(equi_encoder, passage_len, "desc")
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(job_man_dir, 4, "first_512_equi_sero", make_worker).start()
def main():
    """Build and start the "MMD_best_seg_prediction_train" job.

    Each worker is a ``BestSegmentPredictionGen`` created with 512, the
    split name "train", ``True``, ``False`` and the output directory the
    runner supplies.  The ``JobRunner`` receives
    ``train_query_group_len - 1`` as its second argument.
    """
    split = "train"
    job_name = "MMD_best_seg_prediction_{}".format(split)

    def factory(out_dir):
        return BestSegmentPredictionGen(512, split, True, False, out_dir)

    last_job = train_query_group_len - 1
    JobRunner(job_man_dir, last_job, job_name, factory).start()
Esempio n. 21
0
def main():
    """Build and start the "robust_selected2" job.

    Loads the pickled "robust_score_d2" scores, creates an
    ``AllSegmentAsDoc`` encoder with length 512, and passes both to
    ``RobustTrainGenSelected``.  ``RobustWorkerWDataID`` is bound to that
    generator and run through a ``JobRunner`` whose second argument is 4.
    """
    seq_len = 512
    scores = load_from_pickle("robust_score_d2")
    segment_encoder = AllSegmentAsDoc(seq_len)
    selected_gen = RobustTrainGenSelected(segment_encoder, seq_len, scores)
    make_worker = partial(RobustWorkerWDataID, selected_gen)
    JobRunner(job_man_dir, 4, "robust_selected2", make_worker).start()
Esempio n. 22
0
def generate_robust_sero_for_train():
    """Build and start the "RobustSero5_128_pred" job.

    A ``MultiWindow`` encoder is created from a window size of 128 and a
    total sequence length of 128 * 4, then passed to ``RobustPredictGen``
    with that total length, 100 and "desc".  ``RobustWorker`` is bound to
    the generator and run through a ``JobRunner`` whose second argument
    is 4.

    NOTE(review): the function name says "train" but the generator and the
    job name are prediction-oriented -- confirm which is intended.
    """
    window = 128
    seq_len = 128 * 4
    window_encoder = MultiWindow(window, seq_len)
    predict_gen = RobustPredictGen(window_encoder, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, predict_gen)
    JobRunner(job_man_dir, 4, "RobustSero5_128_pred", make_worker).start()
Esempio n. 23
0
def main():
    """Build and start the "MMD_pair_triplet" job.

    Each worker is a ``TripletWorker`` created with the input path format
    string and the output directory the runner supplies.  The ``JobRunner``
    receives ``360 - 1`` as its second argument.
    """
    path_template = (
        "/mnt/nfs/work3/youngwookim/data/msmarco/triple_pieces/x{0:04}")

    def factory(out_dir):
        return TripletWorker(path_template, out_dir)

    job_count = 360
    job = JobRunner(job_man_dir, job_count - 1, "MMD_pair_triplet", factory)
    job.start()
Esempio n. 24
0
def main():
    """Build and start one "robust_selected2_<selection>" job per selection.

    The scores from ``load_score_set1()`` and a single ``AllSegmentAsDoc``
    encoder (length 512) are shared by all iterations.  For each of the
    four target-selection names a ``RobustTrainGenSelected`` generator is
    created, ``RobustWorkerWDataID`` is bound to it, and a ``JobRunner``
    whose second argument is 3 is started before moving to the next name.
    """
    seq_len = 512
    scores = load_score_set1()
    segment_encoder = AllSegmentAsDoc(seq_len)
    selections = (
        "random_over_09",
        "best",
        "first_and_best",
        "best_or_over_09",
    )
    for selection in selections:
        selected_gen = RobustTrainGenSelected(
            segment_encoder, seq_len, scores, "desc", selection)
        make_worker = partial(RobustWorkerWDataID, selected_gen)
        job_name = "robust_selected2_{}".format(selection)
        JobRunner(job_man_dir, 3, job_name, make_worker).start()
Esempio n. 25
0
def main():
    """Build and start the "leading_segments" job.

    ``FirstAndRandom`` is created with a passage length of 128 and a
    segment count of 4; the same length is reused as the sequence length
    given to ``RobustPointwiseTrainGenEx``.  ``RobustWorker`` is bound to
    the generator and run through a ``JobRunner`` whose second argument
    is 4.
    """
    passage_len = 128
    segment_count = 4
    segment_picker = FirstAndRandom(passage_len, segment_count)
    # The sequence length equals the passage length for this job.
    train_gen = RobustPointwiseTrainGenEx(segment_picker, passage_len)
    make_worker = partial(RobustWorker, train_gen)
    JobRunner(job_man_dir, 4, "leading_segments", make_worker).start()
Esempio n. 26
0
def generate_robust_all_seg_for_predict():
    """Build and start the "robust_query_doc" job.

    ``RobustSeparateEncoder`` is created with 512, "desc", 1000 and
    ``False``; ``RobustPerQueryWorker`` is bound to it, and the
    ``JobRunner`` receives ``250 - 1`` as its second argument.
    """
    doc_len = 512
    separate_encoder = RobustSeparateEncoder(doc_len, "desc", 1000, False)
    make_worker = partial(RobustPerQueryWorker, separate_encoder)
    job_count = 250
    job = JobRunner(
        job_man_dir, job_count - 1, "robust_query_doc", make_worker)
    job.start()
Esempio n. 27
0
def generate_robust_all_seg_for_train():
    """Build and start the "robust_all_passage_256" job.

    ``AllSegmentAsDoc`` is created with a limited length of 256, while the
    ``RobustPairwiseTrainGen`` generator that wraps it is given a sequence
    length of 512.  ``RobustWorker`` is bound to the generator and run
    through a ``JobRunner`` whose second argument is 4.
    """
    segment_len = 256
    segment_encoder = AllSegmentAsDoc(segment_len)
    seq_len = 512
    pairwise_gen = RobustPairwiseTrainGen(segment_encoder, seq_len)
    make_worker = partial(RobustWorker, pairwise_gen)
    JobRunner(job_man_dir, 4, "robust_all_passage_256", make_worker).start()
Esempio n. 28
0
def generate_robust_all_seg_for_predict():
    """Build a "robust_predict_desc_<length>" job and call ``auto_runner``.

    The sequence length is read from the first command-line argument and
    converted with ``int``.  It sizes the ``AllSegmentAsDoc`` encoder, is
    passed to ``RobustPredictGen`` together with 100 and "desc", and is
    embedded in the job name.  ``RobustWorker`` is bound to the generator;
    the ``JobRunner`` (second argument 4) is driven through
    ``auto_runner()`` rather than ``start()``.
    """
    seq_len = int(sys.argv[1])
    segment_encoder = AllSegmentAsDoc(seq_len)
    predict_gen = RobustPredictGen(segment_encoder, seq_len, 100, "desc")
    make_worker = partial(RobustWorker, predict_gen)
    job_name = "robust_predict_desc_{}".format(seq_len)
    JobRunner(job_man_dir, 4, job_name, make_worker).auto_runner()
Esempio n. 29
0
def generate_robust_all_seg_for_predict():
    """Build and start the "robust_two_piece_pred_m" job.

    ``ManyTwoPieceSegmentComposer`` is created with length 128 and wrapped
    in ``RobustPredictGenLight``.  ``RobustPerQueryWorker`` is bound to that
    generator, and the ``JobRunner`` receives ``250 - 1`` as its second
    argument.
    """
    seq_len = 128
    composer = ManyTwoPieceSegmentComposer(seq_len)
    light_gen = RobustPredictGenLight(composer, seq_len)
    make_worker = partial(RobustPerQueryWorker, light_gen)
    job_count = 250
    job = JobRunner(
        job_man_dir, job_count - 1, "robust_two_piece_pred_m", make_worker)
    job.start()
Esempio n. 30
0
def generate_robust_all_seg_for_predict():
    """Build and start the "robust_pos_only_128" job.

    An ``AllSegmentAsDoc`` encoder with length 128 is passed to
    ``RobustPosOnlyGen`` together with 128 and "desc".  ``RobustWorker`` is
    bound to that generator, and the ``JobRunner`` receives ``5 - 1`` as its
    second argument.
    """
    seq_len = 128
    segment_encoder = AllSegmentAsDoc(seq_len)
    pos_only_gen = RobustPosOnlyGen(segment_encoder, seq_len, "desc")
    make_worker = partial(RobustWorker, pos_only_gen)
    job_count = 5
    job = JobRunner(
        job_man_dir, job_count - 1, "robust_pos_only_128", make_worker)
    job.start()