Example #1
def qck_gen(job_name, qk_candidate_name, candidate_ranked_list_path,
            kdp_ranked_list_path, split):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    kdp_ranked_list: Dict[str, List[TrecRankedListEntry]] = \
        load_ranked_list_grouped(kdp_ranked_list_path)

    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstGenWScore(
        get_qck_candidate_from_ranked_list_path(candidate_ranked_list_path),
        is_correct_factory(), kdp_ranked_list)
    qk_candidate_for_split: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_for_split, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
Example #2
def main():
    working_path = "d:\\job_dir"
    max_job = 3
    job_name = "test_job"
    job_runner = JobRunnerS(working_path, max_job, job_name, DummyWorker)
    job_id = job_runner.pool_job()
    print(job_id)
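DummyWorker is not defined in this snippet; note also that main() calls
pool_job(), which returns a value printed as a job id, rather than start().
Below is a minimal sketch of what a JobRunnerS worker might look like,
inferred from the other examples (worker factories and worker classes are
called with out_dir, and the partial worker in Example #13 writes per-job
output to os.path.join(self.out_dir, str(job_id))). The work(job_id) method
name is an assumption, not confirmed by this listing.

import os


class MinimalWorker:
    # Hypothetical worker sketch. The constructor signature (out_dir only)
    # matches every factory in these examples; the work(job_id) method name
    # is assumed.
    def __init__(self, out_dir):
        self.out_dir = out_dir

    def work(self, job_id):
        # Leave one marker file per job so each job's output is inspectable.
        with open(os.path.join(self.out_dir, str(job_id)), "w") as f:
            f.write("done\n")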
Example #3
def run_tokenize_jobs_for_prediction_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return TokenizeDocTitleBodyWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_title_body_tokens".format(split), factory)
    runner.start()
Example #4
def run_tokenize_jobs_for_train_split(split):
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_list_10(split)

    def factory(out_dir):
        return SentLevelTokenizeWorker(split, query_group, candidate_docs,
                                       out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_sent_tokens".format(split), factory)
    runner.start()
Example #5
def run_tokenize_jobs_for_pred_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)
    max_sent_length = 64 * 5
    max_title_length = 64 * 5

    def factory(out_dir):
        return MultipleTokenizeWorker(split, query_group, candidate_docs,
                                      max_sent_length, max_title_length,
                                      out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_multiple_tokenize".format(split),
                        factory)
    runner.start()
Example #6
def qck_gen_w_ranked_list(job_name,
                          qk_candidates: List[QKUnit],
                          qck_candidates_dict: Dict[str, List[QCKCandidate]],
                          split):
    qids = list(get_qids_for_split(split_name2, split))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(qck_candidates_dict, is_correct_factory())
    qk_candidates_for_split: List[QKUnit] = [
        qk for qk in qk_candidates if qk[0].query_id in qids]
    print("{} of {} qk are used".format(len(qk_candidates_for_split), len(qk_candidates)))

    def worker_factory(out_dir):
        return QCKWorker(qk_candidates_for_split,
                         generator,
                         out_dir)

    num_jobs = len(qids)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split, worker_factory)
    runner.start()
Example #7
def start_generate_jobs_for_dev(generator: InstanceGenerator, name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)

    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_qk_candidate_dev()
    qk_candidate_val = [qk for qk in qk_candidate if qk[0].query_id in cids]

    print("Generate instances : dev")

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

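    # 138: presumably the number of dev claims (one job per claim); the
    # value is hard-coded rather than derived in this snippet.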
    runner = JobRunnerS(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
Example #8
def main():
    is_correct_fn = get_is_correct_fn()
    split = "train"
    qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
    tprint("Loading candidates..")
    candidate_dict = load_bal_candidate(split)
    tprint("{} dict keys".format(len(candidate_dict)))

    tprint("Initializing generator..")
    generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
    num_jobs = d_n_pc_per_split[split]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    job_name = "pc_evi_qck2_s_{}".format(split)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
    runner.start()
Example #9
def multi_qck_gen(job_name, qk_candidate_name, ranked_list_path, split,
                  k_group_size):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKGeneratorGrouped(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory(), False, k_group_size)
    qk_candidate_for_split: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_for_split, generator, out_dir)

    num_jobs = d_n_claims_per_split2[split]
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
Example #10
def main():
    is_correct_fn = get_is_correct_fn()
    for split in splits[:2]:
        qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
        qk_candidate = sample_kdps(qk_candidate)
        tprint("Loading candidates..")
        candidate_dict = load_bal_candidate(split)
        tprint("{} dict keys".format(len(candidate_dict)))

        tprint("Initializing generator..")
        generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
        n_qk_per_job = 10
        num_jobs = ceil_divide(d_n_pc_per_split[split], n_qk_per_job)

        def worker_factory(out_dir):
            return QCKWorkerMultiple(qk_candidate, generator, n_qk_per_job,
                                     out_dir)

        job_name = "pc_evi_qck3_{}".format(split)
        runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
        runner.start()
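ceil_divide is not shown in this listing; since num_jobs must cover
d_n_pc_per_split[split] QK units at n_qk_per_job per job, it is presumably
ceiling integer division. A minimal sketch under that assumption:

def ceil_divide(numerator: int, denominator: int) -> int:
    # Smallest integer >= numerator / denominator,
    # e.g. ceil_divide(25, 10) == 3.
    return -(-numerator // denominator)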
Example #11
def start_generate_jobs(generator: InstanceGenerator, subsplit,
                        qk_candidate_name, name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    claims = load_claims_for_sub_split(subsplit)

    valid_cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    qk_candidate_val = [
        qk for qk in qk_candidate if qk[0].query_id in valid_cids]

    print("Generate instances :")
    print("split: ", subsplit)
    print("qk_candidate_name: ", qk_candidate_name)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    num_job = d_n_claims_per_subsplit[subsplit]
    runner = JobRunnerS(job_man_dir, num_job, name_prefix + "_" + subsplit,
                        worker_factory)
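    # Unlike the other examples, which call start(), this one calls
    # auto_runner(); the difference between the two is not shown here.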
    runner.auto_runner()
Example #12
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name,
                                   ranked_list_path, split, n_qk_per_job):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    qk_candidate_for_split: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorkerMultiple(qk_candidate_for_split, generator,
                                 n_qk_per_job, out_dir)

    num_qks = d_n_claims_per_split2[split]
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
Example #13
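        # (Partial snippet: the body of a worker's per-job method, invoked
        # with job_id; line_start and the enclosing class are not shown.)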
        line_end = line_start + job_size

        self.doc_f.seek(self.line_offset_d[line_start])

        lines = []
        for _ in range(job_size):
            line = self.doc_f.readline()
            lines.append(line)

        cur_offset = self.doc_f.tell()
        try:
            expected_offset = self.line_offset_d[line_end]
            if cur_offset != expected_offset:
                print("cur_offset != expected_offset : {} != {}".format(
                    cur_offset, expected_offset))
        except (IndexError, KeyError) as exception_e:
            # line_end can point one past the last recorded offset; whether
            # line_offset_d is a list or a dict is not shown, so catch both.
            print(exception_e)

        tokenized_lines = tokenize_lines(self.tokenizer, lines)
        # Use a context manager so the output file is flushed and closed.
        with open(os.path.join(self.out_dir, str(job_id)), "w") as out_f:
            for row in tokenized_lines:
                out_f.write("\t".join(row) + "\n")


if __name__ == "__main__":
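    # 3213 + 1: job ids presumably run 0 through 3213 inclusive.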
    num_job = 3213 + 1
    runner = JobRunnerS(job_man_dir, num_job, "MSMARCO_tokens",
                        CorpusTokenizeWorker)
    runner.start()
Example #14
from dataset_specific.msmarco.common import load_query_group, load_candidate_doc_top50
from dataset_specific.msmarco.tokenize_worker import TokenizeWorker
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS

if __name__ == "__main__":
    split = "train"
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_top50(split)

    def factory(out_dir):
        return TokenizeWorker(split, query_group, candidate_docs, out_dir)

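    # Note: this and the following examples pass len(query_group) - 1 jobs,
    # while the earlier tokenize examples pass len(query_group); the
    # off-by-one convention is not explained in this listing.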
    runner = JobRunnerS(job_man_dir, len(query_group) - 1,
                        "MSMARCO_{}_top50_tokens".format(split), factory)
    runner.start()
Example #15
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS
from tlm.data_gen.adhoc_sent_tokenize import SeroFromTextEncoder
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker
from tlm.data_gen.msmarco_doc_gen.gen_worker_sent_level import PointwiseGenFromText
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource10doc

if __name__ == "__main__":
    split = "train"
    resource = ProcessedResource10doc(split)
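    # Assumed intent: up to four 512-token source windows concatenated into
    # a single 2048-token (512 * 4) sequence.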
    max_segments = 4
    total_sequence_length = 512 * 4
    src_window_size = 512
    basic_encoder = SeroFromTextEncoder(src_window_size,
                                        total_sequence_length,
                                        random_short=True,
                                        max_seg_per_doc=max_segments)

    generator = PointwiseGenFromText(resource, basic_encoder,
                                     total_sequence_length)

    def factory(out_dir):
        return MMDWorker(resource.query_group, generator, out_dir)

    runner = JobRunnerS(job_man_dir,
                        len(resource.query_group) - 1, "MMD_sero_train_B",
                        factory)
    runner.start()
Example #16
from epath import job_man_dir
from job_manager.job_runner_with_server import JobRunnerS
from tlm.data_gen.adhoc_sent_tokenize import FromTextEncoder
from tlm.data_gen.doc_encode_common import seg_selection_by_geo_sampling
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker
from tlm.data_gen.msmarco_doc_gen.gen_worker_sent_level import PairwiseGenFromText
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource10doc

if __name__ == "__main__":
    split = "train"
    resource = ProcessedResource10doc(split)
    max_seq_length = 512
    document_encoder = FromTextEncoder(max_seq_length, True,
                                       seg_selection_by_geo_sampling())
    generator = PairwiseGenFromText(resource, document_encoder, max_seq_length)

    def factory(out_dir):
        return MMDWorker(resource.query_group, generator, out_dir)

    runner = JobRunnerS(job_man_dir,
                        len(resource.query_group) - 1,
                        "MMD_{}_ss".format(split), factory)
    runner.start()