Example #1
0
def start_generate_jobs_for_train_val(generator: InstanceGenerator,
                                      name_prefix):
    """Run QCK instance generation for the train and val claim splits.

    The training claims are divided with ``split_7_3``; the QK candidates
    returned by ``load_qk_candidate_train`` are then partitioned by whether
    their query id belongs to the train or the val claims. One ``JobRunner``
    is started per partition, named ``name_prefix + "_train"`` and
    ``name_prefix + "_val"`` respectively.

    Args:
        generator: Instance generator handed to every ``QCKWorker``.
        name_prefix: Prefix for the two job names.
    """
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    train_claims, val_claims = split_7_3(get_claims_from_ids(claim_ids))

    # Claim ids are stringified because they are matched against query_id.
    train_query_ids = {str(claim['cId']) for claim in train_claims}
    val_query_ids = {str(claim['cId']) for claim in val_claims}

    all_candidates: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    train_candidates: List[QKUnit] = [
        unit for unit in all_candidates
        if unit[0].query_id in train_query_ids
    ]
    val_candidates: List[QKUnit] = [
        unit for unit in all_candidates
        if unit[0].query_id in val_query_ids
    ]

    def make_train_worker(out_dir):
        return QCKWorker(train_candidates, generator, out_dir)

    # NOTE(review): 378 / 162 presumably are the job counts for the 7:3
    # split of the training claims -- confirm against JobRunner's signature.
    train_runner = JobRunner(job_man_dir, 378, name_prefix + "_train",
                             make_train_worker)
    train_runner.start()

    print("Generate instances : val")

    def make_val_worker(out_dir):
        return QCKWorker(val_candidates, generator, out_dir)

    val_runner = JobRunner(job_man_dir, 162, name_prefix + "_val",
                           make_val_worker)
    val_runner.start()
Example #2
0
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface], writer,
        name_prefix):
    """Run CPPNC instance generation for the train and val claim splits.

    Loads the pickled ``"pc_train_a_passages"`` data, maps each claim id to
    its passages, builds a generator from that mapping via
    ``generator_functor`` and then starts one ``JobRunner`` per split, named
    ``name_prefix + "_train"`` and ``name_prefix + "_val"``.

    Args:
        generator_functor: Callable that receives the claim-id -> passages
            mapping and returns the generator used by the workers.
        writer: Passed through unchanged to every ``CPPNCWorker``.
        name_prefix: Prefix for the two job names.
    """
    # Split the training claims into train / val.
    claim_ids: List[int] = list(load_train_claim_ids())
    train_claims, val_claims = split_7_3(get_claims_from_ids(claim_ids))

    # The pickle holds a pair; only its first element is used here.
    entries, _ = load_from_pickle("pc_train_a_passages")
    passages_by_cid: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: passages
        for claim, passages in entries
    }
    generator = generator_functor(passages_by_cid)

    print("Generate instances : train")

    def make_train_worker(out_dir):
        return CPPNCWorker(train_claims, generator, writer, out_dir)

    # NOTE(review): 378 / 162 presumably are the job counts for the 7:3
    # split of the training claims -- confirm against JobRunner's signature.
    train_runner = JobRunner(job_man_dir, 378, name_prefix + "_train",
                             make_train_worker)
    train_runner.start()

    print("Generate instances : val")

    def make_val_worker(out_dir):
        return CPPNCWorker(val_claims, generator, writer, out_dir)

    val_runner = JobRunner(job_man_dir, 162, name_prefix + "_val",
                           make_val_worker)
    val_runner.start()
Example #3
0
def load_data_point_50_train_val(split):
    """Return candidate data points for one sub-split of the train claims.

    The claim ids of the "train" split are divided with ``split_7_3``.
    ``split == "train"`` selects the first part and ``split == "val"`` the
    second; any other value selects all ids and prints a notice.

    NOTE(review): the ids always come from the "train" split, even when the
    notice names a different split -- confirm this is intended.
    """
    all_ids = list(load_claim_ids_for_split("train"))
    train_ids, val_ids = split_7_3(all_ids)
    print(len(all_ids), len(train_ids), len(val_ids))

    # Default to every id; narrow down for the two known sub-splits.
    selected_ids = all_ids
    if split == "train":
        selected_ids = train_ids
    elif split == "val":
        selected_ids = val_ids
    else:
        print("Taking all claims in {}".format(split))

    return get_candidates(get_claims_from_ids(selected_ids), False)
Example #4
0
def load_claims_for_sub_split(sub_split) -> List[Dict]:
    """Load the claims belonging to ``sub_split``.

    ``"train"`` and ``"val"`` are the two parts produced by applying
    ``split_7_3`` to the training claims. Any other value is forwarded to
    ``load_claim_ids_for_split`` and the claims for those ids are returned.

    Args:
        sub_split: ``"train"``, ``"val"``, or a split name accepted by
            ``load_claim_ids_for_split``.

    Returns:
        The claims for the requested sub-split.
    """
    # Anything other than train/val is a regular split: load it directly.
    if sub_split not in ("train", "val"):
        split_ids: Iterable[int] = load_claim_ids_for_split(sub_split)
        return get_claims_from_ids(split_ids)

    train_ids: List[int] = list(load_train_claim_ids())
    train, val = split_7_3(get_claims_from_ids(train_ids))
    # The guard above leaves only "train" and "val" here.
    return train if sub_split == "train" else val
Example #5
0
def main():
    """Request KDP evaluation jobs for a few validation QK candidates.

    Filters the training QK candidates down to those whose query id belongs
    to the val part of the training claims, prints the query of the first
    one, and submits ``request_kdp_eval`` for the candidates at positions
    1 through 8, printing each query id and the returned job id.
    """
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    _, val_claims = split_7_3(get_claims_from_ids(claim_ids))

    # Claim ids are stringified because they are matched against query_id.
    val_query_ids = {str(claim['cId']) for claim in val_claims}

    val_candidates = [
        unit for unit in load_qk_candidate_train()
        if unit[0].query_id in val_query_ids
    ]

    first_query = val_candidates[0][0]
    print(first_query)

    # The first candidate is only printed above; jobs start from index 1.
    for query, kdp_list in val_candidates[1:9]:
        job_id = request_kdp_eval(kdp_list)
        print('qid:', query.query_id)
        print('job_id', job_id)
Example #6
0
def train_split():
    """Return all training claims together with their val part.

    The training claims are loaded and divided with ``split_7_3``; the
    function returns the pair ``(claims, val)``.

    NOTE(review): the first element is the *full* claim list, not the train
    part of the split, which is discarded. Given the function name this may
    be unintended -- confirm with callers before changing it.
    """
    claim_ids: List[int] = list(load_train_claim_ids())
    all_claims = get_claims_from_ids(claim_ids)
    _, val_claims = split_7_3(all_claims)
    return all_claims, val_claims