def start_generate_jobs_for_train_val(generator: InstanceGenerator, name_prefix):
    """Start QCK instance-generation jobs for the train and val claim splits.

    The training claim ids are loaded, resolved to claims and divided with
    ``split_7_3``.  The QK candidates returned by ``load_qk_candidate_train``
    are then partitioned by whether their query id belongs to the train or
    the val claims, and one ``JobRunner`` is started for each partition.

    Args:
        generator: Instance generator handed to every ``QCKWorker``.
        name_prefix: Job-name prefix; ``"_train"`` / ``"_val"`` is appended.
    """
    def launch(qk_units, num_jobs, suffix):
        # Each call creates its own closure, so the train and val runners
        # never share a candidate list.
        def worker_factory(out_dir):
            return QCKWorker(qk_units, generator, out_dir)

        job_runner = JobRunner(job_man_dir, num_jobs, name_prefix + suffix, worker_factory)
        job_runner.start()

    # claim ids split to train/val
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    train_claims, val_claims = split_7_3(get_claims_from_ids(claim_ids))
    # cIds are stringified before being matched against query_id below.
    train_cids = {str(claim['cId']) for claim in train_claims}
    val_cids = {str(claim['cId']) for claim in val_claims}
    all_candidates: List[QKUnit] = load_qk_candidate_train()

    print("Generate instances : train")
    train_candidates: List[QKUnit] = [
        unit for unit in all_candidates if unit[0].query_id in train_cids
    ]
    val_candidates = [
        unit for unit in all_candidates if unit[0].query_id in val_cids
    ]
    # 378 / 162 are passed as JobRunner's second argument
    # (presumably the number of jobs per split -- confirm against JobRunner).
    launch(train_candidates, 378, "_train")

    print("Generate instances : val")
    launch(val_candidates, 162, "_val")
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]], CPPNCGeneratorInterface],
        writer,
        name_prefix):
    """Start CPPNC instance-generation jobs for the train and val claim splits.

    The training claims are divided with ``split_7_3``.  Passage data is read
    from the ``"pc_train_a_passages"`` pickle, turned into a mapping keyed by
    each claim's ``cId``, and passed to ``generator_functor`` to build the
    generator shared by both job runners.

    Args:
        generator_functor: Called with the cId -> passages mapping; returns
            the generator given to every ``CPPNCWorker``.
        writer: Passed through unchanged to every ``CPPNCWorker``.
        name_prefix: Job-name prefix; ``"_train"`` / ``"_val"`` is appended.
    """
    def launch(claim_subset, num_jobs, suffix):
        # A fresh closure per call keeps each runner bound to its own claims.
        def worker_factory(out_dir):
            return CPPNCWorker(claim_subset, generator, writer, out_dir)

        job_runner = JobRunner(job_man_dir, num_jobs, name_prefix + suffix, worker_factory)
        job_runner.start()

    # claim ids split to train/val
    claim_ids: List[int] = list(load_train_claim_ids())
    train_claims, val_claims = split_7_3(get_claims_from_ids(claim_ids))

    # The pickle holds a pair; only its first element is used here.
    entries, _ = load_from_pickle("pc_train_a_passages")
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {}
    for claim, passages in entries:
        cid_to_passages[claim['cId']] = passages
    generator = generator_functor(cid_to_passages)

    print("Generate instances : train")
    # 378 / 162 are passed as JobRunner's second argument
    # (presumably the number of jobs per split -- confirm against JobRunner).
    launch(train_claims, 378, "_train")

    print("Generate instances : val")
    launch(val_claims, 162, "_val")
def load_data_point_50_train_val(split):
    """Return candidate data points for a sub-split of the "train" claim ids.

    The "train" claim ids are divided with ``split_7_3``.  ``"train"`` and
    ``"val"`` select the respective part; any other value selects every id
    and prints a notice.

    Args:
        split: ``"train"``, ``"val"``, or anything else for all ids.

    Returns:
        Whatever ``get_candidates(claims, False)`` returns for the selection.
    """
    all_ids = list(load_claim_ids_for_split("train"))
    ids_train, ids_val = split_7_3(all_ids)
    print(len(all_ids), len(ids_train), len(ids_val))

    # Start from the full id list and narrow it for the two known splits.
    chosen_ids = all_ids
    if split == "train":
        chosen_ids = ids_train
    elif split == "val":
        chosen_ids = ids_val
    else:
        print("Taking all claims in {}".format(split))

    selected_claims = get_claims_from_ids(chosen_ids)
    return get_candidates(selected_claims, False)
def load_claims_for_sub_split(sub_split) -> List[Dict]:
    """Load the claims belonging to ``sub_split``.

    ``"train"`` and ``"val"`` are both carved out of the training claims via
    ``split_7_3``; any other value is forwarded to
    ``load_claim_ids_for_split`` and all of its claims are returned.

    Args:
        sub_split: ``"train"``, ``"val"``, or a split name understood by
            ``load_claim_ids_for_split``.

    Returns:
        The list of claims for the requested sub-split.
    """
    # Anything other than train/val maps directly onto a stored split.
    if sub_split not in ["train", "val"]:
        split_ids: Iterable[int] = load_claim_ids_for_split(sub_split)
        return get_claims_from_ids(split_ids)

    train_ids: List[int] = list(load_train_claim_ids())
    train_part, val_part = split_7_3(get_claims_from_ids(train_ids))
    if sub_split == "train":
        return train_part
    if sub_split == "val":
        return val_part
    # Not expected to be reached: the guard above admits only "train"/"val".
    assert False
def main():
    """Request KDP evaluation for a few validation QK candidates.

    Keeps the QK candidates whose query id belongs to the val part of the
    training claims, prints the query of the first one, then calls
    ``request_kdp_eval`` for the candidates at positions 1 through 8 and
    prints each query id with the returned job id.
    """
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    _, val_claims = split_7_3(get_claims_from_ids(claim_ids))
    # cIds are stringified before being matched against query_id below.
    val_cids = {str(claim['cId']) for claim in val_claims}

    all_candidates: List[QKUnit] = load_qk_candidate_train()
    val_candidates = [
        unit for unit in all_candidates if unit[0].query_id in val_cids
    ]

    first_query = val_candidates[0][0]
    print(first_query)

    # The first candidate is only printed; requests start at index 1.
    for query, kdp_list in val_candidates[1:9]:
        job_id = request_kdp_eval(kdp_list)
        print('qid:', query.query_id)
        print('job_id', job_id)
def train_split():
    """Return the ``(train, val)`` partition of the training claims.

    The training claim ids are loaded, resolved to claims with
    ``get_claims_from_ids`` and divided by ``split_7_3``.

    Returns:
        A 2-tuple ``(train, val)`` holding the two parts produced by
        ``split_7_3`` (presumably a 70/30 split -- confirm against the helper).
    """
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    train, val = split_7_3(claims)
    # Return the train part rather than the unsplit claim list, so both
    # returned values come from the same partition.
    return train, val