save_path = config['save_path'] if top_k == 50: candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split) elif top_k == 1000: candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split) else: assert False per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path) all_ranked_list_entries = [] for query_id, d in per_query_infos.items(): scorer = Scorer(d, True) candidates: List[QCKCandidate] = candidate_d[query_id] entries = [] for c in candidates: e = c.id, scorer.score(c.text) entries.append(e) entries.sort(key=get_second, reverse=True) ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id) all_ranked_list_entries.extend(ranked_list_entries) write_trec_ranked_list_entry(all_ranked_list_entries, save_path) if __name__ == "__main__": run_func_with_config(main)
    # NOTE(review): this span starts mid-function; `queries`, `q_id_to_job_id`,
    # `score_d`, `top_n`, `preload_man`, `all_doc_parts`, `out_qk` and `config`
    # are bound in the unseen part of the enclosing function.
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        # NOTE(review): sorts score_d[job_id] in place (shared list) — confirm
        # later readers expect it sorted, or copy before sorting.
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        # Warm the document cache before iterating the docs.
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)
        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk


def gen_overlap(config):
    """Generate query-knowledge candidate units from a ranked list plus
    document scores, and persist them with save_to_pickle under
    config['save_name']."""
    split = config['split']
    q_res_path = config['q_res_path']
    save_name = config['save_name']
    doc_score_path = config['doc_score_path']
    candidate: List[QKUnit] = qk_candidate_gen(q_res_path, doc_score_path, split, config2())
    save_to_pickle(candidate, save_name)


if __name__ == "__main__":
    run_func_with_config(gen_overlap)
url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier=" rows = [] header = ["claim" ] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)] rows.append(header) for query_id in keys[:10]: entries: List[SimpleRankedListEntry] = ranked_list[query_id] entries = entries[:num_doc_per_query * 3] doc_ids: List[str] = remove_duplicate(list([e.doc_id for e in entries])) claim = claim_d[int(query_id)] urls = [] for doc_id in doc_ids[:num_doc_per_query]: url = url_prefix + doc_id urls.append(url) assert len(urls) == num_doc_per_query row = [claim] + urls rows.append(row) save_path = os.path.join(output_path, "claim10_train.csv") f = open(save_path, "w") csv_writer = csv.writer(f) csv_writer.writerows(rows) if __name__ == "__main__": run_func_with_config(write_csv)
return out_d print("Collecting token level scores") per_query_infos: Dict[str, Dict[WordAsID, np.array]] = {} ticker = TimeEstimator(len(grouped)) for key, value in grouped.items(): per_query_infos[key] = average_scores(value) ticker.tick() return per_query_infos def collect_and_save_score(config): info_path = config['info_path'] pred_path = config['pred_path'] save_path = config['save_path'] info = load_combine_info_jsons(info_path, qk_convert_map, False) predictions: List[Dict] = join_prediction_with_info(pred_path, info, ['data_id', 'logits', 'input_ids', 'label_ids'], ) outputs: Iterable[QKTokenLevelOutEntry] = map(QKTokenLevelOutEntry.from_dict, predictions) per_query_infos: Dict[str, Dict[WordAsID, np.array]] = group_average_per_query(outputs) pickle.dump(per_query_infos, open(save_path, "wb")) if __name__ == "__main__": run_func_with_config(collect_and_save_score)
max_seq_length = 512 pos_doc_ids = set( [l.strip() for l in open(pos_doc_list_path, "r").readlines()]) doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path) insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length) train_size = int(0.9 * len(insts)) train_insts = insts[:train_size] val_insts = insts[train_size:] val_pos_insts = list([i for i in val_insts if i.label == 1]) val_neg_insts = list([i for i in val_insts if not i.label]) print("num pos inst in val", len(val_pos_insts)) if balance_test: val_neg_insts = val_neg_insts[:len(val_pos_insts)] val_insts = val_pos_insts + val_neg_insts tokenizer = get_tokenizer() def encode_fn(inst: Instance) -> OrderedDict: return encode_w_data_id(tokenizer, max_seq_length, inst) write_records_w_encode_fn(save_path + "train", encode_fn, train_insts) write_records_w_encode_fn(save_path + "val", encode_fn, val_insts) if __name__ == "__main__": run_func_with_config(make_training_data)