# Example #1
# 0
    # NOTE(review): fragment of main(config) — the enclosing `def` and the
    # definitions of top_k / split / word_prob_path / run_name are outside
    # this view.
    save_path = config['save_path']
    # Candidate pools exist for exactly two retrieval depths (50 / 1000);
    # any other top_k is a configuration error.
    if top_k == 50:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split)
    elif top_k == 1000:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split)
    else:
        assert False

    # Per-query word-level score tables produced by an earlier pipeline step.
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)

    all_ranked_list_entries = []

    for query_id, d in per_query_infos.items():
        # A fresh Scorer is built per query from that query's word table.
        # NOTE(review): the meaning of the `True` flag is not visible here — confirm.
        scorer = Scorer(d, True)
        candidates: List[QCKCandidate] = candidate_d[query_id]

        # Score every candidate's text, then rank descending by score.
        entries = []
        for c in candidates:
            e = c.id, scorer.score(c.text)
            entries.append(e)
        entries.sort(key=get_second, reverse=True)

        ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id)
        all_ranked_list_entries.extend(ranked_list_entries)

    # Persist all queries' rankings as one TREC-format ranked-list file.
    write_trec_ranked_list_entry(all_ranked_list_entries, save_path)


if __name__ == "__main__":
    # Script entry point: run_func_with_config presumably loads a config
    # (e.g. from argv) and invokes main with it — confirm against its definition.
    run_func_with_config(main)
    # NOTE(review): fragment of a QK-candidate generation function — its
    # enclosing `def` (and queries / q_id_to_job_id / score_d / top_n /
    # preload_man / config / all_doc_parts / out_qk) are outside this view.
    # The indentation placing it under the __main__ guard above looks like an
    # extraction artifact: `return` is invalid at module level.
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        # Rank this query's scored docs best-first.
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        # Bulk-preload the tokenized docs before iterating them.
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        # Slice each doc into window/step-sized parts (the trailing 20 is an
        # undocumented limit/parameter of iterate_document_parts — confirm).
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk


def gen_overlap(config):
    """Build overlap-based QK candidate units and persist them via pickle.

    Reads 'split', 'q_res_path', 'save_name' and 'doc_score_path' from
    *config*, delegates generation to qk_candidate_gen, and saves the
    result under the configured pickle name.
    """
    split_name = config['split']
    ranked_list_path = config['q_res_path']
    output_name = config['save_name']
    score_path = config['doc_score_path']
    qk_units: List[QKUnit] = qk_candidate_gen(
        ranked_list_path, score_path, split_name, config2())
    save_to_pickle(qk_units, output_name)


if __name__ == "__main__":
    # Script entry point: run gen_overlap with a loaded config.
    run_func_with_config(gen_overlap)
# Example #3
# 0
    # NOTE(review): fragment of write_csv(config) — the enclosing `def` and
    # the definitions of keys / ranked_list / claim_d / num_doc_per_query /
    # output_path are outside this view.
    # Builds a CSV with one row per claim: the claim text followed by
    # num_doc_per_query document-viewer URLs.
    url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier="
    rows = []

    # Header row: "claim", then url1..urlN.
    header = ["claim"
              ] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)]
    rows.append(header)
    # Only the first 10 queries are exported (hence "claim10" in the filename).
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        # Over-fetch 3x before de-duplication so enough unique doc ids remain.
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id
                                                    for e in entries]))
        claim = claim_d[int(query_id)]
        urls = []
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            urls.append(url)

        # NOTE(review): this fails when de-duplication leaves fewer than
        # num_doc_per_query ids; it is also stripped under `python -O`.
        assert len(urls) == num_doc_per_query
        row = [claim] + urls
        rows.append(row)

    save_path = os.path.join(output_path, "claim10_train.csv")
    # NOTE(review): handle is never closed — output relies on interpreter
    # cleanup to flush; prefer `with open(save_path, "w", newline="")`
    # (csv.writer expects newline="" per the csv module docs).
    f = open(save_path, "w")

    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)

if __name__ == "__main__":
    # Script entry point: run write_csv with a loaded config.
    run_func_with_config(write_csv)
# Example #4
# 0
        # Tail of a nested helper — its `def` is outside this view.
        return out_d

    # Average the token-level scores for each query group; the ticker only
    # reports progress.
    print("Collecting token level scores")
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = {}
    ticker = TimeEstimator(len(grouped))
    for key, value in grouped.items():
        per_query_infos[key] = average_scores(value)
        ticker.tick()

    return per_query_infos


def collect_and_save_score(config):
    """Join model predictions with their info entries, aggregate token-level
    scores per query, and pickle the result.

    Config keys:
        info_path: directory/path of the info jsons (combined via qk_convert_map).
        pred_path: path of the raw prediction file.
        save_path: output pickle path.
    """
    info_path = config['info_path']
    pred_path = config['pred_path']
    save_path = config['save_path']

    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions: List[Dict] = join_prediction_with_info(pred_path,
                                                        info,
                                                        ['data_id', 'logits', 'input_ids', 'label_ids'],
                                                        )
    # Lazily convert each joined record into a typed output entry.
    outputs: Iterable[QKTokenLevelOutEntry] = map(QKTokenLevelOutEntry.from_dict, predictions)

    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = group_average_per_query(outputs)
    # Fix: the original passed an anonymous open(save_path, "wb") handle to
    # pickle.dump and never closed it; a context manager guarantees the file
    # is flushed and closed even if dump raises.
    with open(save_path, "wb") as out_f:
        pickle.dump(per_query_infos, out_f)


if __name__ == "__main__":
    # Script entry point: run collect_and_save_score with a loaded config.
    run_func_with_config(collect_and_save_score)
# Example #5
# 0
    # NOTE(review): fragment of make_training_data(config) — the enclosing
    # `def` and the definitions of pos_doc_list_path / q_res_path / save_path /
    # balance_test / generate / Instance are outside this view.
    max_seq_length = 512

    # Positive doc ids come from a newline-delimited list file.
    # NOTE(review): the file handle is never closed explicitly (relies on GC);
    # prefer a `with` block.
    pos_doc_ids = set(
        [l.strip() for l in open(pos_doc_list_path, "r").readlines()])
    doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path)

    insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length)

    # 90/10 train/validation split, in generation order (no shuffling here).
    train_size = int(0.9 * len(insts))
    train_insts = insts[:train_size]
    val_insts = insts[train_size:]

    # NOTE(review): mixed label tests (`== 1` vs truthiness) — equivalent only
    # if label is strictly 0/1; confirm upstream.
    val_pos_insts = list([i for i in val_insts if i.label == 1])
    val_neg_insts = list([i for i in val_insts if not i.label])
    print("num pos inst in val", len(val_pos_insts))
    # Optionally down-sample negatives so the validation split is balanced.
    if balance_test:
        val_neg_insts = val_neg_insts[:len(val_pos_insts)]
    val_insts = val_pos_insts + val_neg_insts

    tokenizer = get_tokenizer()

    def encode_fn(inst: Instance) -> OrderedDict:
        # Encode one instance into record features, keeping its data id.
        return encode_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(save_path + "train", encode_fn, train_insts)
    write_records_w_encode_fn(save_path + "val", encode_fn, val_insts)


if __name__ == "__main__":
    # Script entry point: run make_training_data with a loaded config.
    run_func_with_config(make_training_data)