Example #1
def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
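    # For each query: sort its scored documents, keep the top_n doc ids,
    # preload the tokenized docs, and slice them into document parts (KDPs).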
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
Example #2
def sentence_payload_gen(q_res_path: str, top_n, data_id_man: DataIDManager):
    print("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    qid_list = list(ranked_list.keys())
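    # NOTE: only the first 10 queries are kept; this looks like a debugging cap.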
    qid_list = qid_list[:10]
    ranked_list = {k: ranked_list[k] for k in qid_list}
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[str, bool, int]] = []

    def enum_sentence(tokens) -> Iterator[str]:
        text = " ".join(tokens)
        sents = sent_tokenize(text)
        yield from sents

    ticker = TimeEstimator(len(ranked_list))
    for qid in ranked_list:
        q_res: List[SimpleRankedListEntry] = ranked_list[qid]
        docs = iterate_docs(q_res, top_n)

        for doc in docs:
            for sent_idx, sent in enumerate(enum_sentence(doc.tokens)):
                info = {
                    'doc_id': doc.doc_id,
                    'sent_idx': sent_idx,
                    'sentence': sent
                }
                data_id = data_id_man.assign(info)
                e = sent, True, data_id
                entries.append(e)

        ticker.tick()
    return entries
Example #3
def a_relevant_candidate(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    all_passages = []
    entries = []

    n_passages = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_text = c['text']

        def get_passage_score(dummy):
            return 0

        passages: List[Tuple[List[str], float]] = iterate_passages(
            q_res, top_n, get_passage_score)
        n_passages += len(passages)
        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs ".format(len(claims), all_docs))
    data = entries, all_passages
    save_to_pickle(data, save_name)
Example #4
def main(config):
    # select claims
    # load relevant documents
    # remove duplicate
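    # Render each claim's top documents as an HTML table of clickable links.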
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id
                                                    for e in entries]))
        claim = claim_d[int(query_id)]
        s = "{} : {}".format(query_id, claim)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)
Example #5
def main(config):
    # select claims
    # load relevant documents
    # remove duplicate
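    # Render each query's top documents as an HTML table of clickable links,
    # with query texts loaded from a JSON file.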
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_text_d = json.load(open(config['query_text_d']))
    save_name = config['save_path']

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = list([e.doc_id for e in entries])
        query_text = query_text_d[query_id]
        s = "{} : {}".format(query_id, query_text)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer(save_name)
    html.write_table(rows)
Example #6
def insert_ranked_list_from_path(file_path: FilePath, q_config_id: str):
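    # Store each query's ranked list under a composite "{query_id}_{q_config_id}" key.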
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)

    for query_id in ranked_list:
        q_res_id: QueryResultID = QueryResultID("{}_{}".format(
            query_id, q_config_id))
        insert_ranked_list(q_res_id, ranked_list[query_id])
Example #7
def load_all_ranked_list(ranked_list_save_root, disk_name):
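    # Merge the per-index ranked-list files of one disk into a single dict.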
    d = {}
    for idx in train_query_indices:
        file_name = "{}_{}.txt".format(disk_name, idx)
        file_path = os.path.join(ranked_list_save_root, file_name)
        d.update(load_galago_ranked_list(file_path))

    return d
Example #8
 def __init__(self, q_res_path, config, top_n):
     self.config = config
     self.top_n = top_n
     print("loading ranked list")
     self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
     print("Ranked list loaded for {} queries".format(len(self.ranked_list)))
     print("Pre loading docs")
     preload_docs(self.ranked_list, top_n)
Example #9
 def __init__(self, q_res_path, top_n, window_size):
     self.robust_tokens: Dict[str,
                              List[str]] = load_robust_tokens_for_predict()
     self.ranked_list: Dict[
         str,
         List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
     self.top_n = top_n
     self.window_size = window_size
Example #10
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Example #11
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Example #12
    def __init__(self, doc_ids, judgement_path, query):
        print("DataSample init")
        # load doc_list
        # self.doc_ids = load_doc_list(doc_id_path)
        self.q_group = load_galago_ranked_list(judgement_path)

        # load query-judgement
        self.doc_ids = list(doc_ids)
        self.query = query
        self.n_sample_ranked = 5
        self.n_sample_not_ranked = 3
Example #13
 def __init__(self, q_res_path, query_d: Dict[int, str], out_dir):
     self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
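     # Sort query ids so that the job_id -> query_id mapping is deterministic.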
     query_ids = list(self.ranked_list.keys())
     query_ids.sort()
     self.job_id_to_q_id = {job_id: q_id for job_id, q_id in enumerate(query_ids)}
     self.query_d: Dict[int, str] = query_d
     self.tokenizer = get_tokenizer()
     self.max_seq_length = 512
     self.out_dir = out_dir
     self.info_out_dir = out_dir + "_info"
     exist_or_mkdir(self.info_out_dir)
Example #14
def main():
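    # Truncate every query's ranked list to its top_n entries and save the result.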
    file_path = sys.argv[1]
    top_n = int(sys.argv[2])
    save_path = sys.argv[3]
    ranked_list_d: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)

    def get_head(l: List):
        return l[:top_n]

    new_ranked_list = dict_value_map(get_head, ranked_list_d)
    write_ranked_list_from_s(new_ranked_list, save_path)
Example #15
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example #16
def load_ranked_list(relevance_list_path):
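    # Keep only the first query's ranked list from each file, keyed by file name.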
    all_ranked_list = {}
    for file_path in get_dir_files(relevance_list_path):
        file_name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)

        queries = ranked_list_d.keys()
        any_query = list(queries)[0]
        ranked_list = ranked_list_d[any_query]
        all_ranked_list[file_name] = ranked_list
    return all_ranked_list
Example #17
def show_missing():
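    # Report which of the top-100 docs are missing for the first 10 train claims.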
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    preload_docs(ranked_list, claims, top_n)
    report_missing(claims, ranked_list, top_n)
Example #18
def main():
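    # Correlate LM-based passage scores with per-document subjectivity statistics.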
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert isinstance(tokens[0], str)
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Example #19
def main(config):
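    # Launch a JobRunner that processes the queries of the ranked list with
    # KDPParaWorker instances.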
    def get_worker(out_dir):
        writer = Writer(max_seq_length=config['max_seq_length'],
                        reverse=config['reverse'])
        return KDPParaWorker(config, writer, out_dir)

    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    num_job = len(ranked_list) - 1

    runner = JobRunner(job_man_dir, num_job, config['job_name'], get_worker)
    runner.auto_runner()
Example #20
def verify_ranked_list(out_path, queries):
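    # Warn when the saved ranked list covers fewer queries than expected and
    # print the texts of the missing queries.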
    n_query = len(queries)
    file_name = os.path.basename(out_path)
    ranked_list_d = load_galago_ranked_list(out_path)
    if len(ranked_list_d) < n_query:
        print("{} has only {} queries, expected {}".format(
            file_name, len(ranked_list_d), n_query))
        found_query_ids = set(ranked_list_d.keys())
        queries_d = dict(lmap(lambda x: (x["number"], x["text"]), queries))
        expected_query_ids = lmap(lambda x: x["number"], queries)
        not_found_query_ids = list(
            [t for t in expected_query_ids if t not in found_query_ids])
        for query_id in not_found_query_ids:
            print("Not found: ", queries_d[query_id])
Example #21
def main():
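    # Save the filtered ranked lists of the test queries as a baseline run.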
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    ranked_list_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/CLEF_eHealth_working/ranked_list_filtered"
    )
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()

    new_d = {}
    for query in test_queries:
        new_d[query.qid] = ranked_list[query.qid]

    save_path = os.path.join(out_dir, 'test_baseline.list')
    write_ranked_list_from_d(new_d, save_path)
Example #22
def qk_candidate_gen(q_res_path: str, queries: List[QCKQuery], top_n, config) -> List[Tuple[QCKQuery, List[KDP]]]:
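    # For each query, split its top_n ranked docs into document parts (KDPs).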
    print("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        q_res: List[SimpleRankedListEntry] = ranked_list[q.query_id]
        doc_part_list = enum_doc_parts_from_ranked_list(config, q_res, top_n)
        all_doc_parts += len(doc_part_list)
        entries.append((q, doc_part_list))
        ticker.tick()
    return entries
Example #23
 def __init__(
     self,
     config,
     writer,
     out_dir,
 ):
     q_res_path = config['q_res_path']
     self.top_n = config['top_n']
     self.num_sent = config['num_sent']
     self.max_seq_length = config['max_seq_length']
     self.ranked_list: Dict[
         str,
         List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
     self.cids = lmap(int, self.ranked_list.keys())
     self.pid_dict = first_pid_as_rep()
     self.out_dir = out_dir
     self.writer = writer
Example #24
def work(q_res_path, save_name):
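    # Build a sliding-window term co-occurrence counter per claim from the
    # tokens of its ranked documents.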
    ranked_list_d = load_galago_ranked_list(q_res_path)
    window_size = 10
    stemmer = CacheStemmer()
    print(q_res_path)

    ticker = TimeEstimator(len(ranked_list_d))
    r = []
    for claim_id, ranked_list in ranked_list_d.items():
        ticker.tick()
        doc_ids = list([e.doc_id for e in ranked_list])
        print("1")
        counter = build_co_occurrence(get_tokens_form_doc_ids(doc_ids), window_size, stemmer)
        print("2")
        r.append((claim_id, counter))

    save_to_pickle(r, save_name)
Example #25
def load_multiple_ranked_list(dir_path, get_key_from_name):
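    # Group per-file ranked lists by a key derived from the file name, then
    # merge each group into a single ranked list.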
    files = get_dir_files(dir_path)

    data = []
    for file_path in files:
        name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        for query, ranked_list in ranked_list_d.items():
            data.append((name, ranked_list))

    new_d = {}
    key_fn = lambda x: get_key_from_name(x[0])
    for key, sub_data in group_by(data, key_fn).items():
        ranked_list = right(sub_data)
        new_d[key] = merge_ranked_list_list(ranked_list)

    return new_d
Example #26
def main():
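    # Write train and test tfrecord files from the BM25 ranked list and qrels.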
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    train_save_path = pjoin(out_dir, "tfrecord_train")
    test_save_path = pjoin(out_dir, "tfrecord_test")
    ranked_list_path = FilePath(
        os.path.join(output_path, "eHealth", "bm25_filtered.list"))
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()

    train_info = write_tfrecord(ranked_list, train_queries, qrels,
                                train_save_path)
    save_to_pickle(train_info, "eHealth_train_info")
    test_info = write_tfrecord(ranked_list, test_queries, qrels,
                               test_save_path)
    save_to_pickle(test_info, "eHealth_test_info")
Example #27
def do_datagen(d_ids, q_res_path, save_name):
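    # Generate and write training records for each claim LM from its ranked list.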
    claims: List[Dict] = get_claims_from_ids(d_ids)
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claim_lms = build_gold_lms(claims)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    alpha = 0.1
    max_seq_length = 512
    generator = get_generator(max_seq_length, bg_lm, alpha)
    out_dir = os.path.join(env_data_dir, save_name)
    exist_or_mkdir(out_dir)
    for claim_lm in claim_lms:
        print(claim_lm.cid)
        records: List[Record] = generator(claim_lm,
                                          ranked_list[str(claim_lm.cid)])
        output_path = os.path.join(out_dir, str(claim_lm.cid))
        write_records(records, max_seq_length, output_path)
Example #28
def main():
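    # Map each cord_uid to its title + abstract text and generate tfrecords
    # from the BM25 ranked list.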
    queries = load_queries()
    bm25_path = pjoin(cord_working_dir, "youngwoo_bm25_query")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(bm25_path)
    out_path = os.path.join(cord_working_dir, "tfrecord_2_4")
    max_seq_length = 512

    meta_data: List[Dict] = read_csv_as_dict(meta_data_path)

    text_dict = {}
    for e in meta_data:
        text_dict[e[str_cord_uid]] = e[str_title] + " " + e[str_abstract]

    def get_text_from_doc_id(doc_id: str) -> str:
        return text_dict[doc_id]

    data_info_save_name = "data_info_save"
    tf_record_gen(ranked_list, queries, get_text_from_doc_id, out_path, max_seq_length, data_info_save_name)
Example #29
def work():
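    # Collect the unique doc ids appearing in the top 100 of any query and
    # write them to a text file.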
    q_config_id = Q_CONFIG_ID_BM25_UKP
    ranked_list_save_root = get_ranked_list_save_dir(q_config_id)
    doc_ids = set()
    ticker = TimeEstimator(num_query_file)
    for i in range(num_query_file):
        file_name = FileName("{}_{}.txt".format(index_name_list[0], str(i)))
        ranked_list_path = pjoin(ranked_list_save_root, file_name)
        rl: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(
            ranked_list_path)

        for key, value in rl.items():
            for entry in value[:100]:
                doc_ids.add(entry.doc_id)
        ticker.tick()

    f = open("{}_uniq_100".format(q_config_id), "w")
    for doc_id in doc_ids:
        f.write("{}\n".format(doc_id))
    f.close()
Example #30
def write_csv(config):
    # select claims
    # load relevant documents
    # remove duplicate
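    # Write one CSV row per claim: the claim text followed by its top-10 document URLs.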
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier="
    rows = []

    header = ["claim"
              ] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)]
    rows.append(header)
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id
                                                    for e in entries]))
        claim = claim_d[int(query_id)]
        urls = []
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            urls.append(url)

        assert len(urls) == num_doc_per_query
        row = [claim] + urls
        rows.append(row)

    save_path = os.path.join(output_path, "claim10_train.csv")
    with open(save_path, "w", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(rows)