Example #1
    def generate(self, qids) -> Iterator[SRPerQuery]:
        for qid in qids:
            query_tokens = self.resource.get_q_tokens(qid)
            # room left for the document; the 3 is presumably [CLS] + 2x [SEP]
            # in the BERT pair layout
            content_len = self.max_seq_length - 3 - len(query_tokens)
            try:
                docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
                docs_d = {d.doc_id: d for d in docs}

                sr_per_query_doc_list = []
                for doc_id in self.resource.get_doc_for_query_d()[qid]:
                    label = self.resource.get_label(qid, doc_id)
                    try:
                        doc = docs_d[doc_id]
                        segs: List[SegmentRepresentation] = self.get_segs(
                            query_tokens, doc, content_len)
                        sr_per_query_doc = SRPerQueryDoc(doc_id, segs, label)
                        sr_per_query_doc_list.append(sr_per_query_doc)
                    except KeyError:
                        pass

                sr_per_query = SRPerQuery(qid, sr_per_query_doc_list)
                yield sr_per_query
            except KeyError as e:
                # doc_id may be unbound or stale here if get_doc_for_query_d()
                # had no entry for qid, so report the query id instead
                print(e)
                print(qid)
            except FileNotFoundError as e:
                print(e)
                print(qid)
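
All of the examples on this page share one call pattern. The sketch below spells out the interface they appear to assume; the field names, the callback signature, and the default value are inferred from usage in these snippets, not taken from the library itself.

from dataclasses import dataclass
from typing import Callable, List, Optional

@dataclass
class MSMarcoDoc:
    # fields inferred from how the examples access them
    doc_id: str
    title: str
    body: str

def load_per_query_docs(
        qid: str,
        empty_doc_fn: Optional[Callable[[str, str], None]] = None,
) -> List[MSMarcoDoc]:
    # Assumed contract: load the pre-fetched documents for one query and
    # call empty_doc_fn(query_id, doc_id) for each document that could not
    # be loaded. Example #1 suggests it raises FileNotFoundError when no
    # per-query file exists for qid.
    ...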
Example #2
    def work(self, job_id):
        qid_list = self.query_group[job_id]
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        # Called back by load_per_query_docs for each document it could not
        # load; classifies the miss as relevant (in qrels) or not.
        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        for qid in qid_list:
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            if qid not in self.candidate_docs_d:
                continue

            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    tokens_d[d.doc_id] = []

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid, tokens_d, target_docs)
                not_found_docs = [
                    doc_id for doc_id in target_docs if doc_id not in tokens_d
                ]
                print("{} of {} not found: {}".format(len(not_found_docs),
                                                      len(target_docs),
                                                      not_found_docs))
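
A stand-alone use of the empty_doc_fn hook, assuming the signature sketched above, that records the missing ids instead of only counting them:

missing_doc_ids = []

def record_missing(query_id, doc_id):
    # collect the ids of documents load_per_query_docs could not load
    missing_doc_ids.append((query_id, doc_id))

docs = load_per_query_docs(qid, record_missing)  # qid as in the loops above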
Example #3
    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            text_d = {}
            bert_tokens_d = {}
            stemmed_tokens_d = {}

            for d in docs:
                if d.doc_id in target_docs:
                    title = d.title
                    title = crop_to_space(title, self.max_title_length)

                    body_sents = sent_tokenize(d.body)
                    new_body_sents = self.resplit_body_sents(body_sents)
                    text_d[d.doc_id] = (title, new_body_sents)

                    # tokenize the same text two ways: BERT wordpieces
                    # and stemmed tokens
                    for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)
                    ]:
                        title_tokens = tokenize_fn(title)
                        body_tokens_list = lmap(tokenize_fn, new_body_sents)
                        save_dict[d.doc_id] = (title_tokens, body_tokens_list)

            todo = [
                (text_d, self.text_dir_name),
                (bert_tokens_d, self.bert_tokens_dir_name),
                (stemmed_tokens_d, self.stemmed_tokens_dir_name),
            ]

            for data_d, dir_name in todo:
                save_path = os.path.join(self.out_dir, dir_name, str(qid))
                with open(save_path, "wb") as f:
                    pickle.dump(data_d, f)
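
Reading these per-query pickles back is the mirror image of the save loop at the end of work(). A minimal sketch, assuming the directory layout used above:

import os
import pickle

def load_per_query_dict(out_dir, dir_name, qid):
    # one pickle per query id, as written by work() above
    path = os.path.join(out_dir, dir_name, str(qid))
    with open(path, "rb") as f:
        return pickle.load(f)

# e.g. load_per_query_dict(out_dir, bert_tokens_dir_name, qid) returns
# a dict mapping doc_id -> (title_tokens, body_tokens_list)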
Example #4
def get_todo() -> List[Tuple[QueryID, MSMarcoDoc]]:
    print("get_todo()")
    doc_queries = load_train_queries()
    doc_qrels: Dict[QueryID, List[str]] = load_msmarco_raw_qrels("train")

    todo: List[Tuple[QueryID, MSMarcoDoc]] = []
    n_item = 1000
    for qid, q_text in doc_queries[:n_item]:
        docs = load_per_query_docs(qid, None)
        for doc in docs:
            if doc.doc_id in doc_qrels[qid]:
                todo.append((qid, doc))
    return todo
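
One caveat in get_todo(): doc_qrels[qid] raises KeyError for a query with no judged documents. A defensive variant of the inner check, purely illustrative:

rel_doc_ids = doc_qrels.get(qid, [])
for doc in docs:
    if doc.doc_id in rel_doc_ids:
        todo.append((qid, doc))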
Example #5
    def generate(self, data_id_manager, qids):
        missing_cnt = 0
        success_docs = 0
        missing_doc_qid = []
        ticker = TimeEstimator(len(qids))
        for qid in qids:
            if qid not in self.resource.get_doc_for_query_d():
                continue
            ticker.tick()
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
            docs_d = {d.doc_id: d for d in docs}

            q_tokens = self.resource.get_q_tokens(qid)
            pos_doc_id_list, neg_doc_id_list \
                = get_pos_neg_doc_ids_for_qid(self.resource, qid)

            def iter_passages(doc_id):
                doc = docs_d[doc_id]  # KeyError here if the doc was not loaded
                insts: List[Tuple[List, List]] = self.encoder.encode(q_tokens, doc.title, doc.body)
                yield from insts

            for pos_doc_id in pos_doc_id_list:
                sampled_neg_doc_id = pick1(neg_doc_id_list)
                try:
                    for passage_idx1, passage1 in enumerate(iter_passages(pos_doc_id)):
                        for passage_idx2, passage2 in enumerate(iter_passages(sampled_neg_doc_id)):
                            tokens_seg1, seg_ids1 = passage1
                            tokens_seg2, seg_ids2 = passage2

                            data_id = data_id_manager.assign({
                                'doc_id1': pos_doc_id,
                                'passage_idx1': passage_idx1,
                                'doc_id2': sampled_neg_doc_id,
                                'passage_idx2': passage_idx2,
                            })
                            inst = PairedInstance(tokens_seg1, seg_ids1, tokens_seg2, seg_ids2, data_id)
                            yield inst
                    success_docs += 1
                except KeyError:
                    missing_cnt += 1
                    missing_doc_qid.append(qid)
                    if missing_cnt > 10:
                        print(missing_doc_qid)
                        print("success: ", success_docs)
                        raise  # re-raise the original KeyError with traceback
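
The generator touches data_id_manager only through assign(info_dict) -> data_id. To exercise the pipeline in isolation, a minimal stand-in with that contract could look like this (hypothetical; not the project's own manager class):

class DummyDataIDManager:
    # assigns sequential ids and remembers each info dict for later lookup
    def __init__(self):
        self.info = {}
        self.next_id = 0

    def assign(self, info_dict):
        data_id = self.next_id
        self.info[data_id] = dict(info_dict)
        self.next_id += 1
        return data_id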
Example #6
def main():
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    out_entries = []
    for query_id, entries in rlg.items():
        doc_ids = [e.doc_id for e in entries]
        docs = load_per_query_docs(query_id, None)

        found_doc_ids = [d.doc_id for d in docs]
        not_found_doc_ids = [
            doc_id for doc_id in doc_ids if doc_id not in found_doc_ids
        ]
        n_not_found = len(not_found_doc_ids)
        if n_not_found:
            print("{} docs not found".format(n_not_found))

        query_text = query_d[QueryID(query_id)]

        def score(doc: MSMarcoDoc):
            content = doc.title + " " + doc.body
            return bm25_module.score(query_text, content)

        scored_docs = [(d, score(d)) for d in docs]
        scored_docs.sort(key=get_second, reverse=True)

        reranked_entries = []
        # 'doc_score' avoids shadowing the score() helper defined above
        for rank, (doc, doc_score) in enumerate(scored_docs):
            e = TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score,
                                    run_name)
            reranked_entries.append(e)
        out_entries.extend(reranked_entries)
        te.tick()

        if len(out_entries) > 100 * 100:
            break

    write_trec_ranked_list_entry(out_entries, save_path)
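
The reranker treats bm25_module.score as a black box. For reference, a generic Okapi BM25 scorer over tokenized text looks roughly like this; it is a sketch of the textbook formula, not the module returned by get_bm25_module:

import math
from collections import Counter

def bm25_score(query_tokens, doc_tokens, df, n_docs, avg_dl, k1=1.2, b=0.75):
    # df: Counter mapping term -> document frequency over the collection
    tf = Counter(doc_tokens)
    dl = len(doc_tokens)
    score = 0.0
    for term in query_tokens:
        if term not in tf:
            continue
        idf = math.log((n_docs - df[term] + 0.5) / (df[term] + 0.5) + 1)
        score += idf * tf[term] * (k1 + 1) \
                 / (tf[term] + k1 * (1 - b + b * dl / avg_dl))
    return score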
Example #7
    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                print("{} of {} not found".format(len(tokens_d),
                                                  len(target_docs)))

            save_path = os.path.join(self.out_dir, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
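
Each stored value is a (title_tokens, body_tf_list) pair, with one Counter per body sentence. Collapsing it to a document-level term-frequency map, assuming that layout:

from collections import Counter

def doc_level_tf(title_tokens, body_tf_list):
    # merge the per-sentence Counters, then add the title tokens
    tf = sum(body_tf_list, Counter())
    tf.update(title_tokens)
    return tf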
Example #8
    def generate(self, data_id_manager, qids):
        missing_cnt = 0
        success_docs = 0
        missing_doc_qid = []
        ticker = TimeEstimator(len(qids))
        for qid in qids:
            if qid not in self.resource.get_doc_for_query_d():
                continue
            ticker.tick()
            docs: List[MSMarcoDoc] = load_per_query_docs(qid, None)
            docs_d = {d.doc_id: d for d in docs}

            q_tokens = self.resource.get_q_tokens(qid)
            for doc_id in self.resource.get_doc_for_query_d()[qid]:
                label = self.resource.get_label(qid, doc_id)
                try:
                    doc = docs_d[doc_id]
                    insts: Iterable[Tuple[List, List]] = self.encoder.encode(q_tokens, doc.title, doc.body)
                    for passage_idx, passage in enumerate(insts):
                        tokens_seg, seg_ids = passage
                        assert type(tokens_seg[0]) == str
                        assert type(seg_ids[0]) == int
                        data_id = data_id_manager.assign({
                            'doc_id': doc_id,
                            'passage_idx': passage_idx,
                            'label': label,
                        })
                        inst = ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)
                        yield inst
                    success_docs += 1
                except KeyError:
                    missing_cnt += 1
                    missing_doc_qid.append(qid)
                    if missing_cnt > 10:
                        print(missing_doc_qid)
                        print("success: ", success_docs)
                        raise  # re-raise the original KeyError with traceback
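
The encoder contract in Examples #5 and #8 is encode(q_tokens, title, body) -> sequence of (tokens, segment_ids) passages. A toy encoder with that shape, splitting the document into fixed windows in the usual BERT pair layout (illustrative only; the windowing logic is an assumption, not the project's encoder):

class ToyPassageEncoder:
    def __init__(self, tokenizer, max_seq_length=512):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def encode(self, q_tokens, title, body):
        doc_tokens = self.tokenizer.tokenize(title + " " + body)
        window = self.max_seq_length - 3 - len(q_tokens)  # [CLS] + 2x [SEP]
        passages = []
        for i in range(0, len(doc_tokens), window):
            seg = doc_tokens[i:i + window]
            tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + seg + ["[SEP]"]
            seg_ids = [0] * (len(q_tokens) + 2) + [1] * (len(seg) + 1)
            passages.append((tokens, seg_ids))
        return passages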