Exemple #1
0
    def generate(self, query_list, data_id_manager) -> Iterator[ClassificationInstanceWDataID]:
        """Yield one instance per encoded passage of each positively judged document.

        For every query id in ``query_list`` that has an entry in
        ``self.judgement``, the candidate pool is the union of the judged
        document ids and the ``doc_id`` of the first ``self.neg_k`` entries of
        ``self.galago_rank[query_id]``.  Only documents whose judgement is
        greater than zero produce output.

        Args:
            query_list: Iterable of query ids; ids missing from
                ``self.judgement`` are skipped.
            data_id_manager: Object whose ``assign(info)`` returns the data id
                attached to each instance.  ``info`` holds the light query,
                the light candidate and the passage index.

        Yields:
            ``ClassificationInstanceWDataID(tokens, seg_ids, 1, data_id)`` for
            each ``(tokens, seg_ids)`` pair returned by
            ``self.encoder.encode(query_tokens, doc_tokens)``.
        """
        neg_k = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            judgement = self.judgement[query_id]
            query = self.queries[query_id]
            query_tokens = self.tokenizer.tokenize(query)

            ranked_list = self.galago_rank[query_id][:neg_k]

            # NOTE(review): ranked-list documents without a positive judgement
            # never produce output below; they only affect the printed size.
            target_docs = set(judgement.keys())
            target_docs.update([e.doc_id for e in ranked_list])
            print("Total of {} docs".format(len(target_docs)))

            for doc_id in target_docs:
                label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0
                # Only positives are emitted, so decide that before paying for
                # the document lookup and the passage encoding.
                if not label:
                    continue

                doc_tokens = self.data[doc_id]
                passage_list = self.encoder.encode(query_tokens, doc_tokens)
                candidate = QCKCandidate(doc_id, "")
                for idx, (passage_tokens, seg_ids) in enumerate(passage_list):
                    info = {
                        'query': get_light_qckquery(qck_query),
                        'candidate': get_light_qckcandidate(candidate),
                        'idx': idx,
                    }
                    data_id = data_id_manager.assign(info)
                    yield ClassificationInstanceWDataID(passage_tokens, seg_ids, label, data_id)
Exemple #2
0
def convert(info):
    """Build a QCK-style info dict from a record carrying ``qid`` and ``cid``.

    Both ids are converted with ``str`` and wrapped with an empty text; the
    ``'kdp'`` entry is a ``KnowledgeDocumentPart("", 0, 0, [])`` placeholder.
    """
    query = QCKQuery(str(info['qid']), "")
    candidate = QCKCandidate(str(info['cid']), "")
    kdp = KnowledgeDocumentPart("", 0, 0, [])
    return {'query': query, 'candidate': candidate, 'kdp': kdp}
Exemple #3
0
 def get_candidate_for_query(query: QCKQuery):
     """Return a QCKCandidate for each item ``get_evidence_from_pool`` gives for the query text.

     The pool is queried with ``query.text`` and the constant ``60``; each
     result is unpacked as ``(text, e_id, score)`` and the score is ignored.
     """
     evidences = get_evidence_from_pool(query.text, 60)
     return [QCKCandidate(str(e_id), text) for text, e_id, _score in evidences]
Exemple #4
0
def transform(
    t: Tuple[ArguDataPoint, Passage,
             bool]) -> Tuple[QCKQuery, QCKCandidate, bool]:
    """Convert a ``(problem, passage, is_correct)`` triple into QCK objects.

    The query is built from ``problem.text1`` (its ``id.id`` and ``text``),
    the candidate from the passage's ``id.id`` and ``text``; the flag is
    passed through unchanged.
    """
    problem, passage, is_correct = t
    source = problem.text1
    qck_query = QCKQuery(source.id.id, source.text)
    qck_candidate = QCKCandidate(passage.id.id, passage.text)
    return qck_query, qck_candidate, is_correct
Exemple #5
0
    def generate(self, query_list, data_id_manager) -> Iterator[QueryDocInstance]:
        """Yield one QueryDocInstance per pooled document of each judged query.

        The pool for a query is the judged document ids plus the ``doc_id`` of
        the first ``self.neg_k`` entries of ``self.galago_rank[query_id]``.
        Document tokens are cut to ``self.doc_max_length``.  When
        ``self.pos_only`` is truthy, documents without a positive judgement
        are skipped.

        Args:
            query_list: Iterable of query ids; ids absent from
                ``self.judgement`` are ignored.
            data_id_manager: Object whose ``assign(info)`` supplies the data
                id of each instance.

        Yields:
            ``QueryDocInstance(query_tokens, doc_tokens, label, data_id)``.
        """
        top_n = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            judgement = self.judgement[query_id]
            query_tokens = self.tokenizer.tokenize(self.queries[query_id])

            top_ranked = self.galago_rank[query_id][:top_n]

            pool = set(judgement.keys())
            pool.update([entry.doc_id for entry in top_ranked])
            print("Total of {} docs".format(len(pool)))

            for doc_id in pool:
                # The document lookup happens before the label filter.
                doc_tokens = self.data[doc_id][:self.doc_max_length]
                is_positive = doc_id in judgement and judgement[doc_id] > 0
                label = 1 if is_positive else 0
                if self.pos_only and not label:
                    continue

                info = {
                    'query': get_light_qckquery(qck_query),
                    'candidate': get_light_qckcandidate(QCKCandidate(doc_id, "")),
                    'q_term_len': len(query_tokens),
                }
                yield QueryDocInstance(query_tokens, doc_tokens, label,
                                       data_id_manager.assign(info))
Exemple #6
0
    def generate(self, data_id_manager, qids):
        """Yield a classification instance for every encoded passage of every candidate document.

        For each qid present in ``self.resource.get_doc_for_query_d()``, each
        listed document is looked up in the BERT-token and stemmed-token
        dictionaries of that query and passed to ``self.encoder.encode``.
        Documents missing from either dictionary are skipped and counted.

        Args:
            data_id_manager: Object whose ``assign(info)`` returns the data id
                for an instance; ``info`` holds the query, the candidate and
                the passage index.
            qids: Iterable of query ids.

        Yields:
            ``ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)``.

        Raises:
            KeyError: When more than 10 documents were missing from the token
                dictionaries.  The lookup error is re-raised, so it names the
                missing key.
        """
        missing_cnt = 0
        success_docs = 0
        missing_doc_qid = []
        for qid in qids:
            # Fetch the mapping once per query instead of once per use.
            doc_for_query_d = self.resource.get_doc_for_query_d()
            if qid not in doc_for_query_d:
                continue

            query_text = self.resource.get_query_text(qid)
            bert_tokens_d = self.resource.get_bert_tokens_d(qid)
            stemmed_tokens_d = self.resource.get_stemmed_tokens_d(qid)
            for doc_id in doc_for_query_d[qid]:
                label = self.resource.get_label(qid, doc_id)
                # Only the dictionary lookups are guarded: a KeyError raised by
                # the encoder or the id manager is a real bug and must not be
                # counted as a missing document.
                try:
                    bert_title_tokens, bert_body_tokens_list = bert_tokens_d[doc_id]
                    stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
                except KeyError:
                    missing_cnt += 1
                    missing_doc_qid.append(qid)
                    if missing_cnt > 10:
                        print(missing_doc_qid)
                        print("success: ", success_docs)
                        raise
                    continue

                insts: List[Tuple[List, List]] = self.encoder.encode(
                    query_text,
                    stemmed_title_tokens,
                    stemmed_body_tokens_list,
                    bert_title_tokens,
                    bert_body_tokens_list,
                )

                for passage_idx, (tokens_seg, seg_ids) in enumerate(insts):
                    assert isinstance(tokens_seg[0], str)
                    assert isinstance(seg_ids[0], int)
                    data_id = data_id_manager.assign({
                        'query': QCKQuery(qid, ""),
                        'candidate': QCKCandidate(doc_id, ""),
                        'passage_idx': passage_idx,
                    })
                    yield ClassificationInstanceWDataID(
                        tokens_seg, seg_ids, label, data_id)
                success_docs += 1
Exemple #7
0
    def generate(self, data_id_manager, qids):
        """Yield a classification instance for every encoded passage of every candidate document.

        For each qid present in ``self.resource.candidate_doc_d``, every
        candidate document's ``(title_tokens, body_tokens)`` pair is read from
        ``self.resource.get_doc_tokens_d(qid)`` and passed to
        ``self.doc_encoder.encode``.  Documents missing from the token
        dictionary are skipped and counted.  Once all qids are processed, the
        encoder's ``long_title_cnt`` and ``total_doc_cnt`` are printed.

        Args:
            data_id_manager: Object whose ``assign(info)`` returns the data id
                for an instance; ``info`` holds the query, the candidate and
                the passage index.
            qids: Iterable of query ids.  A qid without candidates is asserted
                to be absent from ``self.resource.qrel.qrel_d``.

        Yields:
            ``ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)``.

        Raises:
            KeyError: When more than 10 candidate documents had no tokens.
                The lookup error is re-raised, so it names the missing key.
        """
        missing_cnt = 0
        success_docs = 0
        for qid in qids:
            if qid not in self.resource.candidate_doc_d:
                assert qid not in self.resource.qrel.qrel_d
                continue

            tokens_d: Dict[str, Tuple[List, List]] = self.resource.get_doc_tokens_d(qid)
            q_tokens = self.resource.get_q_tokens(qid)

            for doc_id in self.resource.candidate_doc_d[qid]:
                label = self.resource.get_label(qid, doc_id)
                # Only the token lookup is guarded: a KeyError raised by the
                # encoder or the id manager is a real bug and must not be
                # counted as a missing document.
                try:
                    title_tokens, body_tokens = tokens_d[doc_id]
                except KeyError:
                    missing_cnt += 1
                    if missing_cnt > 10:
                        print("success: ", success_docs)
                        raise
                    continue

                insts: List[Tuple[List, List]] = self.doc_encoder.encode(
                    q_tokens, title_tokens, body_tokens)

                for passage_idx, (tokens_seg, seg_ids) in enumerate(insts):
                    assert isinstance(tokens_seg[0], str)
                    assert isinstance(seg_ids[0], int)
                    data_id = data_id_manager.assign({
                        'query': QCKQuery(qid, ""),
                        'candidate': QCKCandidate(doc_id, ""),
                        'passage_idx': passage_idx,
                    })
                    yield ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)
                success_docs += 1
        print(" {} of {} has long title".format(self.doc_encoder.long_title_cnt, self.doc_encoder.total_doc_cnt))
Exemple #8
0
 def generate(self, query_list, data_id_manager):
     """Build the instances for the top-ranked documents of every ranked query.

     Queries absent from ``self.galago_rank`` are skipped.  For each of the
     first ``self.top_k`` ranked entries, the document tokens are encoded
     together with the tokenized query, and one ``Instance`` is created per
     ``(tokens, seg_ids)`` pair with a data id from
     ``data_id_manager.assign``.  The last ``Instance`` argument is always 0.

     Returns:
         A list of all created ``Instance`` objects, in generation order.
     """
     instances = []
     for query_id in query_list:
         if query_id not in self.galago_rank:
             continue
         query_text = self.queries[query_id]
         qck_query = QCKQuery(query_id, "")
         query_tokens = self.tokenizer.tokenize(query_text)
         top_entries = self.galago_rank[query_id][:self.top_k]
         for entry in top_entries:
             doc_id, _, _ = entry
             doc_tokens = self.data[doc_id]
             passages = self.encoder.encode(query_tokens, doc_tokens)
             candidate = QCKCandidate(doc_id, "")
             for idx, passage in enumerate(passages):
                 passage_tokens, seg_ids = passage
                 info = {
                     'query': get_light_qckquery(qck_query),
                     'candidate': get_light_qckcandidate(candidate),
                     'idx': idx
                 }
                 data_id = data_id_manager.assign(info)
                 instances.append(Instance(passage_tokens, seg_ids, data_id, 0))
     return instances
Exemple #9
0
    def generate(self, query_list,
                 data_id_manager) -> Iterator[QueryDocInstance]:
        """Yield one instance per item ``self.encoder.encode`` produces for each pooled document.

        The pool for a query is the ``doc_id`` of the first ``self.neg_k``
        entries of ``self.galago_rank[query_id]``; when
        ``self.include_all_judged`` is truthy, every judged document id is
        added as well.  A document is labelled 1 when its judgement is greater
        than zero, and 0 otherwise.

        Args:
            query_list: Iterable of query ids; ids absent from
                ``self.judgement`` are skipped.
            data_id_manager: Object whose ``assign(info)`` supplies the data
                id.  ``info`` holds only the light query and the light
                candidate.

        Yields:
            ``ClassificationInstanceWDataID.make_from_tas(tas, label, data_id)``.
        """
        top_n = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            judgement = self.judgement[query_id]
            top_ranked = self.galago_rank[query_id][:top_n]

            pool = set()
            pool.update([entry.doc_id for entry in top_ranked])
            if self.include_all_judged:
                pool.update(judgement.keys())

            print("Total of {} docs".format(len(pool)))
            for doc_id in pool:
                for tas in self.encoder.encode(query_id, doc_id):
                    is_positive = doc_id in judgement and judgement[doc_id] > 0
                    label = 1 if is_positive else 0
                    info = {
                        'query': get_light_qckquery(qck_query),
                        'candidate': get_light_qckcandidate(QCKCandidate(doc_id, "")),
                    }
                    data_id = data_id_manager.assign(info)
                    yield ClassificationInstanceWDataID.make_from_tas(
                        tas, label, data_id)
Exemple #10
0
    def generate(self, data_id_manager, qids):
        """Yield a classification instance for every encoded passage of every candidate document.

        For each qid present in ``self.resource.candidate_doc_d``, every
        document from ``self.resource.get_candidate_doc_d(qid)`` is looked up
        in ``self.resource.get_doc_tokens_d(qid)`` and encoded together with
        the query tokens.  Documents missing from the token dictionary are
        skipped and counted.

        Args:
            data_id_manager: Object whose ``assign(info)`` returns the data id
                for an instance; ``info`` holds the query, the candidate and
                the passage index.
            qids: Iterable of query ids.  A qid without candidates is asserted
                to be absent from ``self.resource.qrel.qrel_d``.

        Yields:
            ``ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)``.

        Raises:
            KeyError: When more than 10 candidate documents had no tokens.
                The lookup error is re-raised, so it names the missing key.
        """
        missing_cnt = 0
        success_docs = 0
        missing_doc_qid = []
        for qid in qids:
            if qid not in self.resource.candidate_doc_d:
                assert qid not in self.resource.qrel.qrel_d
                continue

            tokens_d = self.resource.get_doc_tokens_d(qid)
            q_tokens = self.resource.get_q_tokens(qid)
            for doc_id in self.resource.get_candidate_doc_d(qid):
                label = self.resource.get_label(qid, doc_id)
                # Only the token lookup is guarded: a KeyError raised by the
                # encoder or the id manager is a real bug and must not be
                # counted as a missing document.
                try:
                    doc_tokens = tokens_d[doc_id]
                except KeyError:
                    missing_cnt += 1
                    missing_doc_qid.append(qid)
                    if missing_cnt > 10:
                        print(missing_doc_qid)
                        print("success: ", success_docs)
                        raise
                    continue

                insts: List[Tuple[List, List]] = self.encoder.encode(q_tokens, doc_tokens)

                for passage_idx, (tokens_seg, seg_ids) in enumerate(insts):
                    assert isinstance(tokens_seg[0], str)
                    assert isinstance(seg_ids[0], int)
                    data_id = data_id_manager.assign({
                        'query': QCKQuery(qid, ""),
                        'candidate': QCKCandidate(doc_id, ""),
                        'passage_idx': passage_idx,
                    })
                    yield ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id)
                success_docs += 1
Exemple #11
0
def doc_id_to_candidate(doc_id: str) -> QCKCandidate:
    """Wrap ``doc_id`` in a QCKCandidate whose text comes from ``perspective_getter``.

    The id is converted with ``int`` for the lookup; the candidate keeps the
    original string id.
    """
    text = perspective_getter(int(doc_id))
    return QCKCandidate(doc_id, text)
Exemple #12
0
def get_qck_candidate_from_candidate_id(candidate_id: str):
    """Return a QCKCandidate for ``candidate_id`` with text from ``perspective_getter``.

    The lookup uses ``int(candidate_id)``; the candidate keeps the id as given.
    """
    return QCKCandidate(candidate_id, perspective_getter(int(candidate_id)))
Exemple #13
0
def cid_pid_format_to_qck(candidate_pers):
    """Map each stringified cid to the QCKCandidates built from its pids.

    ``candidate_pers`` is iterated as ``(cid, candidate_pids)`` pairs.  Each
    pid becomes ``QCKCandidate(str(pid), perspective_getter(pid))``; the
    per-cid sequence is produced by the project helper ``lmap``.  If a cid
    occurs more than once, its last pair is the one kept.
    """
    def to_candidate(pid):
        return QCKCandidate(str(pid), perspective_getter(pid))

    return {
        str(cid): lmap(to_candidate, candidate_pids)
        for cid, candidate_pids in candidate_pers
    }
Exemple #14
0
def light_candidate(obj: QCKCandidateWToken):
    """Return a QCKCandidate carrying only ``obj.id`` and an empty text."""
    candidate_id = obj.id
    return QCKCandidate(candidate_id, "")
Exemple #15
0
 def convert_candidates(candidates: List[int]) -> List[QCKCandidate]:
     """Turn ids into QCKCandidates paired with their ``perspective_getter`` text.

     All texts are fetched first through the project helper ``lmap``; each id
     is then stringified and combined with its text, in input order.
     """
     texts = lmap(perspective_getter, candidates)
     return [QCKCandidate(str(pid), text) for pid, text in zip(candidates, texts)]