Esempio n. 1
0
    def generate(self, query_list, data_id_manager) -> Iterator[ClassificationInstanceWDataID]:
        """Yield positive classification instances, one per encoded passage.

        For every query in ``query_list`` that has an entry in
        ``self.judgement``, the judged documents plus the first ``self.neg_k``
        entries of ``self.galago_rank[query_id]`` are scanned. Only documents
        whose judgement value is greater than zero produce instances; all
        other documents are skipped without being loaded or encoded.

        Args:
            query_list: Iterable of query ids to process.
            data_id_manager: Object whose ``assign(info)`` returns the data id
                stored on each yielded instance.

        Yields:
            A ``ClassificationInstanceWDataID`` for each ``(tokens, seg_ids)``
            pair produced by ``self.encoder.encode``; the label is always 1.
        """
        neg_k = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            judgement = self.judgement[query_id]
            query = self.queries[query_id]
            query_tokens = self.tokenizer.tokenize(query)

            # Ranked documents only influence the count printed below: a
            # document is emitted only when it is judged positive, and every
            # judged document is already part of the set.
            ranked_list = self.galago_rank[query_id][:neg_k]
            target_docs = set(judgement.keys())
            target_docs.update([e.doc_id for e in ranked_list])
            print("Total of {} docs".format(len(target_docs)))

            for doc_id in target_docs:
                label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0
                if not label:
                    # Decide before touching self.data / the encoder so that
                    # discarded documents cost nothing and a document missing
                    # from self.data cannot fail the run when it would have
                    # been skipped anyway.
                    continue

                doc_tokens = self.data[doc_id]
                passage_list = self.encoder.encode(query_tokens, doc_tokens)
                candidate = QCKCandidate(doc_id, "")
                for idx, (tokens, seg_ids) in enumerate(passage_list):
                    info = {
                        'query': get_light_qckquery(qck_query),
                        'candidate': get_light_qckcandidate(candidate),
                        'idx': idx,
                    }
                    data_id = data_id_manager.assign(info)
                    yield ClassificationInstanceWDataID(tokens, seg_ids, label, data_id)
Esempio n. 2
0
    def generate(self, query_list, data_id_manager) -> Iterator[QueryDocInstance]:
        """Yield one ``QueryDocInstance`` per target document of each judged query.

        Queries without an entry in ``self.judgement`` are ignored. The target
        documents of a query are its judged documents together with the first
        ``self.neg_k`` entries of ``self.galago_rank[query_id]``. Document
        tokens are cut to ``self.doc_max_length`` items. When ``self.pos_only``
        is truthy, documents whose label is 0 are skipped.

        Args:
            query_list: Iterable of query ids to process.
            data_id_manager: Object whose ``assign(info)`` returns the data id
                attached to each instance.

        Yields:
            ``QueryDocInstance(query_tokens, doc_tokens, label, data_id)`` where
            ``label`` is 1 for documents judged greater than zero, else 0.
        """
        top_k = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            relevance = self.judgement[query_id]
            query_tokens = self.tokenizer.tokenize(self.queries[query_id])

            top_ranked = self.galago_rank[query_id][:top_k]
            doc_ids = set(relevance.keys())
            doc_ids.update([entry.doc_id for entry in top_ranked])
            print("Total of {} docs".format(len(doc_ids)))

            for doc_id in doc_ids:
                doc_tokens = self.data[doc_id][:self.doc_max_length]
                is_positive = doc_id in relevance and relevance[doc_id] > 0
                label = 1 if is_positive else 0
                if self.pos_only and label == 0:
                    continue

                candidate = QCKCandidate(doc_id, "")
                data_id = data_id_manager.assign({
                    'query': get_light_qckquery(qck_query),
                    'candidate': get_light_qckcandidate(candidate),
                    'q_term_len': len(query_tokens),
                })
                yield QueryDocInstance(query_tokens, doc_tokens, label, data_id)
Esempio n. 3
0
def collect_info_transform(data: Iterable[Tuple[QCKQuery, QCKCandidate, bool]], data_id_man: DataIDManager) \
        -> Iterable[QCInstance]:
    """Turn (query, candidate, is_correct) triples into ``QCInstance`` objects.

    For each triple, the light forms of the query and candidate are registered
    with ``data_id_man`` and the returned id is stored on the instance.

    Args:
        data: Iterable of ``(query, candidate, is_correct)`` triples.
        data_id_man: Object whose ``assign(info)`` returns a data id.

    Yields:
        ``QCInstance(query.text, candidate.text, data_id, int(is_correct))``.
    """
    for query, candidate, is_correct in data:
        light_info = {
            'query': get_light_qckquery(query),
            'candidate': get_light_qckcandidate(candidate)
        }
        query_text = query.text
        candidate_text = candidate.text
        data_id = data_id_man.assign(light_info)
        label = int(is_correct)
        yield QCInstance(query_text, candidate_text, data_id, label)
Esempio n. 4
0
 def get_insts_per_candidate(candidate: QCKCandidateWToken,
                             query: QCKQueryWToken,
                             kdp_list: List[KDPWToken]):
     """Build one ``PayloadAsTokens`` per entry of ``kdp_list``.

     ``self`` and ``data_id_manager`` are not parameters; they are taken from
     the enclosing scope, which is outside this snippet.

     Args:
         candidate: Candidate whose ``tokens`` become ``text2``.
         query: Query whose ``tokens`` become ``text1``.
         kdp_list: Items whose ``sub_tokens`` become the ``passage`` field.

     Returns:
         A list with one ``PayloadAsTokens`` per item, in ``kdp_list`` order.
     """
     def build_payload(kdp):
         # Register the light (query, candidate, kdp) triple to get a data id.
         light_info = {
             'query': get_light_qckquery(query),
             'candidate': get_light_qckcandidate(candidate),
             'kdp': get_light_kdp(kdp)
         }
         return PayloadAsTokens(
             passage=kdp.sub_tokens,
             text1=query.tokens,
             text2=candidate.tokens,
             data_id=data_id_manager.assign(light_info),
             is_correct=self._is_correct(query, candidate))

     return [build_payload(kdp) for kdp in kdp_list]
Esempio n. 5
0
            def get_insts_per_candidate(candidate: QCKCandidateWToken,
                                        query: QCKQueryWToken,
                                        kdp_list: List[KDPWToken]) -> Payload:
                """Build a single ``Payload`` holding a group of passages.

                Only the first ``self.k_group_size`` items of ``kdp_list`` are
                used. ``self`` and ``data_id_manager`` come from the enclosing
                scope, which is outside this snippet.

                Args:
                    candidate: Candidate whose ``tokens`` become ``text2``.
                    query: Query whose ``tokens`` become ``text1``.
                    kdp_list: Items whose ``sub_tokens`` are collected into the
                        payload's ``kdp_list`` field.

                Returns:
                    The ``Payload`` for this query/candidate pair.
                """
                group = kdp_list[:self.k_group_size]
                sub_token_lists = [kdp.sub_tokens for kdp in group]

                light_info = {
                    'query': get_light_qckquery(query),
                    'candidate': get_light_qckcandidate(candidate),
                    'kdpl': lmap(get_light_kdp, group)
                }
                return Payload(kdp_list=sub_token_lists,
                               text1=query.tokens,
                               text2=candidate.tokens,
                               data_id=data_id_manager.assign(light_info),
                               is_correct=self._is_correct(query, candidate))
Esempio n. 6
0
 def generate(self, query_list, data_id_manager):
     """Encode the top-ranked documents of each query into ``Instance`` objects.

     Queries absent from ``self.galago_rank`` are skipped. For the remaining
     queries, the first ``self.top_k`` ranked entries are encoded against the
     tokenized query and every resulting passage becomes one instance.

     Args:
         query_list: Iterable of query ids to process.
         data_id_manager: Object whose ``assign(info)`` returns the data id
             stored on each instance.

     Returns:
         A list of ``Instance(tokens, seg_ids, data_id, 0)`` objects; the last
         constructor argument is always 0.
     """
     collected = []
     for query_id in query_list:
         if query_id not in self.galago_rank:
             continue

         query_text = self.queries[query_id]
         qck_query = QCKQuery(query_id, "")
         query_tokens = self.tokenizer.tokenize(query_text)

         top_ranked = self.galago_rank[query_id][:self.top_k]
         for doc_id, _, _ in top_ranked:
             doc_tokens = self.data[doc_id]
             encoded_passages = self.encoder.encode(query_tokens, doc_tokens)
             candidate = QCKCandidate(doc_id, "")
             for idx, (tokens, seg_ids) in enumerate(encoded_passages):
                 data_id = data_id_manager.assign({
                     'query': get_light_qckquery(qck_query),
                     'candidate': get_light_qckcandidate(candidate),
                     'idx': idx
                 })
                 collected.append(Instance(tokens, seg_ids, data_id, 0))
     return collected
Esempio n. 7
0
    def generate(self, query_list,
                 data_id_manager) -> Iterator[QueryDocInstance]:
        """Yield one labelled instance per encoder output for each target doc.

        Queries without an entry in ``self.judgement`` are ignored. Target
        documents are the first ``self.neg_k`` entries of
        ``self.galago_rank[query_id]``; when ``self.include_all_judged`` is
        truthy, every judged document is added as well.

        NOTE(review): the return annotation names ``QueryDocInstance`` while
        the instances are built with
        ``ClassificationInstanceWDataID.make_from_tas`` - confirm which type
        is intended.

        Args:
            query_list: Iterable of query ids to process.
            data_id_manager: Object whose ``assign(info)`` returns the data id
                attached to each instance.

        Yields:
            The result of ``ClassificationInstanceWDataID.make_from_tas(tas,
            label, data_id)`` for every item produced by
            ``self.encoder.encode(query_id, doc_id)``; ``label`` is 1 for
            documents judged greater than zero, else 0.
        """
        neg_k = self.neg_k
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            qck_query = QCKQuery(query_id, "")
            judgement = self.judgement[query_id]
            top_ranked = self.galago_rank[query_id][:neg_k]

            target_docs = {entry.doc_id for entry in top_ranked}
            if self.include_all_judged:
                target_docs.update(judgement.keys())

            print("Total of {} docs".format(len(target_docs)))
            for doc_id in target_docs:
                for tas in self.encoder.encode(query_id, doc_id):
                    if doc_id in judgement and judgement[doc_id] > 0:
                        label = 1
                    else:
                        label = 0
                    candidate = QCKCandidate(doc_id, "")
                    data_id = data_id_manager.assign({
                        'query': get_light_qckquery(qck_query),
                        'candidate': get_light_qckcandidate(candidate),
                    })
                    yield ClassificationInstanceWDataID.make_from_tas(
                        tas, label, data_id)
Esempio n. 8
0
        def convert(
                pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsTokens]:
            """Yield an id-encoded payload for every (candidate, passage) pair.

            The query text, each candidate and each passage are converted to
            ids with ``self.tokenizer``. Passage tokens are used as-is when
            ``self.kdp_as_sub_token`` is truthy, otherwise they are first
            passed through ``tokenize_from_tokens``. ``self`` and
            ``data_id_manager`` come from the enclosing scope, which is
            outside this snippet.

            NOTE(review): the return annotation names ``PayloadAsTokens`` but
            the body constructs ``PayloadAsIds`` - confirm which is intended.

            Args:
                pair: ``(query, passages)`` tuple.

            Yields:
                One ``PayloadAsIds`` per candidate of the query and passage,
                candidates in the outer loop and passages in the inner loop.
            """
            query, passages = pair
            tokenizer = self.tokenizer
            q_tokens: List[int] = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(query.text))
            candidates = self.candidates_dict[query.query_id]

            # Diagnostic output for unusually large query x passage products.
            if len(passages) * len(candidates) > 1000 * 1000:
                print(query)
                print(len(passages))
                print(len(candidates))

            def passage_to_ids(passage):
                # Convert one passage to ids, sub-tokenizing first if needed.
                if self.kdp_as_sub_token:
                    sub_tokens = passage.tokens
                else:
                    sub_tokens = tokenize_from_tokens(tokenizer, passage.tokens)
                return tokenizer.convert_tokens_to_ids(sub_tokens)

            # Encode every passage once; the ids are reused for all candidates.
            passage_ids_list = [passage_to_ids(passage) for passage in passages]

            for c in candidates:
                c_tokens: List[int] = tokenizer.convert_tokens_to_ids(
                    c.get_tokens(tokenizer))
                for passage, passage_ids in zip(passages, passage_ids_list):
                    info = {
                        'query': get_light_qckquery(query),
                        'candidate': get_light_qckcandidate(c),
                        'kdp': get_light_kdp(passage)
                    }
                    yield PayloadAsIds(passage=passage_ids,
                                       text1=q_tokens,
                                       text2=c_tokens,
                                       data_id=data_id_manager.assign(info),
                                       is_correct=self._is_correct(query, c))
Esempio n. 9
0
        def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[Payload]:
            """Yield a token-level ``Payload`` for every (candidate, kdp) pair.

            The query text is tokenized with ``self.tokenizer``. Each kdp's
            tokens are used as-is when ``self.kdp_as_sub_token`` is truthy,
            otherwise they are passed through ``tokenize_from_tokens``.
            ``self`` and ``data_id_manager`` come from the enclosing scope,
            which is outside this snippet.

            Args:
                pair: ``(query, kdp_list)`` tuple.

            Yields:
                One ``Payload`` per candidate of the query and kdp, candidates
                in the outer loop and kdps in the inner loop; ``kdp_score`` is
                ``self.get_rel_score(query, kdp)``.
            """
            query, kdp_list = pair
            tokenizer = self.tokenizer
            q_tokens: List[str] = tokenizer.tokenize(query.text)
            candidates = self.candidates_dict[query.query_id]

            # Diagnostic output for unusually large query x kdp products.
            if len(kdp_list) * len(candidates) > 1000 * 1000:
                print(query)
                print(len(kdp_list))
                print(len(candidates))

            def to_sub_tokens(kdp):
                # Return the kdp's sub-tokens, tokenizing first if needed.
                if self.kdp_as_sub_token:
                    return kdp.tokens
                return tokenize_from_tokens(tokenizer, kdp.tokens)

            # Prepare sub-tokens once; they are reused for all candidates.
            sub_tokens_per_kdp = [to_sub_tokens(kdp) for kdp in kdp_list]

            for c in candidates:
                c_tokens: List[str] = c.get_tokens(tokenizer)
                for kdp, kdp_sub_tokens in zip(kdp_list, sub_tokens_per_kdp):
                    info = {
                        'query': get_light_qckquery(query),
                        'candidate': get_light_qckcandidate(c),
                        'kdp': get_light_kdp(kdp)
                    }
                    yield Payload(
                        passage=kdp_sub_tokens,
                        text1=q_tokens,
                        text2=c_tokens,
                        data_id=data_id_manager.assign(info),
                        is_correct=self._is_correct(query, c),
                        kdp_score=self.get_rel_score(query, kdp),
                    )