コード例 #1
0
ファイル: cppnc_datagen.py プロジェクト: clover3/Chair
def convert_sub_token(tokenizer, r: Payload) -> PayloadAsTokens:
    """Re-tokenize a Payload into sub-token form.

    The passage is already token-level and is split further via
    tokenize_from_tokens; the two raw text fields are tokenized from scratch.
    Label and data id are carried over unchanged.
    """
    passage_sub = tokenize_from_tokens(tokenizer, r.passage)
    first_tokens: List[str] = tokenizer.tokenize(r.text1)
    second_tokens: List[str] = tokenizer.tokenize(r.text2)
    return PayloadAsTokens(
        passage=passage_sub,
        text1=first_tokens,
        text2=second_tokens,
        data_id=r.data_id,
        is_correct=r.is_correct,
    )
コード例 #2
0
ファイル: qcknc_datagen.py プロジェクト: clover3/Chair
    def _convert_sub_token(self, r: QCKInstance) -> PayloadAsTokens:
        """Turn a QCKInstance into a token-level payload.

        Document tokens are split into sub-tokens; query and candidate
        text are tokenized directly with this generator's tokenizer.
        """
        doc_sub_tokens = tokenize_from_tokens(self.tokenizer, r.doc_tokens)
        query_tokens: List[str] = self.tokenizer.tokenize(r.query_text)
        cand_tokens: List[str] = self.tokenizer.tokenize(r.candidate_text)
        return PayloadAsTokens(passage=doc_sub_tokens,
                               text1=query_tokens,
                               text2=cand_tokens,
                               data_id=r.data_id,
                               is_correct=r.is_correct)
コード例 #3
0
        def convert(k: KDP) -> Iterable[PayloadAsTokens]:
            """Yield one payload per (query, candidate) pair for passage k.

            The passage is sub-tokenized once and reused across every
            query/candidate combination.
            """
            kdp_sub_tokens = tokenize_from_tokens(self.tokenizer, k.tokens)
            for q in self.queries:
                for cand in self.candidates_dict[q.query_id]:
                    payload_info = {
                        'query': light_query(q),
                        'candidate': light_candidate(cand),
                        'kdp': light_kdp(k)
                    }
                    yield PayloadAsTokens(
                        q.tokens,
                        cand.tokens,
                        kdp_sub_tokens,
                        data_id_manager.assign(payload_info),
                        self._is_correct(q, cand),
                    )
コード例 #4
0
    def encode_fn(e: QCKCompactEntry) -> OrderedDict:
        """Encode one compact (query, candidate, qk-entry) tuple into features.

        Builds a PayloadAsTokens from the pre-tokenized passage plus freshly
        tokenized query/candidate text, then delegates to encode_two_inputs.
        The label is fixed to 0 here — presumably labels are attached
        elsewhere via the info record (TODO confirm against caller).
        """
        query, candidate, qk_out_entry = e
        candidate: QCKCandidate = candidate
        info = {
            'query': query,
            'candidate': candidate,
            'kdp': qk_out_entry.kdp
        }
        payload = PayloadAsTokens(
            passage=qk_out_entry.passage_tokens,
            text1=cache_tokenizer.tokenize(query.text),
            text2=cache_tokenizer.tokenize(candidate.text),
            data_id=data_id_man.assign(info),
            is_correct=0,
        )
        return encode_two_inputs(max_seq_length, tokenizer, payload)
コード例 #5
0
ファイル: qcknc_mix.py プロジェクト: clover3/Chair
 def get_insts_per_candidate(candidate: QCKCandidateWToken,
                             query: QCKQueryWToken,
                             kdp_list: List[KDPWToken]):
     """Build one PayloadAsTokens per knowledge passage for this candidate.

     All inputs are already tokenized (WToken variants), so fields are
     passed through without re-tokenization.
     """
     instances = []
     for passage in kdp_list:
         payload_info = {
             'query': get_light_qckquery(query),
             'candidate': get_light_qckcandidate(candidate),
             'kdp': get_light_kdp(passage)
         }
         instances.append(PayloadAsTokens(
             passage=passage.sub_tokens,
             text1=query.tokens,
             text2=candidate.tokens,
             data_id=data_id_manager.assign(payload_info),
             is_correct=self._is_correct(query, candidate)))
     return instances
コード例 #6
0
ファイル: parse_cpnr_results.py プロジェクト: clover3/Chair
    def encode(e: Tuple[int, int, List[Dict]]):
        """Yield a payload per passage of a (claim id, perspective id) pair.

        Claim and perspective text are tokenized once, outside the loop;
        each passage dict contributes its 'passage' entry as the passage
        tokens. The label is fixed to 0.
        """
        cid, pid, passages = e
        claim_tokens = tokenize(cid_to_text[cid])
        perspective_tokens = tokenize(perspective_getter(pid))

        for passage_idx, passage in enumerate(passages):
            info = {
                'cid': cid,
                'pid': pid,
                'passage_idx': passage_idx,
                'passage': passage['passage'],
                'c_text': cid_to_text[cid],
                'p_text': perspective_getter(pid)
            }
            yield PayloadAsTokens(
                passage=passage['passage'],
                text1=claim_tokens,
                text2=perspective_tokens,
                data_id=data_id_man.assign(info),
                is_correct=0,
            )
コード例 #7
0
ファイル: pdcd_datagen.py プロジェクト: clover3/Chair
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[PayloadAsTokens]:
        """Pair every candidate perspective of a claim with every passage.

        Returns one PayloadAsTokens per (perspective, passage) combination.
        A perspective is labeled correct when it appears in any gold cluster
        for this claim.
        """
        cid = claim['cId']
        claim_tokens = self.tokenizer.tokenize(claim['text'])
        passage_list = self.cid_to_passages[cid]
        results: List[PayloadAsTokens] = []
        for pid in self.candidate_perspective[cid]:
            label = any([pid in cluster for cluster in self.gold[cid]])
            perspective_tokens = self.tokenizer.tokenize(perspective_getter(pid))
            for passage_idx, passage in enumerate(left(passage_list)):
                sub_tokens = tokenize_from_tokens(self.tokenizer, passage)
                meta = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
                results.append(
                    PayloadAsTokens(sub_tokens,
                                    perspective_tokens, claim_tokens,
                                    data_id_manager.assign(meta), label))

        return results
コード例 #8
0
ファイル: qcknc_datagen.py プロジェクト: clover3/Chair
        def convert(
                pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsTokens]:
            """Yield a payload for every (candidate, passage) pair of a query.

            Passage sub-tokens are computed once up front and reused across
            candidates, since the candidate loop is the outer one.
            """
            query, passages = pair
            tokenizer = self.tokenizer
            q_tokens: List[str] = tokenizer.tokenize(query.text)
            candidates = self.candidates_dict[query.query_id]
            # Flag (via stdout) a pathologically large cross product.
            num_inst_expectation = len(passages) * len(candidates)
            if num_inst_expectation > 1000 * 1000:
                print(query)
                print(len(passages))
                print(len(candidates))
            passage_sub_tokens = []
            for passage in passages:
                if self.kdp_as_sub_token:
                    # Passage tokens are already sub-tokens; use them as-is.
                    passage_sub_tokens.append(passage.tokens)
                else:
                    passage_sub_tokens.append(
                        tokenize_from_tokens(tokenizer, passage.tokens))

            for c in candidates:
                c_tokens: List[str] = c.get_tokens(tokenizer)
                for p_idx, passage in enumerate(passages):
                    info = {
                        'query': get_light_qckquery(query),
                        'candidate': get_light_qckcandidate(c),
                        'kdp': get_light_kdp(passage)
                    }
                    yield PayloadAsTokens(
                        passage=passage_sub_tokens[p_idx],
                        text1=q_tokens,
                        text2=c_tokens,
                        data_id=data_id_manager.assign(info),
                        is_correct=self._is_correct(query, c))
コード例 #9
0
ファイル: qcknc_w_rel_score.py プロジェクト: clover3/Chair
def get_payload_as_token(payload: Payload):
    """Repackage a Payload's fields into a PayloadAsTokens, one to one."""
    return PayloadAsTokens(
        passage=payload.passage,
        text1=payload.text1,
        text2=payload.text2,
        data_id=payload.data_id,
        is_correct=payload.is_correct,
    )