def convert_sub_token(tokenizer, r: Payload) -> PayloadAsTokens:
    """Re-tokenize a Payload into sub-token (wordpiece) form.

    The passage is already a token list and is expanded token-by-token;
    text1/text2 are raw strings and are tokenized directly. All other
    fields are carried over unchanged.
    """
    passage: List[str] = tokenize_from_tokens(tokenizer, r.passage)
    first_text: List[str] = tokenizer.tokenize(r.text1)
    second_text: List[str] = tokenizer.tokenize(r.text2)
    return PayloadAsTokens(
        passage=passage,
        text1=first_text,
        text2=second_text,
        data_id=r.data_id,
        is_correct=r.is_correct,
    )
def _convert_sub_token(self, r: QCKInstance) -> PayloadAsTokens:
    """Convert a QCKInstance into sub-tokenized PayloadAsTokens form.

    doc_tokens are expanded from tokens to sub-tokens; the query and
    candidate texts are tokenized from raw strings.
    """
    tok = self.tokenizer
    return PayloadAsTokens(
        passage=tokenize_from_tokens(tok, r.doc_tokens),
        text1=tok.tokenize(r.query_text),
        text2=tok.tokenize(r.candidate_text),
        data_id=r.data_id,
        is_correct=r.is_correct,
    )
def convert(k: KDP) -> Iterable[PayloadAsTokens]:
    """Pair one knowledge passage (KDP) with every (query, candidate) pair.

    Yields one PayloadAsTokens per (query, candidate) combination, tagging
    each with a data_id derived from the light query/candidate/kdp info.

    FIX: the original passed arguments positionally as
    (query.tokens, c.tokens, k_tokens, ...), which put the query tokens in
    the `passage` slot and the KDP sub-tokens in `text2`. Every sibling
    converter in this file uses passage=kdp sub-tokens, text1=query,
    text2=candidate; keyword arguments below restore that order.
    """
    k_tokens = tokenize_from_tokens(self.tokenizer, k.tokens)
    for query in self.queries:
        for c in self.candidates_dict[query.query_id]:
            info = {
                'query': light_query(query),
                'candidate': light_candidate(c),
                'kdp': light_kdp(k),
            }
            yield PayloadAsTokens(
                passage=k_tokens,
                text1=query.tokens,
                text2=c.tokens,
                data_id=data_id_manager.assign(info),
                is_correct=self._is_correct(query, c),
            )
def encode_fn(e: QCKCompactEntry) -> OrderedDict:
    """Encode one compact (query, candidate, qk-output) entry into model features.

    The passage sub-tokens come pre-computed on the qk output entry; query and
    candidate texts are tokenized here. The label is a placeholder 0 — this
    path is used for inference-style encoding, not training labels.
    """
    query, candidate, qk_out_entry = e
    candidate: QCKCandidate = candidate
    info = {
        'query': query,
        'candidate': candidate,
        'kdp': qk_out_entry.kdp,
    }
    payload = PayloadAsTokens(
        passage=qk_out_entry.passage_tokens,
        text1=cache_tokenizer.tokenize(query.text),
        text2=cache_tokenizer.tokenize(candidate.text),
        data_id=data_id_man.assign(info),
        is_correct=0,
    )
    return encode_two_inputs(max_seq_length, tokenizer, payload)
def get_insts_per_candidate(candidate: QCKCandidateWToken, query: QCKQueryWToken, kdp_list: List[KDPWToken]):
    """Build one PayloadAsTokens per KDP for a fixed (query, candidate) pair.

    Each instance carries the KDP sub-tokens as the passage and the
    pre-tokenized query/candidate tokens, with a data_id assigned from the
    light (serialization-friendly) views of all three objects.
    """
    def make_inst(kdp) -> PayloadAsTokens:
        info = {
            'query': get_light_qckquery(query),
            'candidate': get_light_qckcandidate(candidate),
            'kdp': get_light_kdp(kdp),
        }
        return PayloadAsTokens(
            passage=kdp.sub_tokens,
            text1=query.tokens,
            text2=candidate.tokens,
            data_id=data_id_manager.assign(info),
            is_correct=self._is_correct(query, candidate))

    return [make_inst(kdp) for kdp in kdp_list]
def encode(e: Tuple[int, int, List[Dict]]):
    """Yield one PayloadAsTokens per passage for a (claim, perspective) pair.

    `e` is (claim id, perspective id, passage dicts). The claim and
    perspective texts are tokenized once; each passage contributes its
    pre-tokenized 'passage' entry as the passage slot. is_correct is a
    placeholder 0 (labels are not assigned here).

    Improvement: the claim text and perspective text were re-fetched
    (`cid_to_text[cid]` / `perspective_getter(pid)`) on every loop
    iteration although they are loop-invariant and already fetched for
    tokenization; they are now computed once and reused.
    """
    cid, pid, passages = e
    c_text = cid_to_text[cid]
    p_text = perspective_getter(pid)
    text1 = tokenize(c_text)
    text2 = tokenize(p_text)
    for passage_idx, passage in enumerate(passages):
        info = {
            'cid': cid,
            'pid': pid,
            'passage_idx': passage_idx,
            'passage': passage['passage'],
            'c_text': c_text,
            'p_text': p_text,
        }
        yield PayloadAsTokens(
            passage=passage['passage'],
            text1=text1,
            text2=text2,
            data_id=data_id_man.assign(info),
            is_correct=0,
        )
def generate_instances(self, claim: Dict, data_id_manager) -> List[PayloadAsTokens]:
    """Create one instance per (perspective, passage) pair for a claim.

    For each candidate perspective of the claim, pairs its tokens with every
    passage's sub-tokens; is_correct marks whether the perspective appears in
    any gold cluster for this claim.

    Improvement: `tokenize_from_tokens` was re-run on every passage for every
    perspective although passage sub-tokens do not depend on the perspective;
    they are now pre-computed once per passage (same pattern the `convert`
    generator in this file uses with its `p_sub_tokens` list).
    """
    cid = claim['cId']
    claim_tokens = self.tokenizer.tokenize(claim['text'])
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    # Sub-tokenize each passage once; invariant across perspectives.
    # NOTE(review): `left` looks like a first-of-pair projection — confirm it is pure.
    passage_subtokens_list = [
        tokenize_from_tokens(self.tokenizer, passage)
        for passage in left(passages)
    ]
    output = []
    for pid in perspectives:
        is_correct = any(pid in cluster for cluster in self.gold[cid])
        perspective_tokens = self.tokenizer.tokenize(perspective_getter(pid))
        for passage_idx, passage_subtokens in enumerate(passage_subtokens_list):
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            output.append(PayloadAsTokens(
                passage_subtokens,
                perspective_tokens,
                claim_tokens,
                data_id_manager.assign(info),
                is_correct))
    return output
def convert(
        pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsTokens]:
    """Expand one (query, passage-list) pair into per-candidate payloads.

    Passage sub-tokens are computed once per passage, then crossed with
    every candidate of the query. Emits a console warning dump when the
    expected instance count exceeds one million.
    """
    query, passages = pair
    tokenizer = self.tokenizer
    q_tokens: List[str] = tokenizer.tokenize(query.text)
    candidates = self.candidates_dict[query.query_id]

    # Warn when the candidate x passage cross product is suspiciously large.
    if len(passages) * len(candidates) > 1000 * 1000:
        print(query)
        print(len(passages))
        print(len(candidates))

    def to_sub_tokens(p) -> List[str]:
        # When kdp_as_sub_token is set, passage tokens are already sub-tokens.
        if self.kdp_as_sub_token:
            return p.tokens
        return tokenize_from_tokens(tokenizer, p.tokens)

    p_sub_tokens = [to_sub_tokens(p) for p in passages]

    for c in candidates:
        c_tokens: List[str] = c.get_tokens(tokenizer)
        for p_idx, passage in enumerate(passages):
            info = {
                'query': get_light_qckquery(query),
                'candidate': get_light_qckcandidate(c),
                'kdp': get_light_kdp(passage),
            }
            yield PayloadAsTokens(
                passage=p_sub_tokens[p_idx],
                text1=q_tokens,
                text2=c_tokens,
                data_id=data_id_manager.assign(info),
                is_correct=self._is_correct(query, c))
def get_payload_as_token(payload: Payload):
    """Copy a Payload's fields one-to-one into a PayloadAsTokens."""
    return PayloadAsTokens(
        passage=payload.passage,
        text1=payload.text1,
        text2=payload.text2,
        data_id=payload.data_id,
        is_correct=payload.is_correct,
    )