from typing import Dict, Iterable, List, Tuple

# Project-specific types and helpers (Payload, PayloadAsTokens, QKUnit, KDP,
# tokenize_from_tokens, ...) are defined elsewhere in the repo.


def convert_sub_token(tokenizer, r: Payload) -> PayloadAsTokens:
    # Passage tokens are word-level and must be re-tokenized into subwords;
    # the two text fields are raw strings and are tokenized directly.
    passage_subtokens = tokenize_from_tokens(tokenizer, r.passage)
    tokens1: List[str] = tokenizer.tokenize(r.text1)
    tokens2: List[str] = tokenizer.tokenize(r.text2)
    return PayloadAsTokens(passage=passage_subtokens,
                           text1=tokens1,
                           text2=tokens2,
                           data_id=r.data_id,
                           is_correct=r.is_correct)
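# Almost every converter below calls tokenize_from_tokens. A minimal sketch of
# the behavior they assume is given here for reference only: word-level tokens
# are re-tokenized into the tokenizer's subword vocabulary one token at a time.
# The name tokenize_from_tokens_sketch is hypothetical; the real implementation
# lives elsewhere in the repo and may differ.
def tokenize_from_tokens_sketch(tokenizer, tokens: List[str]) -> List[str]:
    subtokens: List[str] = []
    for token in tokens:
        # tokenizer.tokenize maps one word to one or more subword tokens.
        subtokens.extend(tokenizer.tokenize(token))
    return subtokens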
def add_tokens_to_qk_unit(qk_unit: QKUnit, tokenizer) -> QKUnitWToken:
    query, kdp_list = qk_unit
    q = QCKQueryWToken(query.query_id, query.text, tokenizer.tokenize(query.text))
    new_kdp_list = []
    for kdp in kdp_list:
        sub_tokens = tokenize_from_tokens(tokenizer, kdp.tokens)
        kdp_w_tokens = KDPWToken(kdp.doc_id, kdp.passage_idx, kdp.start_location,
                                 kdp.tokens, sub_tokens)
        new_kdp_list.append(kdp_w_tokens)
    return q, new_kdp_list
def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[QKInstance]:
    # Nested helper: self.tokenizer, data_id_manager, and self.get_label
    # are captured from the enclosing scope.
    query, passages = pair
    for passage in passages:
        info = {
            'query': query,
            'kdp': passage,
        }
        yield QKInstance(query.text,
                         tokenize_from_tokens(self.tokenizer, passage.tokens),
                         data_id_manager.assign(info),
                         self.get_label(query, passage))
def _convert_sub_token(self, r: QCKInstance) -> PayloadAsTokens:
    tokenizer = self.tokenizer
    passage_subtokens = tokenize_from_tokens(tokenizer, r.doc_tokens)
    tokens1: List[str] = tokenizer.tokenize(r.query_text)
    tokens2: List[str] = tokenizer.tokenize(r.candidate_text)
    return PayloadAsTokens(
        passage=passage_subtokens,
        text1=tokens1,
        text2=tokens2,
        data_id=r.data_id,
        is_correct=r.is_correct,
    )
def convert(k: KDP) -> Iterable[PayloadAsTokens]:
    # Nested helper: self and data_id_manager are captured from the
    # enclosing scope. The KDP is tokenized once, then paired with every
    # (query, candidate) combination.
    k_tokens = tokenize_from_tokens(self.tokenizer, k.tokens)
    for query in self.queries:
        for c in self.candidates_dict[query.query_id]:
            info = {
                'query': light_query(query),
                'candidate': light_candidate(c),
                'kdp': light_kdp(k),
            }
            yield PayloadAsTokens(query.tokens,
                                  c.tokens,
                                  k_tokens,
                                  data_id_manager.assign(info),
                                  self._is_correct(query, c))
def _convert_sub_token(self, r: Instance) -> InstanceTokenized:
    tokenizer = self.tokenizer
    passage_subtokens_list = [tokenize_from_tokens(tokenizer, p)
                              for p in r.doc_tokens_list]
    tokens1: List[str] = tokenizer.tokenize(r.query_text)
    tokens2: List[str] = tokenizer.tokenize(r.candidate_text)
    return InstanceTokenized(
        passage_subtokens_list=passage_subtokens_list,
        q_tokens=tokens1,
        c_tokens=tokens2,
        data_id=r.data_id,
        is_correct=r.is_correct,
    )
def generate_instances(self, claim: Dict, data_id_manager) -> List[PayloadAsTokens]:
    cid = claim['cId']
    claim_tokens = self.tokenizer.tokenize(claim['text'])
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    output = []
    for pid in perspectives:
        # A perspective is a correct match if it appears in any gold
        # cluster for this claim.
        is_correct = any(pid in cluster for cluster in self.gold[cid])
        perspective = perspective_getter(pid)
        perspective_tokens = self.tokenizer.tokenize(perspective)
        for passage_idx, passage in enumerate(left(passages)):
            passage_subtokens = tokenize_from_tokens(self.tokenizer, passage)
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            p = PayloadAsTokens(passage_subtokens,
                                perspective_tokens,
                                claim_tokens,
                                data_id_manager.assign(info),
                                is_correct)
            output.append(p)
    return output
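# left is a repo helper not shown here. The call pattern suggests passages are
# stored as pairs (e.g. (tokens, score)) and left projects the first element.
# A sketch of that assumed behavior; left_sketch is a hypothetical name:
def left_sketch(pairs: List[Tuple]) -> List:
    return [first for first, _ in pairs]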
def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsIds]:
    # Nested helper: self and data_id_manager are captured from the
    # enclosing scope. Note the return type: this variant emits
    # PayloadAsIds (vocabulary ids), not PayloadAsTokens.
    query, passages = pair
    tokenizer = self.tokenizer
    q_tokens: List[int] = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(query.text))
    candidates = self.candidates_dict[query.query_id]
    # Log the offending query when the passage x candidate product
    # would exceed a million instances.
    num_inst_expectation = len(passages) * len(candidates)
    if num_inst_expectation > 1000 * 1000:
        print(query)
        print(len(passages))
        print(len(candidates))
    # Convert each passage to input ids once, outside the candidate loop.
    passage_input_ids_list = []
    for p_idx, passage in enumerate(passages):
        if self.kdp_as_sub_token:
            passage_subtokens = passage.tokens
        else:
            passage_subtokens = tokenize_from_tokens(tokenizer, passage.tokens)
        passage_input_ids_list.append(
            tokenizer.convert_tokens_to_ids(passage_subtokens))
    for c in candidates:
        c_tokens: List[int] = tokenizer.convert_tokens_to_ids(
            c.get_tokens(tokenizer))
        for p_idx, passage in enumerate(passages):
            info = {
                'query': get_light_qckquery(query),
                'candidate': get_light_qckcandidate(c),
                'kdp': get_light_kdp(passage),
            }
            passage_input_ids = passage_input_ids_list[p_idx]
            inst = PayloadAsIds(passage=passage_input_ids,
                                text1=q_tokens,
                                text2=c_tokens,
                                data_id=data_id_manager.assign(info),
                                is_correct=self._is_correct(query, c))
            yield inst
def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[Payload]:
    # Nested helper: self and data_id_manager are captured from the
    # enclosing scope. Same structure as the id-based converter above,
    # but keeps subword strings and attaches a per-KDP relevance score.
    query, kdp_list = pair
    tokenizer = self.tokenizer
    q_tokens: List[str] = tokenizer.tokenize(query.text)
    candidates = self.candidates_dict[query.query_id]
    num_inst_expectation = len(kdp_list) * len(candidates)
    if num_inst_expectation > 1000 * 1000:
        print(query)
        print(len(kdp_list))
        print(len(candidates))
    # Tokenize each KDP once, outside the candidate loop.
    p_sub_tokens = []
    for p_idx, kdp in enumerate(kdp_list):
        if self.kdp_as_sub_token:
            passage_subtokens = kdp.tokens
        else:
            passage_subtokens = tokenize_from_tokens(tokenizer, kdp.tokens)
        p_sub_tokens.append(passage_subtokens)
    for c in candidates:
        c_tokens: List[str] = c.get_tokens(tokenizer)
        for p_idx, kdp in enumerate(kdp_list):
            info = {
                'query': get_light_qckquery(query),
                'candidate': get_light_qckcandidate(c),
                'kdp': get_light_kdp(kdp),
            }
            passage_subtokens = p_sub_tokens[p_idx]
            inst = Payload(
                passage=passage_subtokens,
                text1=q_tokens,
                text2=c_tokens,
                data_id=data_id_manager.assign(info),
                is_correct=self._is_correct(query, c),
                kdp_score=self.get_rel_score(query, kdp),
            )
            yield inst
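# Design note: both converters above precompute the per-passage token (or id)
# lists before entering the candidate loop. Every candidate is paired with
# every passage, so tokenizing inside the inner loop would repeat identical
# work len(candidates) times; the num_inst_expectation check likewise flags
# query/candidate/passage combinations whose product exceeds a million
# instances before that work begins.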
def tokenize_from_tokens_fn(tokens):
    # Closure that fixes the tokenizer argument, so callers can pass
    # token lists without threading the tokenizer through.
    return tokenize_from_tokens(tokenizer, tokens)
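# Usage sketch with hypothetical data: because the tokenizer is captured by
# the closure, the wrapper can be mapped directly over word-token lists.
token_lists = [["Seattle", "weather"], ["climate", "change"]]
subword_lists = list(map(tokenize_from_tokens_fn, token_lists))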