def generate(self, query_list, data_id_manager) -> Iterator[ClassificationInstanceWDataID]:
    """Yield positive-labeled passage instances for the given queries.

    For each query id in `query_list` that has relevance judgements, the
    candidate document set is the union of all judged docs and the top
    `self.neg_k` docs of the Galago ranked list.  Only docs with a
    positive judgement are encoded and yielded (every yielded instance
    has label 1); all other docs are skipped.

    :param query_list: iterable of query ids to process.
    :param data_id_manager: assigns a data id to each instance's info dict.
    :return: iterator of ClassificationInstanceWDataID, one per encoded passage.
    """
    neg_k = self.neg_k
    for query_id in query_list:
        if query_id not in self.judgement:
            continue
        qck_query = QCKQuery(query_id, "")
        judgement = self.judgement[query_id]
        query = self.queries[query_id]
        query_tokens = self.tokenizer.tokenize(query)
        ranked_list = self.galago_rank[query_id][:neg_k]
        target_docs = set(judgement.keys())
        target_docs.update([e.doc_id for e in ranked_list])
        print("Total of {} docs".format(len(target_docs)))
        for doc_id in target_docs:
            label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0
            # Non-positive docs were discarded anyway; checking the label
            # *before* encoding avoids the expensive passage encoding for them.
            if not label:
                continue
            doc_tokens = self.data[doc_id]
            passage_list = self.encoder.encode(query_tokens, doc_tokens)
            candidate = QCKCandidate(doc_id, "")
            for idx, (tokens, seg_ids) in enumerate(passage_list):
                info = {
                    'query': get_light_qckquery(qck_query),
                    'candidate': get_light_qckcandidate(candidate),
                    'idx': idx,
                }
                data_id = data_id_manager.assign(info)
                yield ClassificationInstanceWDataID(tokens, seg_ids, label, data_id)
def generate(self, query_list, data_id_manager) -> Iterator[QueryDocInstance]:
    """Yield one QueryDocInstance per candidate document of each query.

    Candidate docs are the union of the judged docs and the top
    `self.neg_k` Galago-ranked docs.  Doc tokens are truncated to
    `self.doc_max_length`.  When `self.pos_only` is set, docs without a
    positive judgement are skipped.
    """
    for query_id in query_list:
        if query_id not in self.judgement:
            continue
        judgement = self.judgement[query_id]
        qck_query = QCKQuery(query_id, "")
        query_tokens = self.tokenizer.tokenize(self.queries[query_id])
        top_ranked = self.galago_rank[query_id][:self.neg_k]
        target_docs = set(judgement.keys())
        target_docs.update(entry.doc_id for entry in top_ranked)
        print("Total of {} docs".format(len(target_docs)))
        for doc_id in target_docs:
            doc_tokens = self.data[doc_id][:self.doc_max_length]
            is_positive = doc_id in judgement and judgement[doc_id] > 0
            label = 1 if is_positive else 0
            if self.pos_only and not label:
                continue
            candidate = QCKCandidate(doc_id, "")
            info = {
                'query': get_light_qckquery(qck_query),
                'candidate': get_light_qckcandidate(candidate),
                'q_term_len': len(query_tokens),
            }
            yield QueryDocInstance(query_tokens, doc_tokens, label,
                                   data_id_manager.assign(info))
def collect_info_transform(data: Iterable[Tuple[QCKQuery, QCKCandidate, bool]],
                           data_id_man: DataIDManager) -> Iterable[QCInstance]:
    """Turn (query, candidate, is_correct) triples into QCInstances.

    Each instance records a data id whose info dict holds the light
    (id-only) forms of the query and candidate.
    """
    for query, candidate, is_correct in data:
        assigned_id = data_id_man.assign({
            'query': get_light_qckquery(query),
            'candidate': get_light_qckcandidate(candidate),
        })
        yield QCInstance(query.text, candidate.text, assigned_id, int(is_correct))
def get_insts_per_candidate(candidate: QCKCandidateWToken,
                            query: QCKQueryWToken,
                            kdp_list: List[KDPWToken]):
    """Build one PayloadAsTokens per knowledge document passage (kdp).

    NOTE: closure — reads `data_id_manager` and `self` from the
    enclosing scope.
    """
    instances = []
    for kdp in kdp_list:
        light_info = {
            'query': get_light_qckquery(query),
            'candidate': get_light_qckcandidate(candidate),
            'kdp': get_light_kdp(kdp),
        }
        instances.append(PayloadAsTokens(
            passage=kdp.sub_tokens,
            text1=query.tokens,
            text2=candidate.tokens,
            data_id=data_id_manager.assign(light_info),
            is_correct=self._is_correct(query, candidate),
        ))
    return instances
def get_insts_per_candidate(candidate: QCKCandidateWToken,
                            query: QCKQueryWToken,
                            kdp_list: List[KDPWToken]) -> Payload:
    """Pack the first `self.k_group_size` kdps into a single Payload.

    NOTE: closure — reads `data_id_manager` and `self` from the
    enclosing scope.
    """
    selected = kdp_list[:self.k_group_size]
    token_groups = [kdp.sub_tokens for kdp in selected]
    info = {
        'query': get_light_qckquery(query),
        'candidate': get_light_qckcandidate(candidate),
        'kdpl': lmap(get_light_kdp, selected),
    }
    return Payload(
        kdp_list=token_groups,
        text1=query.tokens,
        text2=candidate.tokens,
        data_id=data_id_manager.assign(info),
        is_correct=self._is_correct(query, candidate),
    )
def generate(self, query_list, data_id_manager):
    """Encode the top-`self.top_k` ranked docs of each query into Instances.

    Every instance gets label 0 (unlabeled / prediction-time data);
    queries absent from the Galago ranking are skipped.  Returns the
    full list of instances.
    """
    instances = []
    for query_id in query_list:
        if query_id not in self.galago_rank:
            continue
        qck_query = QCKQuery(query_id, "")
        query_tokens = self.tokenizer.tokenize(self.queries[query_id])
        for doc_id, _, _ in self.galago_rank[query_id][:self.top_k]:
            candidate = QCKCandidate(doc_id, "")
            passage_list = self.encoder.encode(query_tokens, self.data[doc_id])
            for idx, (tokens, seg_ids) in enumerate(passage_list):
                info = {
                    'query': get_light_qckquery(qck_query),
                    'candidate': get_light_qckcandidate(candidate),
                    'idx': idx,
                }
                instances.append(
                    Instance(tokens, seg_ids, data_id_manager.assign(info), 0))
    return instances
def generate(self, query_list, data_id_manager) -> Iterator[ClassificationInstanceWDataID]:
    """Yield a classification instance per encoded segment of each candidate doc.

    Candidate docs are the top `self.neg_k` Galago-ranked docs, extended
    with every judged doc when `self.include_all_judged` is set.

    Fixed: the return annotation said Iterator[QueryDocInstance], but the
    function yields ClassificationInstanceWDataID.  Also removed dead
    commented-out debug code and hoisted the per-doc label computation
    out of the per-segment loop (it does not depend on the segment).
    """
    neg_k = self.neg_k
    for query_id in query_list:
        if query_id not in self.judgement:
            continue
        qck_query = QCKQuery(query_id, "")
        judgement = self.judgement[query_id]
        ranked_list = self.galago_rank[query_id][:neg_k]
        target_docs = set(e.doc_id for e in ranked_list)
        if self.include_all_judged:
            target_docs.update(judgement.keys())
        print("Total of {} docs".format(len(target_docs)))
        for doc_id in target_docs:
            # Label depends only on the doc, so compute it once per doc
            # rather than once per encoded segment.
            label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0
            candidate = QCKCandidate(doc_id, "")
            for tas in self.encoder.encode(query_id, doc_id):
                info = {
                    'query': get_light_qckquery(qck_query),
                    'candidate': get_light_qckcandidate(candidate),
                }
                data_id = data_id_manager.assign(info)
                yield ClassificationInstanceWDataID.make_from_tas(tas, label, data_id)
def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[PayloadAsIds]:
    """Yield a PayloadAsIds for every (candidate, passage) combination.

    Query, candidate, and passage texts are all converted to input ids
    up front; passages are converted once, outside the candidate loop.

    Fixed: the return annotation said Iterable[PayloadAsTokens], but the
    function yields PayloadAsIds.

    NOTE: closure — reads `self` and `data_id_manager` from the
    enclosing scope.
    """
    query, passages = pair
    tokenizer = self.tokenizer
    q_tokens: List[int] = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(query.text))
    candidates = self.candidates_dict[query.query_id]
    # Warn (via prints) when the cross product is unexpectedly large.
    num_inst_expectation = len(passages) * len(candidates)
    if num_inst_expectation > 1000 * 1000:
        print(query)
        print(len(passages))
        print(len(candidates))
    # Convert each passage to input ids exactly once.
    passage_input_ids_list = []
    for passage in passages:
        if self.kdp_as_sub_token:
            passage_subtokens = passage.tokens
        else:
            passage_subtokens = tokenize_from_tokens(tokenizer, passage.tokens)
        passage_input_ids_list.append(
            tokenizer.convert_tokens_to_ids(passage_subtokens))
    for c in candidates:
        c_tokens: List[int] = tokenizer.convert_tokens_to_ids(
            c.get_tokens(tokenizer))
        for p_idx, passage in enumerate(passages):
            info = {
                'query': get_light_qckquery(query),
                'candidate': get_light_qckcandidate(c),
                'kdp': get_light_kdp(passage)
            }
            # Renamed from passage_subtokens: these are input ids, not tokens.
            passage_ids = passage_input_ids_list[p_idx]
            yield PayloadAsIds(passage=passage_ids,
                               text1=q_tokens,
                               text2=c_tokens,
                               data_id=data_id_manager.assign(info),
                               is_correct=self._is_correct(query, c))
def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[Payload]:
    """Yield a Payload for every (candidate, kdp) pairing of a query.

    Kdp sub-tokens are computed once up front; each payload also carries
    the query-kdp relevance score.

    NOTE: closure — reads `self` and `data_id_manager` from the
    enclosing scope.
    """
    query, kdp_list = pair
    tokenizer = self.tokenizer
    q_tokens: List[str] = tokenizer.tokenize(query.text)
    candidates = self.candidates_dict[query.query_id]
    # Print diagnostics when the cross product is unexpectedly large.
    expected_count = len(kdp_list) * len(candidates)
    if expected_count > 1000 * 1000:
        print(query)
        print(len(kdp_list))
        print(len(candidates))
    if self.kdp_as_sub_token:
        p_sub_tokens = [kdp.tokens for kdp in kdp_list]
    else:
        p_sub_tokens = [tokenize_from_tokens(tokenizer, kdp.tokens)
                        for kdp in kdp_list]
    for c in candidates:
        c_tokens: List[str] = c.get_tokens(tokenizer)
        for kdp_idx, kdp in enumerate(kdp_list):
            info = {
                'query': get_light_qckquery(query),
                'candidate': get_light_qckcandidate(c),
                'kdp': get_light_kdp(kdp)
            }
            yield Payload(
                passage=p_sub_tokens[kdp_idx],
                text1=q_tokens,
                text2=c_tokens,
                data_id=data_id_manager.assign(info),
                is_correct=self._is_correct(query, c),
                kdp_score=self.get_rel_score(query, kdp),
            )