def generate(self, query_list, data_id_manager) -> Iterator[ClassificationInstanceWDataID]: neg_k = self.neg_k for query_id in query_list: if query_id not in self.judgement: continue qck_query = QCKQuery(query_id, "") judgement = self.judgement[query_id] query = self.queries[query_id] query_tokens = self.tokenizer.tokenize(query) ranked_list = self.galago_rank[query_id] ranked_list = ranked_list[:neg_k] target_docs = set(judgement.keys()) target_docs.update([e.doc_id for e in ranked_list]) print("Total of {} docs".format(len(target_docs))) for doc_id in target_docs: tokens = self.data[doc_id] passage_list = self.encoder.encode(query_tokens, tokens) label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0 if not label: continue candidate = QCKCandidate(doc_id, "") for idx, (tokens, seg_ids) in enumerate(passage_list): info = { 'query': get_light_qckquery(qck_query), 'candidate': get_light_qckcandidate(candidate), 'idx': idx, } data_id = data_id_manager.assign(info) inst = ClassificationInstanceWDataID(tokens, seg_ids, label, data_id) yield inst
def convert(info): new_info = { 'query': QCKQuery(str(info['qid']), ""), 'candidate': QCKCandidate(str(info['cid']), ""), 'kdp': KnowledgeDocumentPart("", 0, 0, []), } return new_info
def get_candidate_for_query(query: QCKQuery): res = get_evidence_from_pool(query.text, 60) output = [] for text, e_id, score in res: c = QCKCandidate(str(e_id), text) output.append(c) return output
def transform( t: Tuple[ArguDataPoint, Passage, bool]) -> Tuple[QCKQuery, QCKCandidate, bool]: problem, candidate, is_correct = t return QCKQuery(problem.text1.id.id, problem.text1.text), \ QCKCandidate(candidate.id.id, candidate.text), \ is_correct
def generate(self, query_list, data_id_manager) -> Iterator[QueryDocInstance]: neg_k = self.neg_k for query_id in query_list: if query_id not in self.judgement: continue qck_query = QCKQuery(query_id, "") judgement = self.judgement[query_id] query = self.queries[query_id] query_tokens = self.tokenizer.tokenize(query) ranked_list = self.galago_rank[query_id] ranked_list = ranked_list[:neg_k] target_docs = set(judgement.keys()) target_docs.update([e.doc_id for e in ranked_list]) print("Total of {} docs".format(len(target_docs))) for doc_id in target_docs: tokens = self.data[doc_id][:self.doc_max_length] label = 1 if doc_id in judgement and judgement[doc_id] > 0 else 0 if self.pos_only and not label: continue candidate = QCKCandidate(doc_id, "") info = { 'query': get_light_qckquery(qck_query), 'candidate': get_light_qckcandidate(candidate), 'q_term_len': len(query_tokens), } data_id = data_id_manager.assign(info) inst = QueryDocInstance(query_tokens, tokens, label, data_id) yield inst
def generate(self, data_id_manager, qids): missing_cnt = 0 success_docs = 0 missing_doc_qid = [] for qid in qids: if qid not in self.resource.get_doc_for_query_d(): # assert not self.resource.query_in_qrel(qid) continue query_text = self.resource.get_query_text(qid) bert_tokens_d = self.resource.get_bert_tokens_d(qid) stemmed_tokens_d = self.resource.get_stemmed_tokens_d(qid) for doc_id in self.resource.get_doc_for_query_d()[qid]: label = self.resource.get_label(qid, doc_id) try: bert_title_tokens, bert_body_tokens_list = bert_tokens_d[ doc_id] stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[ doc_id] insts: List[Tuple[List, List]]\ = self.encoder.encode( query_text, stemmed_title_tokens, stemmed_body_tokens_list, bert_title_tokens, bert_body_tokens_list ) for passage_idx, passage in enumerate(insts): tokens_seg, seg_ids = passage assert type(tokens_seg[0]) == str assert type(seg_ids[0]) == int data_id = data_id_manager.assign({ 'query': QCKQuery(qid, ""), 'candidate': QCKCandidate(doc_id, ""), 'passage_idx': passage_idx, }) inst = ClassificationInstanceWDataID( tokens_seg, seg_ids, label, data_id) yield inst success_docs += 1 except KeyError: missing_cnt += 1 missing_doc_qid.append(qid) if missing_cnt > 10: print(missing_doc_qid) print("success: ", success_docs) raise KeyError
def generate(self, data_id_manager, qids): missing_cnt = 0 success_docs = 0 n_passage = 0 for qid in qids: if qid not in self.resource.candidate_doc_d: assert qid not in self.resource.qrel.qrel_d continue tokens_d: Dict[str, Tuple[List, List]] = self.resource.get_doc_tokens_d(qid) q_tokens = self.resource.get_q_tokens(qid) data_size_maybe = 0 for title_tokens, body_tokens in tokens_d.values(): data_size_maybe += len(title_tokens) data_size_maybe += len(body_tokens) for doc_id in self.resource.candidate_doc_d[qid]: label = self.resource.get_label(qid, doc_id) try: title_tokens, body_tokens = tokens_d[doc_id] insts: List[Tuple[List, List]] = self.doc_encoder.encode(q_tokens, title_tokens, body_tokens) for passage_idx, passage in enumerate(insts): tokens_seg, seg_ids = passage assert type(tokens_seg[0]) == str assert type(seg_ids[0]) == int data_id = data_id_manager.assign({ 'query': QCKQuery(qid, ""), 'candidate': QCKCandidate(doc_id, ""), 'passage_idx': passage_idx, }) inst = ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id) n_passage += 1 yield inst # if n_passage % 1000 == 0: # tprint("n_passage : {}".format(n_passage)) # tprint('gc.get_count()', gc.get_count()) # tprint('gc.get_stats', gc.get_stats()) success_docs += 1 except KeyError: missing_cnt += 1 if missing_cnt > 10: print("success: ", success_docs) raise KeyError print(" {} of {} has long title".format(self.doc_encoder.long_title_cnt, self.doc_encoder.total_doc_cnt))
def generate(self, query_list, data_id_manager): all_insts = [] for query_id in query_list: if query_id not in self.galago_rank: continue query = self.queries[query_id] qck_query = QCKQuery(query_id, "") query_tokens = self.tokenizer.tokenize(query) for doc_id, _, _ in self.galago_rank[query_id][:self.top_k]: tokens = self.data[doc_id] passage_list = self.encoder.encode(query_tokens, tokens) candidate = QCKCandidate(doc_id, "") for idx, (tokens, seg_ids) in enumerate(passage_list): info = { 'query': get_light_qckquery(qck_query), 'candidate': get_light_qckcandidate(candidate), 'idx': idx } data_id = data_id_manager.assign(info) inst = Instance(tokens, seg_ids, data_id, 0) all_insts.append(inst) return all_insts
def generate(self, query_list, data_id_manager) -> Iterator[QueryDocInstance]: neg_k = self.neg_k for query_id in query_list: if query_id not in self.judgement: continue qck_query = QCKQuery(query_id, "") judgement = self.judgement[query_id] ranked_list = self.galago_rank[query_id] ranked_list = ranked_list[:neg_k] target_docs = set() docs_in_ranked_list = [e.doc_id for e in ranked_list] target_docs.update(docs_in_ranked_list) if self.include_all_judged: docs_in_judgements = judgement.keys() target_docs.update(docs_in_judgements) print("Total of {} docs".format(len(target_docs))) for doc_id in target_docs: for tas in self.encoder.encode(query_id, doc_id): label = 1 if doc_id in judgement and judgement[ doc_id] > 0 else 0 # if label: # bprint(" -> Label={}".format(label)) # bflush() # else: # bempty() candidate = QCKCandidate(doc_id, "") info = { 'query': get_light_qckquery(qck_query), 'candidate': get_light_qckcandidate(candidate), } data_id = data_id_manager.assign(info) inst = ClassificationInstanceWDataID.make_from_tas( tas, label, data_id) yield inst
def generate(self, data_id_manager, qids): missing_cnt = 0 success_docs = 0 missing_doc_qid = [] for qid in qids: if qid not in self.resource.candidate_doc_d: assert qid not in self.resource.qrel.qrel_d continue tokens_d = self.resource.get_doc_tokens_d(qid) q_tokens = self.resource.get_q_tokens(qid) for doc_id in self.resource.get_candidate_doc_d(qid): label = self.resource.get_label(qid, doc_id) try: doc_tokens = tokens_d[doc_id] insts: List[Tuple[List, List]] = self.encoder.encode(q_tokens, doc_tokens) for passage_idx, passage in enumerate(insts): tokens_seg, seg_ids = passage assert type(tokens_seg[0]) == str assert type(seg_ids[0]) == int data_id = data_id_manager.assign({ 'query': QCKQuery(qid, ""), 'candidate': QCKCandidate(doc_id, ""), 'passage_idx': passage_idx, }) inst = ClassificationInstanceWDataID(tokens_seg, seg_ids, label, data_id) yield inst success_docs += 1 except KeyError: missing_cnt += 1 missing_doc_qid.append(qid) if missing_cnt > 10: print(missing_doc_qid) print("success: ", success_docs) raise KeyError
def doc_id_to_candidate(doc_id: str) -> QCKCandidate: return QCKCandidate(doc_id, perspective_getter(int(doc_id)))
def get_qck_candidate_from_candidate_id(candidate_id: str): text = perspective_getter(int(candidate_id)) return QCKCandidate(candidate_id, text)
def cid_pid_format_to_qck(candidate_pers): candidate_dict: Dict[str, List[QCKCandidate]] = dict() for cid, candidate_pids in candidate_pers: candidate_dict[str(cid)] = \ lmap(lambda pid: QCKCandidate(str(pid), perspective_getter(pid)), candidate_pids) return candidate_dict
def light_candidate(obj: QCKCandidateWToken): return QCKCandidate(obj.id, "")
def convert_candidates(candidates: List[int]) -> List[QCKCandidate]: p_texts = lmap(perspective_getter, candidates) l: List[QCKCandidate] = [] for pid, text in zip(candidates, p_texts): l.append(QCKCandidate(str(pid), text)) return l