def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage-level candidates for prediction.

    For every Robust04 title query, takes the top-100 BM25-ranked
    documents, splits each document into passages of at most
    ``max_seq_length`` tokens, always keeps the first passage of a
    document and keeps each later passage with probability 0.1.

    Fixes over the previous version: removed dead code — the qrel path,
    the tokenizer, and the tokenized query were computed but never used.

    :param max_seq_length: maximum token length of each passage window
    :return: mapping from query id to its sampled candidate passages
    """
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        ranked_list = galago_rank[query_id][:100]
        doc_ids = [e.doc_id for e in ranked_list]
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # First passage is always kept; later ones sampled at 10%.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def make_candidate(doc_id: str) -> Iterable[QCKCandidateWToken]:
    """Yield up to ``max_passage_per_doc`` passage candidates for one document.

    Each candidate id is the document id suffixed with the passage index.
    Relies on ``token_data``, ``content_len`` and ``max_passage_per_doc``
    from the enclosing scope.
    """
    doc_tokens = token_data[doc_id]
    for part_idx, part_tokens in enumerate(enum_passage(doc_tokens, content_len)):
        if part_idx >= max_passage_per_doc:
            break
        part_id = "{}_{}".format(doc_id, part_idx)
        yield QCKCandidateWToken(part_id, "", part_tokens)
def get_candidate_for_query(query: QCKQuery):
    """Return candidate passages drawn from the evidence pool for *query*.

    Fetches up to 60 evidence entries, splits each tokenized evidence text
    into passages sized so that query + passage fit in ``max_seq_length``
    (3 tokens reserved, presumably for special tokens — confirm).
    """
    evidence = get_evidence_from_pool(query.text, 60)
    n_query_tokens = len(tokenizer.tokenize(query.text))
    passage_budget = max_seq_length - 3 - n_query_tokens
    return [
        QCKCandidateWToken(str(e_id), "", passage)
        for text, e_id, _score in evidence
        for passage in enum_passage(tokenizer.tokenize(text), passage_budget)
    ]
def convert(
        target_pair: Tuple[QCKQueryWToken, List[KDPWToken]],
        other_pairs: List[Tuple[QCKQueryWToken, List[KDPWToken]]]
) -> Iterable[Payload]:
    """Yield two Payload instances per candidate of the target query.

    For each candidate of the target query: one instance built with the
    target query's own KDP list, then one built with a randomly picked
    (query, KDP list) pair from ``other_pairs``.

    NOTE(review): uses ``self`` and ``data_id_manager`` from the enclosing
    scope — presumably a nested function inside a method; confirm there.
    """
    target_query, target_kdp_list = target_pair
    candidates = self.candidates_dict[target_query.query_id]
    candidates_w_tokens = [
        QCKCandidateWToken.from_qck_candidate(self.tokenizer, c)
        for c in candidates
    ]
    # Debug print when the candidate x KDP cross product would exceed 1M.
    num_inst_expectation = len(target_kdp_list) * len(candidates)
    if num_inst_expectation > 1000 * 1000:
        print(target_query)
        print(len(target_kdp_list))
        print(len(candidates))

    def get_insts_per_candidate(candidate: QCKCandidateWToken,
                                query: QCKQueryWToken,
                                kdp_list: List[KDPWToken]) -> Payload:
        # Pack at most k_group_size KDPs into a single instance.
        kdp_list = kdp_list[:self.k_group_size]
        kdp_token_list = []
        for p_idx, kdp in enumerate(kdp_list):
            kdp_token_list.append(kdp.sub_tokens)
        # Lightweight metadata stored under an assigned data id so the
        # instance can be traced back to its query/candidate/KDPs.
        info = {
            'query': get_light_qckquery(query),
            'candidate': get_light_qckcandidate(candidate),
            'kdpl': lmap(get_light_kdp, kdp_list)
        }
        inst = Payload(kdp_list=kdp_token_list,
                       text1=query.tokens,
                       text2=candidate.tokens,
                       data_id=data_id_manager.assign(info),
                       is_correct=self._is_correct(query, candidate))
        return inst

    for c_w_token in candidates_w_tokens:
        # Instance with the target query's own KDP list.
        yield get_insts_per_candidate(c_w_token, target_query, target_kdp_list)
        # Contrast instance with a random other query's KDP list.
        other_query, other_kdp_list = pick1(other_pairs)
        yield get_insts_per_candidate(c_w_token, other_query, other_kdp_list)
def get_candidate_all_passage_w_samping(
        max_seq_length=256,
        neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage-level candidates for training.

    For every judged query, the candidate document pool is the union of
    all judged documents and the top-``neg_k`` BM25-ranked documents.
    Each document is split into passages of at most ``max_seq_length``
    tokens; the first passage is always kept and each later passage is
    kept with probability 0.1.

    Fixes over the previous version: removed the always-false
    ``if query_id not in judgement`` check inside the loop over
    ``judgement`` itself, and dead code (queries/tokenizer/query tokens
    were computed but never used).

    :param max_seq_length: maximum token length of each passage window
    :param neg_k: number of top-ranked documents added as negatives
    :return: mapping from query id to its sampled candidate passages
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    tokens_d.update(load_robust_tokens_for_predict(4))
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id, judge_entries in judgement.items():
        # Judged docs plus top-neg_k ranked docs form the candidate pool.
        doc_ids = set(judge_entries.keys())
        ranked_list = galago_rank[query_id][:neg_k]
        doc_ids.update(e.doc_id for e in ranked_list)
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # First passage is always kept; later ones sampled at 10%.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))
        out_d[query_id] = candidate
    return out_d
def make_candidate(e_id: int) -> Iterable[QCKCandidate]:
    """Yield one token-level candidate per passage of evidence *e_id*.

    Relies on ``evi_dict``, ``tokenizer`` and ``candidate_max_len`` from
    the enclosing scope.
    """
    evidence_tokens = tokenizer.tokenize(evi_dict[e_id])
    e_id_str = str(e_id)
    for passage_tokens in enum_passage(evidence_tokens, candidate_max_len):
        yield QCKCandidateWToken(e_id_str, "", passage_tokens)
def make_candidate(doc_id: str):
    """Build one candidate from the first ``doc_len`` tokens of the document.

    Relies on ``data`` and ``doc_len`` from the enclosing scope.
    """
    truncated_tokens = data[doc_id][:doc_len]
    return QCKCandidateWToken(doc_id, "", truncated_tokens)
def get_qck_candidate_w_token(self, c: QCKCandidate) -> QCKCandidateWToken:
    """Tokenize the candidate's text and wrap it as a QCKCandidateWToken."""
    text_tokens = self.tokenizer.tokenize(c.text)
    return QCKCandidateWToken(c.id, c.text, text_tokens)