def select_doc_per_query_top50(split):
    """Write one TSV row per query: judged-positive docs followed by top-50 negatives.

    Reads the top100 ranking file for *split* and, for each query, emits
    ``query_id \t pos_doc_ids... \t neg_doc_ids...`` where negatives are the
    docs ranked below 50 that are not in the query's qrels.

    :param split: dataset split name (e.g. "train"); selects input/output files
    """
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_top50_{}.tsv".format(split))

    def pop(out_f, query_id, cur_doc_ids: List[Tuple[str, int]]):
        # Emit one row for a completed query group.
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = [doc_id for doc_id, rank in cur_doc_ids
                    if doc_id not in pos_docs and rank < 50]
        row = [query_id] + pos_docs + neg_docs
        out_f.write("\t".join(row) + "\n")

    total_line = 36701116  # known line count of the top100 file; used for ETA only
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open(save_path, "w") as out_f, open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids: List[Tuple[str, int]] = []
        for line in top100f:
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                # Topic changed: flush the previous query's accumulated docs.
                pop(out_f, last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = []
            ticker.tick()
            cur_doc_ids.append((doc_id, int(rank)))
        # BUGFIX: flush the final query group — the original dropped the
        # last query in the file because no pop() ran after the loop.
        if last_topic_id is not None:
            pop(out_f, last_topic_id, cur_doc_ids)
def select_doc_per_query(split):
    """Write one TSV row per query: positives plus up to 10x sampled negatives.

    Reads the top100 ranking file for *split*; for each query that has judged
    positives, samples up to ``10 * len(pos_docs)`` negatives from the ranked
    docs not in the qrels, and writes
    ``query_id \t pos_doc_ids... \t sampled_neg_doc_ids...``.

    :param split: dataset split name (e.g. "train"); selects input/output files
    """
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_10times_{}.tsv".format(split))

    def pop(out_f, query_id, cur_doc_ids: Set):
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = [doc_id for doc_id in cur_doc_ids if doc_id not in pos_docs]
        if pos_docs:
            # BUGFIX: cap the sample at the available negatives —
            # random.sample raises ValueError when asked for more items
            # than the population contains.
            num_neg_docs = min(10 * len(pos_docs), len(neg_docs))
            sel_docs = random.sample(neg_docs, num_neg_docs)
            row = [query_id] + pos_docs + sel_docs
            out_f.write("\t".join(row) + "\n")

    total_line = 36701116  # known line count of the top100 file; used for ETA only
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open(save_path, "w") as out_f, open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line in top100f:
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                # Topic changed: flush the previous query's accumulated docs.
                pop(out_f, last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = set()
            ticker.tick()
            cur_doc_ids.add(doc_id)
        # BUGFIX: flush the final query group — the original dropped the
        # last query in the file because no pop() ran after the loop.
        if last_topic_id is not None:
            pop(out_f, last_topic_id, cur_doc_ids)
def __init__(self, split, query_group, candidate_docs_d, out_dir):
    """Store the query grouping and candidate docs, and open split resources.

    :param split: dataset split name, forwarded to MSMarcoDataReader
    :param query_group: grouping of query ids to process
    :param candidate_docs_d: mapping of query id -> candidate doc ids
    :param out_dir: directory where outputs will be written
    """
    # Plain configuration first, then the heavier resources.
    self.query_group = query_group
    self.candidate_docs_d = candidate_docs_d
    self.out_dir = out_dir
    self.tokenizer = get_tokenizer()
    self.ms_reader = MSMarcoDataReader(split)
def collect_doc_per_query(split, target_qid):
    """Dump the text of all ranked + judged docs for a single target query.

    Skips through the top100 file until a line for *target_qid* appears,
    collects that query's ranked doc ids, unions in its qrel docs, and writes
    each doc's content — ordered by corpus-file offset so reads are
    sequential — to the per-query output path.

    :param split: dataset split name; selects the top100 file and reader
    :param target_qid: query id (line prefix) to collect docs for
    """
    ms_reader = MSMarcoDataReader(split)

    def pop(query_id, cur_doc_ids: Set):
        cur_doc_ids.update(ms_reader.qrel[query_id])
        # Sort by byte offset in the doc file so content reads are sequential.
        todo = sorted(
            ((doc_id, ms_reader.doc_offset[doc_id]) for doc_id in cur_doc_ids),
            key=get_second)
        print("{} docs".format(len(cur_doc_ids)))
        exist_or_mkdir(per_query_root)
        save_path = get_per_query_doc_path(query_id)
        with open(save_path, "w") as out_f:
            for doc_id, offset in todo:
                content: str = ms_reader.get_content(doc_id)
                out_f.write(content + "\n")

    total_line = 36701116  # known line count of the top100 file; used for ETA only
    skip = True
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line_no, line in enumerate(top100f):
            if skip:
                if not line.startswith(target_qid):
                    continue
                tprint("skip done")
                remain_lines = total_line - line_no
                ticker = TimeEstimator(remain_lines, "reading", 1000)
                skip = False
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                # BUGFIX: the original called pop() here AND again after the
                # loop with identical arguments, building the per-query file
                # twice. Break only; the single pop() below covers both the
                # topic-change and end-of-file exit paths.
                break
            ticker.tick()
            cur_doc_ids.add(doc_id)
        # BUGFIX: guard against target_qid never appearing — the original
        # would call pop(None, set()) and fail on the qrel lookup.
        if last_topic_id is not None:
            pop(last_topic_id, cur_doc_ids)
def __init__(self, split, query_group, candidate_docs_d, max_sent_length,
             max_title_length, out_dir):
    """Set up tokenizers, length limits, the data reader, and output dirs.

    :param split: dataset split name, forwarded to MSMarcoDataReader
    :param query_group: grouping of query ids to process
    :param candidate_docs_d: mapping of query id -> candidate doc ids
    :param max_sent_length: maximum sentence length to keep
    :param max_title_length: maximum title length to keep
    :param out_dir: root directory under which output sub-dirs are created
    """
    # Plain configuration.
    self.query_group = query_group
    self.candidate_docs_d = candidate_docs_d
    self.out_dir = out_dir
    self.max_sent_length = max_sent_length
    self.max_title_length = max_title_length
    # Heavier resources: two tokenizers and the corpus reader.
    self.bert_tokenizer = get_tokenizer()
    self.stem_tokenizer = PCTokenizer()
    self.ms_reader = MSMarcoDataReader(split)
    # One sub-directory per output representation; create each if absent.
    self.text_dir_name = 'text'
    self.bert_tokens_dir_name = 'bert_tokens'
    self.stemmed_tokens_dir_name = 'stemmed_tokens'
    for sub_dir in (self.text_dir_name,
                    self.bert_tokens_dir_name,
                    self.stemmed_tokens_dir_name):
        exist_or_mkdir(os.path.join(self.out_dir, sub_dir))