def work(self, job_id):
    qid_list = self.query_group[job_id]
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        # Called for documents that failed to load; count them by relevance.
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    for qid in qid_list:
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        if qid not in self.candidate_docs_d:
            continue
        target_docs = self.candidate_docs_d[qid]
        tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                tokens_d[d.doc_id] = []
        if len(tokens_d) < len(target_docs):
            log_variables(job_id, qid, tokens_d, target_docs)
            not_found_docs = [doc_id for doc_id in target_docs if doc_id not in tokens_d]
            print("{} of {} not found: {}".format(len(not_found_docs), len(target_docs), not_found_docs))
def main():
    my_variable = 10
    # print(varname(g_variable))
    # print(varname(my_variable))
    # avkai = 0
    log_variables(g_variable)
    log_variables(g_variable, my_variable)
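# A minimal sketch of how a log_variables-style helper could recover argument
# names via caller-frame inspection. This is an assumption for illustration,
# not the repo's actual log_variables implementation; the name
# log_variables_sketch is hypothetical.
import inspect
import re


def log_variables_sketch(*values):
    # Read the calling line's source text to recover the argument expressions.
    frame = inspect.currentframe().f_back
    code_context = inspect.getframeinfo(frame).code_context
    call_line = code_context[0] if code_context else ""
    match = re.search(r"log_variables_sketch\((.*)\)", call_line)
    names = [n.strip() for n in match.group(1).split(",")] if match else []
    for name, value in zip(names, values):
        print("{}: {}".format(name, value))


g_variable = 3
my_variable = 10
log_variables_sketch(g_variable, my_variable)  # prints "g_variable: 3" then "my_variable: 10"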
def work(self, job_id):
    qid_list = self.resource.query_group[job_id]
    for qid in qid_list:
        if qid not in self.resource.candidate_doc_d:
            continue
        target_docs = self.resource.candidate_doc_d[qid]
        tokens_d = self.resource.get_doc_tokens_d(qid)
        # Verify that every candidate document has pre-built tokens.
        for doc_id in target_docs:
            if doc_id not in tokens_d:
                log_variables(qid, target_docs)
                print("Not found: ", doc_id)
def work(self, job_id):
    qid_list = self.query_group[job_id]
    ticker = TimeEstimator(len(qid_list))
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        # Called for documents that failed to load; count them by relevance.
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    def get_tf(text):
        tokens = self.tokenizer.tokenize_stem(text)
        return Counter(tokens)

    for qid in qid_list:
        if qid not in self.candidate_docs_d:
            continue
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        ticker.tick()
        target_docs = self.candidate_docs_d[qid]
        tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                title_tokens = self.tokenizer.tokenize_stem(d.title)
                body_sents = sent_tokenize(d.body)
                body_tf_list = lmap(get_tf, body_sents)
                tokens_d[d.doc_id] = (title_tokens, body_tf_list)
        if len(tokens_d) < len(target_docs):
            log_variables(job_id, qid)
            print("{} of {} not found".format(len(tokens_d), len(target_docs)))
        # Save per-query tokens: doc_id -> (title tokens, per-sentence body term frequencies).
        save_path = os.path.join(self.out_dir, str(qid))
        with open(save_path, "wb") as f:
            pickle.dump(tokens_d, f)
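# Usage sketch for the output written by work() above. The directory and query
# id below are hypothetical placeholders, not values from the repo.
import os
import pickle

out_dir = "out_tokens"   # assumed value of self.out_dir
qid = "1048585"          # hypothetical query id
with open(os.path.join(out_dir, qid), "rb") as f:
    tokens_d = pickle.load(f)
for doc_id, (title_tokens, body_tf_list) in tokens_d.items():
    print(doc_id, "title tokens:", len(title_tokens), "body sentences:", len(body_tf_list))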
def main():
    q1 = read_queries_at(sys.argv[1])
    q2 = read_queries_at(sys.argv[2])
    print("len(q1)", len(q1))
    print("len(q2)", len(q2))
    q2_d = dict(q2)
    perfect_match = 0
    qid_match = 0
    for query_id, query_text in q1:
        if query_id in q2_d:
            qid_match += 1
            query_text_from2 = q2_d[query_id]
            if query_text.lower() == query_text_from2.lower():
                perfect_match += 1
            else:
                print(query_id)
                print(query_text)
                print(query_text_from2)
    log_variables(perfect_match, qid_match)
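# A minimal sketch (assumption, not the repo's actual read_queries_at) of a
# loader compatible with main() above: it reads a TSV file of
# "query_id<TAB>query_text" lines and returns (id, text) pairs, which is the
# shape main() expects when it builds dict(q2).
def read_queries_at_sketch(path):
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            query_id, query_text = line.rstrip("\n").split("\t", 1)
            pairs.append((query_id, query_text))
    return pairs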