def gen_ranking_list(self, method, _callback, paras, max_docs=1000):
    """
    Score documents for every single-term query and write/evaluate the run.

    For each query of length 1, the per-document rows returned by
    ``GenSqaDocDetails.get_qid_details(qid)`` are scored with ``_callback``
    and collected as ``(docid, score)`` pairs.  Only the first ``max_docs``
    rows of each query are used.  The results are then handed to
    ``self.output_results(res, method)`` followed by ``self.eval(method)``.

    NOTE(review): the original author's note says the rows come from
    /collection_path/detailed_doc_stats/ and are the top documents
    retrieved by the Dirichlet language model method; that cannot be
    confirmed from this block.  The same note mentioned 10,000 documents,
    but the cutoff actually applied here has always been 1000.

    :param method: identifier passed through to ``output_results`` and
        ``eval``.
    :param _callback: scoring function; called with one list made of a deep
        copy of ``paras`` followed by
        ``[total_tf, doc_len, avdl, ctf, total_terms, idf]``.
    :param paras: leading parameters for ``_callback``; deep-copied per
        document so the callback cannot modify the caller's object.
    :param max_docs: maximum number of rows scored per query (default 1000,
        the previously hard-coded cutoff); ``None`` means no limit.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    doc_details = GenSqaDocDetails(self.collection_path)
    cs = CollectionStats(self.collection_path)
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()

    res = {}
    for qid, term in queries.items():
        # Progress output; single-argument form prints identically on
        # Python 2 and Python 3.
        print(term)
        # Term-level statistics do not depend on the document, so they are
        # looked up once per query rather than once per row.
        ctf = cs.get_term_collection_occur(term)
        idf = cs.get_term_logidf1(term)

        ranked = []
        for row in islice(doc_details.get_qid_details(qid), max_docs):
            docid = row['docid']
            total_tf = float(row['total_tf'])
            doc_len = float(row['doc_len'])
            localpara = copy.deepcopy(paras)
            localpara.extend([total_tf, doc_len, avdl, ctf, total_terms, idf])
            ranked.append((docid, _callback(localpara)))
        res[qid] = ranked

    self.output_results(res, method)
    self.eval(method)
def print_statistics(self, methods):
    """
    Print collection statistics and the derived parameters of the best runs.

    Output, in order:

    1. the value returned by ``CollectionStats.get_avdl()``;
    2. the mean, over all single-term queries, of
       ``get_term_collection_occur(term) / total_terms``;
    3. for every entry returned by
       ``Performances.gen_optimal_performances_queries(methods, qids)``:
       its label, then
       - if the label contains ``'okapi'``:
         ``b``, ``beta = 1.2 * b / avdl`` and ``c2 = 1.2 * (1 - b)``;
       - if the label contains ``'pivoted'``:
         ``s``, ``beta = s / avdl`` and ``c2 = 1 - s``.

    Each entry is indexed as ``entry[0]`` (label) and ``entry[2]`` (a
    string of the form ``"<name>:<value>"`` whose value part is parsed as a
    float); ``entry[1]`` is not used here.

    :param methods: passed through unchanged to
        ``gen_optimal_performances_queries``.
    """
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    cs = CollectionStats(self.collection_path)
    performance = Performances(self.collection_path)
    # list(...) hands the callee a real list on both Python 2 and Python 3.
    res = performance.gen_optimal_performances_queries(methods, list(queries))
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()

    # Relative collection frequency of each query term; "* 1.0" forces
    # float division under Python 2.
    collection_freq = [
        cs.get_term_collection_occur(term) * 1.0 / total_terms
        for term in queries.values()
    ]

    print(avdl)
    print(np.mean(collection_freq))

    for ele in res:
        label = ele[0]
        para = float(ele[2].split(':')[1])
        print(label)
        # The two checks are independent on purpose: a label matching both
        # substrings prints both lines.  "%s" formats with str(), the same
        # conversion the print statement applies to each item.
        if 'okapi' in label:
            print('b: %s beta: %s c2: %s'
                  % (para, 1.2 * para / avdl, 1.2 * (1 - para)))
        if 'pivoted' in label:
            print('s: %s beta: %s c2: %s'
                  % (para, para / avdl, 1 - para))