def process(self, qid, method_name, method_paras, output_fn):
    """Learn the best-MAP parameter setting for one single-term query and dump it as JSON.

    Args:
        qid: query id (key into the single-term query set).
        method_name: name of the scoring method passed through to self.learn.
        method_paras: comma-separated "key:value" pairs, e.g. "eta:0.5,beta:1.0".
                      Must contain an 'eta' entry (echoed into the output file).
        output_fn: path of the JSON result file to write.

    Side effects:
        Sets self.rel_docs; writes {'map', 'para', 'eta'} to output_fn.
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    self.rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries([qid], 1, 'dict')
    avdl = cs.get_avdl()
    total_terms = cs.get_total_terms()
    # Keyed by relevance: True -> relevant docs, False -> non-relevant docs.
    data = {True: [], False: []}
    ctf = cs.get_term_collection_occur(queries[qid])
    collection_para = {
        'avdl': avdl,
        'total_terms': total_terms,
        'ctf': ctf
    }
    for row in cs.get_qid_details(qid):
        docid = row['docid']
        total_tf = float(row['total_tf'])
        doc_len = float(row['doc_len'])
        rel_score = int(row['rel_score'])
        rel = (rel_score >= 1)
        data[rel].append({
            'docid': docid,
            'tf': total_tf,
            'ln': doc_len
        })
    # Split each pair on the FIRST ':' only, so parameter values may themselves contain ':'.
    method_para_dict = {}
    for ele in method_paras.split(','):
        key, _, value = ele.partition(':')
        method_para_dict[key] = value
    max_map, max_para = self.learn(qid, data, collection_para, method_name, method_para_dict)
    # Text mode ('w'), not 'wb': json.dump emits str, which a binary-mode
    # file rejects on Python 3; 'w' behaves identically on Python 2.
    with open(output_fn, 'w') as f:
        json.dump({'map': max_map, 'para': max_para, 'eta': method_para_dict['eta']}, f, indent=2)
def output_data_file(self):
    """Export per-query SVM-format training data plus an index file listing the data files.

    For every single-term query, writes one file under self.svm_data_root named
    after the qid, each line formatted as:
        <rel_score> qid:<qid> 1:<total_tf> 2:<doc_len>
    and appends that file's path to 'svm_data_index_file' in the collection dir.
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    index_fn = os.path.join(self.collection_path, 'svm_data_index_file')
    # Text mode ('w'), not 'wb': these writes are str, which fails on a
    # binary-mode file under Python 3; 'w' is equivalent on Python 2.
    with open(index_fn, 'w') as indexf:
        for qid in queries:
            data_fn = os.path.join(self.svm_data_root, qid)
            indexf.write('%s\n' % (data_fn))
            with open(data_fn, 'w') as f:
                for row in cs.get_qid_details(qid):
                    docid = row['docid']
                    total_tf = float(row['total_tf'])
                    doc_len = float(row['doc_len'])
                    rel_score = int(row['rel_score'])
                    f.write('%d qid:%s 1:%f 2:%f\n' % (rel_score, qid, total_tf, doc_len))
def gen_perfect_ranking_list(self, plotbins=True, numbins=60):
    """Build an "upper bound" ranking for single-term queries and evaluate it.

    Documents are scored with log(tf+1)/(log(tf+1)+log(dl)), bucketed into
    numbins equal-width score bins, bins ordered by their fraction of relevant
    documents, and up to 1000 docs per query emitted with large spaced base
    scores so the bin order dominates the final ranking.

    NOTE: the `plotbins` flag is currently unused — the binning step always runs.
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
    # Process queries in ascending IDF order of their (single) term.
    idfs = sorted(
        ((qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs),
        key=itemgetter(1),
    )
    res = {}
    for qid, idf in idfs:
        res[qid] = []
        # score -> [rel_count, total_count, first-doc record at this score]
        buckets = {}
        doc_scores = {}
        top_score = -99999999
        for row in cs.get_qid_details(qid):
            docid = row['docid']
            total_tf = float(row['total_tf'])
            doc_len = float(row['doc_len'])
            rel_score = int(row['rel_score'])
            score = math.log(total_tf + 1.0) / (math.log(total_tf + 1.0) + math.log(doc_len))
            doc_scores[docid] = score
            if score > top_score:
                top_score = score
            is_rel = (rel_score >= 1)
            if score not in buckets:
                # Only the first document seen at a given exact score is recorded.
                buckets[score] = [0, 0, [docid, score, is_rel, len(rel_docs[qid])]]
            if is_rel:
                buckets[score][0] += 1
            buckets[score][1] += 1

        # Re-bucket exact scores into numbins equal-width bins over [0, top_score].
        interval = top_score * 1.0 / numbins
        bin_edges = [i for i in np.arange(0, top_score + 1e-10, interval)]
        bin_stats = [[0.0, 0.0, []] for _ in bin_edges]
        for score in buckets:
            idx = int(score / interval)
            bin_stats[idx][0] += buckets[score][0]
            bin_stats[idx][1] += buckets[score][1]
            bin_stats[idx][2].append(buckets[score][2])
        # (rel fraction, doc records) per bin; empty bins contribute (0, []).
        yaxis = [
            (stats[0] * 1.0 / stats[1], stats[2]) if stats[1] != 0 else (0, [])
            for stats in bin_stats
        ]
        yaxis.sort(key=itemgetter(0), reverse=True)

        # Emit up to 1000 docs; each bin gets a base 100 lower than the previous
        # so bin order dominates, with the raw score breaking ties inside a bin.
        # NOTE(review): source formatting was ambiguous — sbase is decremented
        # once per bin here; confirm it was not meant to be per document.
        sbase = 1e9
        for ratio, records in yaxis:
            for record in records:
                docid = record[0]
                if len(res[qid]) < 1000:
                    res[qid].append((docid, sbase + doc_scores[docid]))
            sbase -= 100
    method = 'hypothesis_stq_tf_ln_upperbound'
    self.output_results(res, method)
    self.eval(method)