def gen_ranking_list(self, method, _callback, paras): """ We get the statistics from /collection_path/detailed_doc_stats/ so that we can get everything for the top 10,000 documents for each query generated by Dirichlet language model method. """ single_queries = Query(self.collection_path).get_queries_of_length(1) queries = {ele['num']:ele['title'] for ele in single_queries} doc_details = GenSqaDocDetails(self.collection_path) cs = CollectionStats(self.collection_path) avdl = cs.get_avdl() total_terms = cs.get_total_terms() res = {} for qid in queries: print queries[qid] res[qid] = [] idx = 0 ctf = cs.get_term_collection_occur(queries[qid]) idf = cs.get_term_logidf1(queries[qid]) #for row in cs.get_qid_details(qid): for row in doc_details.get_qid_details(qid): docid = row['docid'] total_tf = float(row['total_tf']) doc_len = float(row['doc_len']) localpara = copy.deepcopy(paras) localpara.extend([total_tf, doc_len, avdl, ctf, total_terms, idf]) score = _callback(localpara) res[qid].append((docid, score)) idx += 1 if idx >= 1000: break self.output_results(res, method) self.eval(method)
def print_statistics(self, methods): single_queries = Query(self.collection_path).get_queries_of_length(1) queries = {ele['num']:ele['title'] for ele in single_queries} cs = CollectionStats(self.collection_path) performance = Performances(self.collection_path) res = performance.gen_optimal_performances_queries(methods, queries.keys()) avdl = cs.get_avdl() total_terms = cs.get_total_terms() collection_freq = [] for qid in queries: idx = 0 ctf = cs.get_term_collection_occur(queries[qid]) idf = cs.get_term_logidf1(queries[qid]) collection_freq.append( ctf*1.0/total_terms ) print avdl print np.mean(collection_freq) for ele in res: label = ele[0] p = ele[1] para = float(ele[2].split(':')[1]) print label if 'okapi' in label: print 'b:', para, 'beta:', 1.2*para/avdl, 'c2:', 1.2*(1-para) if 'pivoted' in label: print 's:', para, 'beta:', para/avdl, 'c2:', 1-para
def process(self, qid, method_name, method_paras, output_fn):
    """
    Learn the best parameter setting for one query and dump the result.

    @qid: the query id to process (single-term queries only)
    @method_name: name of the scoring method handed to self.learn
    @method_paras: comma-separated "key:value" pairs, must include "eta"
    @output_fn: path of the JSON file receiving {'map', 'para', 'eta'}
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    self.rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries([qid], 1, 'dict')
    collection_para = {
        'avdl': cs.get_avdl(),
        'total_terms': cs.get_total_terms(),
        'ctf': cs.get_term_collection_occur(queries[qid]),
    }
    # Bucket the query's documents by relevance:
    #   True -> relevant, False -> non-relevant
    data = {True: [], False: []}
    for row in cs.get_qid_details(qid):
        is_rel = int(row['rel_score']) >= 1
        data[is_rel].append({
            'docid': row['docid'],
            'tf': float(row['total_tf']),
            'ln': float(row['doc_len']),
        })
    method_para_dict = {}
    for pair in method_paras.split(','):
        fields = pair.split(':')
        method_para_dict[fields[0]] = fields[1]
    max_map, max_para = self.learn(qid, data, collection_para,
                                   method_name, method_para_dict)
    with open(output_fn, 'wb') as f:
        json.dump({'map': max_map, 'para': max_para,
                   'eta': method_para_dict['eta']}, f, indent=2)
def output_data_file(self):
    """
    Export per-query SVM data files plus an index file listing them.

    Each data file line has the form "<rel_score> qid:<qid> 1:<tf> 2:<doclen>",
    one line per document of the query's detailed stats.  The index file
    'svm_data_index_file' in the collection root lists one data-file path
    per single-term query.
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']: ele['title'] for ele in single_queries}
    index_path = os.path.join(self.collection_path, 'svm_data_index_file')
    with open(index_path, 'wb') as indexf:
        for qid in queries:
            data_fn = os.path.join(self.svm_data_root, qid)
            indexf.write('%s\n' % (data_fn))
            with open(data_fn, 'wb') as f:
                for row in cs.get_qid_details(qid):
                    f.write('%d qid:%s 1:%f 2:%f\n' % (
                        int(row['rel_score']),
                        qid,
                        float(row['total_tf']),
                        float(row['doc_len']),
                    ))
def gen_perfect_ranking_list(self, plotbins=True, numbins=60):
    """
    Build an "upper bound" ranking list by ordering score bins by their
    fraction of relevant documents, then output and evaluate it.

    Documents are scored with log(tf+1)/(log(tf+1)+log(dl)); scores are
    grouped into numbins equal-width bins, bins are sorted by their
    rel_docs/total_docs ratio, and documents are emitted bin by bin with
    artificial decreasing scores so the evaluation sees that order.

    @plotbins: whether to bin the raw scores (the non-binned path below is
        commented out, so False would leave yaxis undefined -- presumably
        always called with True; TODO confirm)
    @numbins: number of equal-width score bins
    """
    cs = CollectionStats(self.collection_path)
    single_queries = Query(self.collection_path).get_queries_of_length(1)
    queries = {ele['num']:ele['title'] for ele in single_queries}
    #print qids
    rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
    # process queries in increasing log-IDF order
    idfs = [(qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs]
    idfs.sort(key=itemgetter(1))
    res = {}
    for qid,idf in idfs:
        x_dict = {}
        res[qid] = []
        score_mapping = {}   # docid -> raw score, used for the final output scores
        maxScore = -99999999
        for row in cs.get_qid_details(qid):
            docid = row['docid']
            total_tf = float(row['total_tf'])
            doc_len = float(row['doc_len'])
            rel_score = int(row['rel_score'])
            # normalized tf score in (0, 1]; assumes doc_len > 1 so the
            # denominator is positive -- TODO confirm
            score = math.log(total_tf+1.0)/(math.log(total_tf+1.0)+math.log(doc_len))
            #score = total_tf/(total_tf + doc_len)
            score_mapping[docid] = score
            if score > maxScore:
                maxScore = score
            rel = (rel_score>=1)
            if score not in x_dict:
                # NOTE(review): only the FIRST doc with a given score is kept
                # in slot [2]; later docs sharing the score only bump the
                # counters -- confirm this is intended.
                x_dict[score] = [0, 0, [docid, score, rel, len(rel_docs[qid])]] # [rel_docs, total_docs]
            if rel:
                x_dict[score][0] += 1
            x_dict[score][1] += 1
        # xaxis = x_dict.keys()
        # xaxis.sort()
        # yaxis = [(x_dict[x][0]*1./x_dict[x][1], x_dict[x][2]) for x in xaxis]
        #
        if plotbins:
            # bin the scores into numbins equal-width intervals over [0, maxScore]
            interval = maxScore*1.0/numbins
            newxaxis = [i for i in np.arange(0, maxScore+1e-10, interval)]
            newyaxis = [[0.0, 0.0, []] for x in newxaxis]
            for x in x_dict:
                newx = int(x / interval)
                # print x_dict[x]
                newyaxis[newx][0] += x_dict[x][0]
                newyaxis[newx][1] += x_dict[x][1]
                newyaxis[newx][2].append( x_dict[x][2] )
                # print x, newx
            # print newxaxis
            # print newyaxis
            # raw_input()
            xaxis = newxaxis
            # per bin: (precision-like ratio rel/total, representative docs)
            yaxis = [(ele[0]*1.0/ele[1], ele[2]) if ele[1] != 0 else (0, []) for ele in newyaxis]
        # best bins (highest rel/total ratio) first
        yaxis.sort(key=itemgetter(0), reverse=True)
        #yaxis.sort(key=self._sort_by_map, reverse=True)
        # emit up to 1000 docs; sbase decreases per bin so bins stay ordered,
        # within a bin docs keep their raw-score order
        sbase = 1e9
        for ele in yaxis:
            for doc in ele[1]:
                docid = doc[0]
                if len(res[qid]) < 1000:
                    res[qid].append((docid, sbase+score_mapping[docid]))
            sbase -= 100
        #print len(res[qid])
    method = 'hypothesis_stq_tf_ln_upperbound'
    self.output_results(res, method)
    self.eval(method)
def plot_rel_prob(self, query_length, x_func, _method, plot_ratio=True,
        plot_total_or_avg=True, plot_rel_or_all=True,
        performance_as_legend=True, drawline=True, numbins=60, xlimit=0,
        ylimit=0, zoom_x=0, compact_x=False, curve_fitting=False,
        draw_individual=False, draw_all=True, oformat='eps'):
    """
    Plot P(D=1|TF=x).

    Input:
    @query_length: only plot the queries of this length, 0 for all queries.
    @x_func: how to get the x-axis of the figure. By default, this should
        be TF values. But we are flexible with other options, e.g. tf/dl
    @_method: which method is going to be plotted. The parameters should
        also be attached, e.g. dir,mu:2500
    @plot_ratio: when False, plot the y-axis as the number of relevant
        documents; when True, plot the y-axis as #rel_docs/#docs
    @plot_total_or_avg: when True, plot the y-axis as the collection total;
        when False, plot the collection average. Only available when
        plot_ratio is False; only applies collection-wise
    @plot_rel_or_all: when True, plot the y-axis as the number of relevant
        docs; when False, plot the number of all docs. Only available when
        plot_ratio is False; only applies collection-wise
    @performance_as_legend: whether to add performance (e.g. MAP) as part
        of the legend
    @drawline: draw the data points as a line (True) or dots (False)
    @numbins: number of bins if we choose to plot x points as bins,
        0 for no bins
    @xlimit: the limit of the x-axis; any value larger than this is not
        plotted. Default 0, meaning plot all data.
    @ylimit: the limit of the y-axis; any value larger than this is not
        plotted. Default 0, meaning plot all data.
    @zoom_x: the zoom start x point, 0 for no zoom.
        (NOTE(review): the original doc also mentions a @zoom flag which is
        not in the signature; zoom is derived as zoom_x > 0.)
    @compact_x: map the x values to continuous integers, e.g. 1,2,3,4,...
    @curve_fitting: whether to fit the curves with FittingModels
    @draw_individual: whether to draw one subplot per query
    @draw_all: whether to draw the collection-level figure
    @oformat: output format, eps or png

    Returns (collection_name, collection_x_dict-copy).
    """
    collection_name = self.collection_name
    cs = CollectionStats(self.collection_path)
    doc_details = GenDocDetails(self.collection_path)
    output_root = os.path.join('collection_figures', str(query_length))
    if not os.path.exists(os.path.join(self.all_results_root, output_root)):
        os.makedirs(os.path.join(self.all_results_root, output_root))
    if query_length == 0:
        queries = Query(self.collection_path).get_queries()
    else:
        queries = Query(self.collection_path).get_queries_of_length(query_length)
    queries = {ele['num']:ele['title'] for ele in queries}
    #print qids
    rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
    #print np.mean([len(rel_docs[qid]) for qid in rel_docs])
    eval_class = Evaluation(self.collection_path)
    print _method
    # per-query MAP of the method, used for legends and AP-diff stats
    p = eval_class.get_all_performance_of_some_queries(
        method=_method,
        qids=queries.keys(),
        return_all_metrics=False,
        metrics=['map']
    )
    collection_x_dict = {}          # x -> [rel_docs, total_docs] over the whole collection
    collection_level_maxX = 0.0     # largest x seen, used for collection-level binning
    # grid of per-query subplots: at most 4 columns
    num_cols = min(4, len(queries))
    num_rows = int(math.ceil(len(rel_docs)*1.0/num_cols))
    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, sharex=False, sharey=False, figsize=(2*num_cols, 2*num_rows))
    font = {'size' : 5}
    plt.rc('font', **font)
    row_idx = 0
    col_idx = 0
    #idfs = [(qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs]
    #idfs.sort(key=itemgetter(1))
    all_expected_maps = []
    if curve_fitting:
        # one result slot per fitting model: squared-error, AP and AP-diff lists
        all_fitting_results = [{'sr': [], 'ap':[], 'ap_diff':[]} for i in range(FittingModels().size())]
        all_fitting_performances = {}   # func name -> {qid: estimated MAP}
    for qid in sorted(queries):
        # pick the subplot for this query; axs shape depends on the grid size
        if num_rows > 1:
            ax = axs[row_idx][col_idx]
        else:
            if num_cols > 1:
                ax = axs[col_idx]
            else:
                ax = axs
        col_idx += 1
        if col_idx >= num_cols:
            row_idx += 1
            col_idx = 0
        query_term = queries[qid]
        maxTF = cs.get_term_maxTF(query_term)   # NOTE(review): unused below
        #idf = math.log(cs.get_term_IDF1(query_term))
        #legend = 'idf:%.2f'%idf
        if performance_as_legend:
            legend = '\nAP:%.4f' % (p[qid]['map'] if p[qid] else 0)
        x_dict = {}         # x -> [rel_docs, total_docs] for this query
        qid_docs_len = 0
        #for row in cs.get_qid_details(qid):
        for row in doc_details.get_qid_details(qid):
            qid_docs_len += 1
            x = x_func(cs, row)
            if x > collection_level_maxX:
                collection_level_maxX = x
            rel = (int(row['rel_score'])>=1)
            # accumulate per-query counts
            if x not in x_dict:
                x_dict[x] = [0, 0] # [rel_docs, total_docs]
            if rel:
                x_dict[x][0] += 1
            x_dict[x][1] += 1
            # accumulate collection-wide counts
            if x not in collection_x_dict:
                collection_x_dict[x] = [0, 0] # [rel_docs, total_docs]
            if rel:
                collection_x_dict[x][0] += 1
            collection_x_dict[x][1] += 1
        xaxis = x_dict.keys()
        xaxis.sort()
        if plot_ratio:
            yaxis = [x_dict[x][0]*1./x_dict[x][1] for x in xaxis]
        else:
            yaxis = [(x_dict[x][0]) if plot_rel_or_all else (x_dict[x][1]) for x in xaxis]
        ranking_list = [(x_dict[x][0], x_dict[x][1]) for x in xaxis]
        all_expected_maps.append(EMAP().cal_expected_map(ranking_list))
        if draw_individual:
            if np.sum(xaxis) == 0 or np.sum(yaxis) == 0:
                continue
            raw_xaxis = copy.deepcopy(xaxis)
            xaxis = np.array(xaxis, dtype=np.float32)
            yaxis = np.array(yaxis, dtype=np.float32)
            if compact_x:
                xaxis = range(1, len(xaxis)+1)
            if curve_fitting and not plot_ratio:
                # normalize y to a probability-like distribution for fitting
                sum_yaxis = np.sum(yaxis)
                yaxis /= sum_yaxis
            query_stat = cs.get_term_stats(query_term)   # NOTE(review): unused below
            zoom_xaxis = xaxis[zoom_x:]
            zoom_yaxis = yaxis[zoom_x:]
            ax, zoom_ax = self.plot_figure(ax, xaxis, yaxis, qid+'-'+query_term, legend,
                drawline=drawline, xlimit=xlimit, ylimit=ylimit,
                zoom=zoom_x > 0, zoom_xaxis=zoom_xaxis, zoom_yaxis=zoom_yaxis,
                legend_markscale=0.5)
            if curve_fitting:
                all_fittings = []
                fitting_xaxis = []
                fitting_yaxis = []
                for i, ele in enumerate(yaxis):
                    #if ele != 0:
                    fitting_xaxis.append(xaxis[i])
                    fitting_yaxis.append(ele)
                # try every fitting model (1-based ids)
                for j in range(1, FittingModels().size()+1):
                    fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                    if not fitting is None:
                        fitting_func_name = fitting[1]
                        all_fitting_results[j-1]['name'] = fitting_func_name
                        all_fitting_results[j-1]['sr'].append(fitting[4]) # sum of squared error
                        # NOTE(review): both branches below are identical;
                        # presumably the tf\d+ case was meant to differ --
                        # confirm. Also sum_yaxis is only bound when
                        # plot_ratio is False -- this path may raise
                        # NameError when plot_ratio is True; verify callers.
                        if re.search(r'^tf\d+$', _method):
                            estimated_map = CalEstMAP().cal_map(
                                rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                all_docs = [x_dict[x][1] for x in raw_xaxis],
                                mode=1
                            )
                        else:
                            estimated_map = CalEstMAP().cal_map(
                                rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                all_docs = [x_dict[x][1] for x in raw_xaxis],
                                mode=1
                            )
                        all_fitting_results[j-1]['ap'].append(estimated_map) # average precision
                        actual_map = p[qid]['map'] if p[qid] else 0
                        all_fitting_results[j-1]['ap_diff'].append(math.fabs(estimated_map-actual_map))
                        fitting.append(estimated_map)
                        fitting.append(math.fabs(estimated_map-actual_map))
                        all_fittings.append(fitting)
                        if fitting_func_name not in all_fitting_performances:
                            all_fitting_performances[fitting_func_name] = {}
                        all_fitting_performances[fitting_func_name][qid] = estimated_map
                        #print fitting[0], fitting[1], fitting[3]
                    else:
                        #print j, 'None'
                        pass
                # best fit = smallest sum of squared error
                all_fittings.sort(key=itemgetter(4))
                try:
                    print qid, query_term, all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                except:
                    # no model fitted this query; skip to the next qid
                    continue
                fitted_y = [0 for i in range(len(xaxis))]
                for x in xaxis:
                    if x in fitting_xaxis:
                        idx = fitting_xaxis.index(x)
                        fitted_y[idx] = all_fittings[0][3][idx]
                best_fit_func_name = all_fittings[0][1]
                # re-sort by AP difference for the legend of the fitted curve
                all_fittings.sort(key=itemgetter(-1))
                zoom_yaxis_fitting = fitted_y[zoom_x:]
                self.plot_figure(ax, xaxis, fitted_y, qid+'-'+query_term,
                    '%s\n%s(%.4f)' % (best_fit_func_name, all_fittings[0][1], all_fittings[0][-2]),
                    drawline=True, linestyle='--',
                    zoom=zoom_x > 0, zoom_ax = zoom_ax, zoom_xaxis=zoom_xaxis,
                    zoom_yaxis=zoom_yaxis_fitting, legend_pos='best',
                    xlimit=xlimit, ylimit=ylimit, legend_markscale=0.5)
    if draw_individual:
        # save the grid of per-query subplots
        output_fn = os.path.join(self.all_results_root, output_root,
            '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-individual.%s' % (
                collection_name, _method,
                'ratio' if plot_ratio else 'abscnt',
                'total' if plot_total_or_avg else 'avg',
                'rel' if plot_rel_or_all else 'all',
                'line' if drawline else 'dots',
                numbins, xlimit, ylimit,
                zoom_x,
                'compact' if compact_x else 'raw',
                'fit' if curve_fitting else 'plain',
                oformat)
        )
        plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
        if curve_fitting:
            # plot the goodness of fit
            all_fitting_results = [ele for ele in all_fitting_results if 'name' in ele and ele['name'] not in ['AD']]
            goodness_fit_data = [ele['sr'] for ele in all_fitting_results if 'sr' in ele]
            labels = [ele['name'] for ele in all_fitting_results if 'sr' in ele and 'name' in ele]
            fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
            font = {'size' : 8}
            plt.rc('font', **font)
            ax.boxplot(goodness_fit_data, labels=labels)
            output_fn = os.path.join(self.all_results_root, output_root,
                '%s-%s-fitting.%s' % (collection_name, _method, oformat)
            )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
            # plot the AP diff
            ap_diff_data = [ele['ap_diff'] for ele in all_fitting_results if 'ap_diff' in ele]
            labels = [ele['name'] for ele in all_fitting_results if 'ap_diff' in ele and 'name' in ele]
            fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
            font = {'size' : 8}
            plt.rc('font', **font)
            ax.boxplot(ap_diff_data, labels=labels)
            output_fn = os.path.join(self.all_results_root, output_root,
                '%s-%s-apdiff.%s' % (collection_name, _method, oformat)
            )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
    # draw the figure for the whole collection
    collection_vocablulary_stat = cs.get_vocabulary_stats()
    collection_vocablulary_stat_str = ''
    idx = 1
    for k,v in collection_vocablulary_stat.items():
        collection_vocablulary_stat_str += k+'='+'%.2f'%v+' '
        # NOTE(review): after the reset idx is immediately incremented, so
        # line breaks happen after 3 items the first time and every 2 items
        # afterwards -- confirm whether every-3 was intended.
        if idx == 3:
            collection_vocablulary_stat_str += '\n'
            idx = 1
        idx += 1
    fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
    font = {'size' : 8}
    plt.rc('font', **font)
    xaxis = collection_x_dict.keys()
    xaxis.sort()
    if plot_ratio:
        yaxis = [collection_x_dict[x][0]*1./collection_x_dict[x][1] for x in xaxis]
    else:
        if plot_total_or_avg:
            yaxis = [(collection_x_dict[x][0]) if plot_rel_or_all else (collection_x_dict[x][1]) for x in xaxis]
        else:
            yaxis = [(collection_x_dict[x][0]/len(queries)) if plot_rel_or_all else (collection_x_dict[x][1]/len(queries)) for x in xaxis]
    #print np.sum(yaxis[20:]), np.sum(yaxis[20:])
    if numbins > 0:
        # re-bin the collection-level curve into numbins equal-width bins
        interval = collection_level_maxX*1.0/numbins
        newxaxis = [i for i in np.arange(0, collection_level_maxX+1e-10, interval)]
        newyaxis = [[0.0, 0.0] for x in newxaxis]
        for x in xaxis:
            newx = int(x / interval)
            newyaxis[newx][0] += collection_x_dict[x][0]
            newyaxis[newx][1] += collection_x_dict[x][1]
        xaxis = newxaxis
        if plot_ratio:
            yaxis = [ele[0]/ele[1] if ele[1] != 0 else 0.0 for ele in newyaxis]
        else:
            if plot_total_or_avg:
                yaxis = [(ele[0]) if plot_rel_or_all else (ele[1]) for ele in newyaxis]
            else:
                yaxis = [(ele[0]/len(queries)) if plot_rel_or_all else (ele[1]/len(queries)) for ele in newyaxis]
    # we do not care about the actual values of x
    # so we just map the actual values to integer values
    return_data = copy.deepcopy(collection_x_dict)
    if curve_fitting:
        #### calculate the correlation between actual and estimated MAPs
        for fitting_func_name in all_fitting_performances:
            actual_maps = [p[qid]['map'] if p[qid] else 0 for qid in queries]
            estimated_maps = [all_fitting_performances[fitting_func_name][qid]
                if qid in all_fitting_performances[fitting_func_name] else 0
                for qid in queries]
            print fitting_func_name,
            print scipy.stats.pearsonr(actual_maps, estimated_maps),
            print scipy.stats.kendalltau(actual_maps, estimated_maps)
        print '-'*30
    if draw_all:
        if compact_x:
            xaxis = range(1, len(xaxis)+1)
        xaxis = np.array(xaxis, dtype=np.float32)
        yaxis = np.array(yaxis, dtype=np.float32)
        if curve_fitting and not plot_ratio:
            yaxis /= np.sum(yaxis)
        collection_legend = ''
        if performance_as_legend:
            collection_legend = '$MAP:%.4f$' % (np.mean([p[qid]['map'] if p[qid] else 0 for qid in queries]))
            #collection_legend += '\n$MAP_E:%.4f$' % (np.mean(all_expected_maps))
        zoom_xaxis = xaxis[zoom_x:]
        zoom_yaxis = yaxis[zoom_x:]
        axs, zoom_axs = self.plot_figure(axs, xaxis, yaxis, collection_name, collection_legend,
            drawline=drawline, xlimit=xlimit, ylimit=ylimit,
            zoom=zoom_x > 0, zoom_xaxis=zoom_xaxis, zoom_yaxis=zoom_yaxis)
        if curve_fitting:
            all_fittings = []
            fitting_xaxis = []
            fitting_yaxis = []
            # unlike the per-query fit above, zeros are excluded here
            for i, ele in enumerate(yaxis):
                if ele != 0:
                    fitting_xaxis.append(xaxis[i])
                    fitting_yaxis.append(ele)
            for j in range(1, FittingModels().size()+1):
                fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                if not fitting is None:
                    all_fittings.append(fitting)
                    #print fitting[0], fitting[1], fitting[3]
                else:
                    #print j, 'None'
                    pass
            all_fittings.sort(key=itemgetter(4))
            if all_fittings:
                print all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                fitted_y = [0 for i in range(len(xaxis))]
                for x in xaxis:
                    if x in fitting_xaxis:
                        idx = fitting_xaxis.index(x)
                        fitted_y[idx] = all_fittings[0][3][idx]
                zoom_yaxis_fitting = fitted_y[zoom_x:]
                self.plot_figure(axs, xaxis, fitted_y, collection_name, all_fittings[0][1],
                    drawline=True, linestyle='--',
                    zoom=zoom_x > 0, zoom_ax = zoom_axs, zoom_xaxis=zoom_xaxis,
                    zoom_yaxis=zoom_yaxis_fitting, legend_pos='best',
                    xlimit=xlimit, ylimit=ylimit)
        output_fn = os.path.join(self.all_results_root, output_root,
            '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-all.%s' % (
                collection_name, _method,
                'ratio' if plot_ratio else 'abscnt',
                'total' if plot_total_or_avg else 'avg',
                'rel' if plot_rel_or_all else 'all',
                'line' if drawline else 'dots',
                numbins, xlimit, ylimit,
                zoom_x,
                'compact' if compact_x else 'raw',
                'fit' if curve_fitting else 'plain',
                oformat)
        )
        plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
    return collection_name, return_data