Example #1
0
 def get_docs_tf(self):
     """
     Collect, for every query of the collection, the documents listed in
     the per-query detail rows together with their total term frequency.

     Query ids come from Query(self.collection_path).get_queries() and the
     rows from GenDocDetails(self.collection_path).get_qid_details(qid).
     The original author's note says these statistics live under
     /collection_path/detailed_doc_stats/ and cover the top 10,000
     documents per query as ranked by the Dirichlet language model; that
     cannot be confirmed from this method alone.

     Returns:
         dict mapping each query id to a list of [docid, tf] pairs, where
         tf is float(row['total_tf']). Each list is sorted by tf and then
         docid, both descending. A query whose details cannot be read
         (IOError) keeps whatever rows were read before the failure,
         possibly none.
     """
     all_queries = Query(self.collection_path).get_queries()
     qids = set(ele['num'] for ele in all_queries)
     doc_details = GenDocDetails(self.collection_path)
     res = {}
     for qid in qids:
         docs = res[qid] = []
         try:
             for row in doc_details.get_qid_details(qid):
                 docs.append([row['docid'], float(row['total_tf'])])
         except IOError:
             # Best effort: missing or unreadable details for one query
             # must not abort the whole collection.
             pass
         # NOTE(review): no cap on the number of rows is applied. An earlier
         # revision counted rows against 1000 but never acted on the count;
         # confirm whether a truncation was intended.
         # Sorting happens outside the try so partially read queries are
         # ordered the same way as complete ones.
         docs.sort(key=itemgetter(1, 0), reverse=True)
     return res
    def plot_rel_prob(self, query_length, x_func, _method, plot_ratio=True, 
            plot_total_or_avg=True, plot_rel_or_all=True, 
            performance_as_legend=True, drawline=True, numbins=60, xlimit=0, 
            ylimit=0, zoom_x=0, compact_x=False, curve_fitting=False, 
            draw_individual=False, draw_all=True, oformat='eps'):
        """
        plot the P(D=1|TF=x)

        Input:
        @query_length: only plot the queries of length, 0 for all queries.
        @x_func: how to get the x-axis of the figure. By default, this should 
            be TF values. But we are flexible with other options, e.g. tf/dl
        @_method: Which method is going to be plot. The parameters should also be 
            attached, e.g. dir,mu:2500
        @plot_ratio: When this is false, plot the y-axis as the number of relevant 
            documents; When this is true, plot the y-axis as the #rel_docs/#docs
        @plot_total_or_avg: When this is true, plot the y-axis as the collection 
            total ; When this is false, plot the collection average. 
            Only available when plot_ratio is false is only available for collection-wise
        @plot_rel_or_all: When this is true, plot the y-axis as the number of 
            relevant docs ; When this is false, plot the number of all docs. 
            Only available when plot_ratio is false is only available for collection-wise
        @performance_as_legend: whether to add performance(e.g. MAP) 
            as part of the legend
        @drawline: draw the data points as line(true) or dots(false)
        @numbins: the number of bins if we choose to plot x points as bins, 0 for no bins
        @xlimit: the limit of xaxis, any value larger than this value would not 
            be plotted. default 0, meaning plot all data.
        @ylimit: the limit of yaxis, any value larger than this value would not 
            be plotted. default 0, meaning plot all data.
        @zoom: whether zoom part of the plot
        @zoom_x: the zoom start x point, 0 for no zoom.
        @compact_x: map the x to continuous integers, e.g. 1,2,3,4,....
        @oformat: output format, eps or png
        """
        collection_name = self.collection_name
        cs = CollectionStats(self.collection_path)
        doc_details = GenDocDetails(self.collection_path)
        output_root = os.path.join('collection_figures', str(query_length))
        if not os.path.exists(os.path.join(self.all_results_root, output_root)):
            os.makedirs(os.path.join(self.all_results_root, output_root))
        if query_length == 0:
            queries = Query(self.collection_path).get_queries()
        else:
            queries = Query(self.collection_path).get_queries_of_length(query_length)
        queries = {ele['num']:ele['title'] for ele in queries}
        #print qids
        rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
        #print np.mean([len(rel_docs[qid]) for qid in rel_docs])
        eval_class = Evaluation(self.collection_path)
        print _method
        p = eval_class.get_all_performance_of_some_queries(
            method=_method,
            qids=queries.keys(), 
            return_all_metrics=False, 
            metrics=['map']
        )
        collection_x_dict = {}
        collection_level_maxX = 0.0
        num_cols = min(4, len(queries))
        num_rows = int(math.ceil(len(rel_docs)*1.0/num_cols))
        fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, sharex=False, sharey=False, figsize=(2*num_cols, 2*num_rows))
        font = {'size' : 5}
        plt.rc('font', **font)
        row_idx = 0
        col_idx = 0
        #idfs = [(qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs]
        #idfs.sort(key=itemgetter(1))
        all_expected_maps = []
        if curve_fitting:
            all_fitting_results = [{'sr': [], 'ap':[], 'ap_diff':[]} for i in range(FittingModels().size())]
            all_fitting_performances = {}
        for qid in sorted(queries):
            if num_rows > 1:
                ax = axs[row_idx][col_idx]
            else:
                if num_cols > 1:
                    ax = axs[col_idx]
                else:
                    ax = axs
            col_idx += 1
            if col_idx >= num_cols:
                row_idx += 1
                col_idx = 0
            query_term = queries[qid]
            maxTF = cs.get_term_maxTF(query_term)
            #idf = math.log(cs.get_term_IDF1(query_term))
            #legend = 'idf:%.2f'%idf
            if performance_as_legend:
                legend = '\nAP:%.4f' % (p[qid]['map'] if p[qid] else 0)
            x_dict = {}
            qid_docs_len = 0
            #for row in cs.get_qid_details(qid):
            for row in doc_details.get_qid_details(qid):
                qid_docs_len += 1
                x = x_func(cs, row)
                if x > collection_level_maxX:
                    collection_level_maxX = x
                rel = (int(row['rel_score'])>=1)
                if x not in x_dict:
                    x_dict[x] = [0, 0] # [rel_docs, total_docs]
                if rel:
                    x_dict[x][0] += 1
                x_dict[x][1] += 1
                if x not in collection_x_dict:
                    collection_x_dict[x] = [0, 0] # [rel_docs, total_docs]
                if rel:
                    collection_x_dict[x][0] += 1
                collection_x_dict[x][1] += 1
            xaxis = x_dict.keys()
            xaxis.sort()
            if plot_ratio:
                yaxis = [x_dict[x][0]*1./x_dict[x][1] for x in xaxis]
            else:
                yaxis = [(x_dict[x][0]) if plot_rel_or_all else (x_dict[x][1]) for x in xaxis]
            ranking_list = [(x_dict[x][0], x_dict[x][1]) for x in xaxis]
            all_expected_maps.append(EMAP().cal_expected_map(ranking_list))

            if draw_individual:
                if np.sum(xaxis) == 0 or np.sum(yaxis) == 0:
                    continue
                raw_xaxis = copy.deepcopy(xaxis)
                xaxis = np.array(xaxis, dtype=np.float32)
                yaxis = np.array(yaxis, dtype=np.float32)
                if compact_x:
                    xaxis = range(1, len(xaxis)+1)
                if curve_fitting and not plot_ratio:
                    sum_yaxis = np.sum(yaxis)
                    yaxis /= sum_yaxis
                query_stat = cs.get_term_stats(query_term)
                zoom_xaxis = xaxis[zoom_x:]
                zoom_yaxis = yaxis[zoom_x:]
                ax, zoom_ax = self.plot_figure(ax, xaxis, yaxis, qid+'-'+query_term, legend,
                    drawline=drawline,
                    xlimit=xlimit,
                    ylimit=ylimit,
                    zoom=zoom_x > 0,
                    zoom_xaxis=zoom_xaxis,
                    zoom_yaxis=zoom_yaxis,
                    legend_markscale=0.5)
                if curve_fitting:
                    all_fittings = []
                    fitting_xaxis = []
                    fitting_yaxis = []
                    for i, ele in enumerate(yaxis):
                    #if ele != 0:
                        fitting_xaxis.append(xaxis[i])
                        fitting_yaxis.append(ele)
                    for j in range(1, FittingModels().size()+1):
                        fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                        if not fitting is None:
                            fitting_func_name = fitting[1]
                            all_fitting_results[j-1]['name'] = fitting_func_name
                            all_fitting_results[j-1]['sr'].append(fitting[4]) # sum of squared error
                            if re.search(r'^tf\d+$', _method):
                                estimated_map = CalEstMAP().cal_map(
                                    rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                    all_docs = [x_dict[x][1] for x in raw_xaxis],
                                    mode=1
                                )
                            else:
                                estimated_map = CalEstMAP().cal_map(
                                    rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                    all_docs = [x_dict[x][1] for x in raw_xaxis],
                                    mode=1
                                )
                            all_fitting_results[j-1]['ap'].append(estimated_map) # average precision
                            actual_map = p[qid]['map'] if p[qid] else 0
                            all_fitting_results[j-1]['ap_diff'].append(math.fabs(estimated_map-actual_map))    
                            fitting.append(estimated_map)
                            fitting.append(math.fabs(estimated_map-actual_map))
                            all_fittings.append(fitting)
                            if fitting_func_name not in all_fitting_performances:
                                all_fitting_performances[fitting_func_name] = {}
                            all_fitting_performances[fitting_func_name][qid] = estimated_map
                            #print fitting[0], fitting[1], fitting[3]
                        else:
                            #print j, 'None'
                            pass
                    all_fittings.sort(key=itemgetter(4))
                    try:
                        print qid, query_term, all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                    except:
                        continue
                    fitted_y = [0 for i in range(len(xaxis))]
                    for x in xaxis:
                        if x in fitting_xaxis:
                            idx = fitting_xaxis.index(x)
                            fitted_y[idx] = all_fittings[0][3][idx]
                    best_fit_func_name = all_fittings[0][1]
                    all_fittings.sort(key=itemgetter(-1))
                    zoom_yaxis_fitting = fitted_y[zoom_x:]
                    self.plot_figure(ax, xaxis, fitted_y, qid+'-'+query_term, 
                        '%s\n%s(%.4f)' % (best_fit_func_name, all_fittings[0][1], all_fittings[0][-2]), 
                        drawline=True, 
                        linestyle='--',
                        zoom=zoom_x > 0,
                        zoom_ax = zoom_ax,
                        zoom_xaxis=zoom_xaxis,
                        zoom_yaxis=zoom_yaxis_fitting,
                        legend_pos='best',
                        xlimit=xlimit,
                        ylimit=ylimit,
                        legend_markscale=0.5)
        if draw_individual:
            output_fn = os.path.join(self.all_results_root, output_root, 
                '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-individual.%s' % (
                    collection_name, 
                    _method, 
                    'ratio' if plot_ratio else 'abscnt', 
                    'total' if plot_total_or_avg else 'avg',
                    'rel' if plot_rel_or_all else 'all',
                    'line' if drawline else 'dots', 
                    numbins, 
                    xlimit,
                    ylimit,
                    zoom_x, 
                    'compact' if compact_x else 'raw',
                    'fit' if curve_fitting else 'plain',
                    oformat) )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

            if curve_fitting:
                # plot the goodness of fit
                all_fitting_results = [ele for ele in all_fitting_results if 'name' in ele and ele['name'] not in ['AD']]
                goodness_fit_data = [ele['sr'] for ele in all_fitting_results if 'sr' in ele]
                labels = [ele['name'] for ele in all_fitting_results if 'sr' in ele and 'name' in ele]
                fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
                font = {'size' : 8}
                plt.rc('font', **font)
                ax.boxplot(goodness_fit_data, labels=labels)
                output_fn = os.path.join(self.all_results_root, output_root, 
                    '%s-%s-fitting.%s' % (collection_name, _method, oformat) )
                plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
                # plot the AP diff
                ap_diff_data = [ele['ap_diff'] for ele in all_fitting_results if 'ap_diff' in ele]
                labels = [ele['name'] for ele in all_fitting_results if 'ap_diff' in ele and 'name' in ele]
                fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
                font = {'size' : 8}
                plt.rc('font', **font)
                ax.boxplot(ap_diff_data, labels=labels)
                output_fn = os.path.join(self.all_results_root, output_root, 
                    '%s-%s-apdiff.%s' % (collection_name, _method, oformat) )
                plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

        # draw the figure for the whole collection
        collection_vocablulary_stat = cs.get_vocabulary_stats()
        collection_vocablulary_stat_str = ''
        idx = 1
        for k,v in collection_vocablulary_stat.items():
            collection_vocablulary_stat_str += k+'='+'%.2f'%v+' '
            if idx == 3:
                collection_vocablulary_stat_str += '\n'
                idx = 1
            idx += 1

        fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
        font = {'size' : 8}
        plt.rc('font', **font)
        xaxis = collection_x_dict.keys()
        xaxis.sort()
        if plot_ratio:
            yaxis = [collection_x_dict[x][0]*1./collection_x_dict[x][1] for x in xaxis]
        else:
            if plot_total_or_avg:
                yaxis = [(collection_x_dict[x][0]) if plot_rel_or_all else (collection_x_dict[x][1]) for x in xaxis] 
            else:
                yaxis = [(collection_x_dict[x][0]/len(queries)) if plot_rel_or_all else (collection_x_dict[x][1]/len(queries)) for x in xaxis]
            #print np.sum(yaxis[20:]), np.sum(yaxis[20:])
        if numbins > 0:
            interval = collection_level_maxX*1.0/numbins
            newxaxis = [i for i in np.arange(0, collection_level_maxX+1e-10, interval)]
            newyaxis = [[0.0, 0.0] for x in newxaxis]
            for x in xaxis:
                newx = int(x / interval)
                newyaxis[newx][0] += collection_x_dict[x][0]
                newyaxis[newx][1] += collection_x_dict[x][1]
            xaxis = newxaxis
            if plot_ratio:
                yaxis = [ele[0]/ele[1] if ele[1] != 0 else 0.0 for ele in newyaxis]
            else:
                if plot_total_or_avg:
                    yaxis = [(ele[0]) if plot_rel_or_all else (ele[1]) for ele in newyaxis] 
                else:
                    yaxis = [(ele[0]/len(queries)) if plot_rel_or_all else (ele[1]/len(queries)) for ele in newyaxis]

        # we do not care about the actual values of x
        # so we just map the actual values to integer values
        return_data = copy.deepcopy(collection_x_dict)

        if curve_fitting:
            #### calculate the stats
            for fitting_func_name in all_fitting_performances:
                actual_maps = [p[qid]['map'] if p[qid] else 0 for qid in queries]
                estimated_maps = [all_fitting_performances[fitting_func_name][qid] if qid in all_fitting_performances[fitting_func_name] else 0 for qid in queries]
                print fitting_func_name, 
                print scipy.stats.pearsonr(actual_maps, estimated_maps),
                print scipy.stats.kendalltau(actual_maps, estimated_maps)
                print '-'*30

        if draw_all:
            if compact_x:
                xaxis = range(1, len(xaxis)+1)

            xaxis = np.array(xaxis, dtype=np.float32)
            yaxis = np.array(yaxis, dtype=np.float32)
            if curve_fitting and not plot_ratio:
                yaxis /= np.sum(yaxis)

            collection_legend = ''
            if performance_as_legend:
                collection_legend = '$MAP:%.4f$' % (np.mean([p[qid]['map'] if p[qid] else 0 for qid in queries]))
                #collection_legend += '\n$MAP_E:%.4f$' % (np.mean(all_expected_maps))

            zoom_xaxis = xaxis[zoom_x:]
            zoom_yaxis = yaxis[zoom_x:]
            axs, zoom_axs = self.plot_figure(axs, xaxis, yaxis, collection_name, collection_legend, 
                drawline=drawline,
                xlimit=xlimit,
                ylimit=ylimit,
                zoom=zoom_x > 0,
                zoom_xaxis=zoom_xaxis,
                zoom_yaxis=zoom_yaxis)

            if curve_fitting:
                all_fittings = []
                fitting_xaxis = []
                fitting_yaxis = []
                for i, ele in enumerate(yaxis):
                    if ele != 0:
                        fitting_xaxis.append(xaxis[i])
                        fitting_yaxis.append(ele)
                for j in range(1, FittingModels().size()+1):
                    fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                    if not fitting is None:
                        all_fittings.append(fitting)
                        #print fitting[0], fitting[1], fitting[3]
                    else:
                        #print j, 'None'
                        pass
                all_fittings.sort(key=itemgetter(4))
                if all_fittings:
                    print all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                    fitted_y = [0 for i in range(len(xaxis))]
                    for x in xaxis:
                        if x in fitting_xaxis:
                            idx = fitting_xaxis.index(x)
                            fitted_y[idx] = all_fittings[0][3][idx]

                    zoom_yaxis_fitting = fitted_y[zoom_x:]
                    self.plot_figure(axs, xaxis, fitted_y, collection_name, all_fittings[0][1], 
                        drawline=True, 
                        linestyle='--',
                        zoom=zoom_x > 0,
                        zoom_ax = zoom_axs,
                        zoom_xaxis=zoom_xaxis,
                        zoom_yaxis=zoom_yaxis_fitting,
                        legend_pos='best',
                        xlimit=xlimit,
                        ylimit=ylimit)

            output_fn = os.path.join(self.all_results_root, output_root, 
                '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-all.%s' % (
                    collection_name, 
                    _method, 
                    'ratio' if plot_ratio else 'abscnt', 
                    'total' if plot_total_or_avg else 'avg',
                    'rel' if plot_rel_or_all else 'all',
                    'line' if drawline else 'dots', 
                    numbins, 
                    xlimit,
                    ylimit,
                    zoom_x, 
                    'compact' if compact_x else 'raw',
                    'fit' if curve_fitting else 'plain',
                    oformat) )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

        return collection_name, return_data