def gen_ranking_list(self, method, _callback, paras):
     """
     We get the statistics from /collection_path/detailed_doc_stats/ 
     so that we can get everything for the top 10,000 documents for 
     each query generated by Dirichlet language model method.
     """
     single_queries = Query(self.collection_path).get_queries_of_length(1)
     queries = {ele['num']:ele['title'] for ele in single_queries}
     doc_details = GenSqaDocDetails(self.collection_path)
     cs = CollectionStats(self.collection_path)
     avdl = cs.get_avdl()
     total_terms = cs.get_total_terms()
     res = {}
     for qid in queries:
         print queries[qid]
         res[qid] = []
         idx = 0
         ctf = cs.get_term_collection_occur(queries[qid])
         idf = cs.get_term_logidf1(queries[qid])
         #for row in cs.get_qid_details(qid):
         for row in doc_details.get_qid_details(qid):
             docid = row['docid']
             total_tf = float(row['total_tf'])
             doc_len = float(row['doc_len'])
             localpara = copy.deepcopy(paras)
             localpara.extend([total_tf, doc_len, avdl, ctf, total_terms, idf])
             score = _callback(localpara)
             res[qid].append((docid, score))
             idx += 1
             if idx >= 1000:
                 break
     self.output_results(res, method)
     self.eval(method)
Example #2
0
    def print_statistics(self, methods):
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']:ele['title'] for ele in single_queries}
        cs = CollectionStats(self.collection_path)
        performance = Performances(self.collection_path)
        res = performance.gen_optimal_performances_queries(methods, queries.keys())

        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        collection_freq = []
        for qid in queries:
            idx = 0
            ctf = cs.get_term_collection_occur(queries[qid])
            idf = cs.get_term_logidf1(queries[qid])
            collection_freq.append( ctf*1.0/total_terms )
        print avdl
        print np.mean(collection_freq)

        for ele in res:
            label = ele[0]
            p = ele[1]
            para = float(ele[2].split(':')[1])
            print label
            if 'okapi' in label:
                print 'b:', para, 'beta:', 1.2*para/avdl, 'c2:', 1.2*(1-para)
            if 'pivoted' in label:
                print 's:', para, 'beta:', para/avdl, 'c2:', 1-para
    def process(self, qid, method_name, method_paras, output_fn):
        """
        Learn the best parameter for one single-term query and dump the
        outcome as JSON.

        @qid: query id; it must be the id of a query of length 1, otherwise
            the term lookup raises KeyError.
        @method_name: forwarded to self.learn.
        @method_paras: comma separated "name:value" pairs, e.g. "eta:0.1".
            An 'eta' entry is required because it is copied to the output.
        @output_fn: path of the JSON file to write; it receives the keys
            'map', 'para' and 'eta'.

        Side effect: self.rel_docs is replaced by the relevant documents of
        this query.
        """
        stats = CollectionStats(self.collection_path)
        term_by_qid = {}
        for query in Query(self.collection_path).get_queries_of_length(1):
            term_by_qid[query['num']] = query['title']
        judgment = Judgment(self.collection_path)
        self.rel_docs = judgment.get_relevant_docs_of_some_queries([qid], 1, 'dict')

        collection_para = {}
        collection_para['avdl'] = stats.get_avdl()
        collection_para['total_terms'] = stats.get_total_terms()
        collection_para['ctf'] = stats.get_term_collection_occur(term_by_qid[qid])

        # Split the documents of the query by relevance (rel_score >= 1).
        relevant = []
        non_relevant = []
        for row in stats.get_qid_details(qid):
            doc = {
                'docid': row['docid'],
                'tf': float(row['total_tf']),
                'ln': float(row['doc_len'])
            }
            if int(row['rel_score']) >= 1:
                relevant.append(doc)
            else:
                non_relevant.append(doc)
        data = {True: relevant, False: non_relevant}

        method_para_dict = {}
        for item in method_paras.split(','):
            fields = item.split(':')
            method_para_dict[fields[0]] = fields[1]

        max_map, max_para = self.learn(qid, data, collection_para, method_name, method_para_dict)
        with open(output_fn, 'wb') as out:
            json.dump({'map':max_map, 'para':max_para, 'eta':method_para_dict['eta']}, out, indent=2)
Example #4
0
 def output_data_file(self):
     """
     Write one feature file per single-term query plus an index file.

     The index file <collection_path>/svm_data_index_file lists the path
     of every per-query file, one per line.  Each per-query file lives at
     <svm_data_root>/<qid> and holds one line per document in the form
     "<rel_score> qid:<qid> 1:<total_tf> 2:<doc_len>".
     """
     stats = CollectionStats(self.collection_path)
     single_queries = Query(self.collection_path).get_queries_of_length(1)
     term_by_qid = dict((query['num'], query['title']) for query in single_queries)
     index_path = os.path.join(self.collection_path, 'svm_data_index_file')
     with open(index_path, 'wb') as index_file:
         for qid in term_by_qid:
             data_path = os.path.join(self.svm_data_root, qid)
             index_file.write('%s\n' % (data_path))
             with open(data_path, 'wb') as data_file:
                 for row in stats.get_qid_details(qid):
                     # docid is read (as in the original) but not written.
                     docid, tf, dl, label = (
                         row['docid'],
                         float(row['total_tf']),
                         float(row['doc_len']),
                         int(row['rel_score']),
                     )
                     data_file.write('%d qid:%s 1:%f 2:%f\n' % (label, qid, tf, dl))
    def gen_perfect_ranking_list(self, plotbins=True, numbins=60):
        """
        Build, write and evaluate the 'hypothesis_stq_tf_ln_upperbound' run.

        For every judged single-term query each document gets the score
        log(tf+1) / (log(tf+1) + log(doc_len)).  Scores are grouped into
        bins of width max_score/numbins, the bins are ordered by their
        share of relevant documents (highest first) and the documents are
        listed bin by bin, at most 1000 per query.

        @plotbins: accepted for compatibility; not used by this method.
        @numbins: number of bins the score range is divided into.

        NOTE(review): only the first document seen for each distinct score
        is remembered, so documents sharing a score with an earlier one do
        not appear in the ranking -- confirm this is intended.
        """
        stats = CollectionStats(self.collection_path)
        term_by_qid = {}
        for query in Query(self.collection_path).get_queries_of_length(1):
            term_by_qid[query['num']] = query['title']
        judged = Judgment(self.collection_path).get_relevant_docs_of_some_queries(
            term_by_qid.keys(), 1, 'dict')

        # Queries are visited in ascending order of log(IDF) of their term.
        qid_order = sorted(
            [(qid, math.log(stats.get_term_IDF1(term_by_qid[qid]))) for qid in judged],
            key=itemgetter(1))

        rankings = {}
        for qid, _ in qid_order:
            per_score = {}  # score -> [rel_docs, total_docs, first doc seen]
            ranked = rankings[qid] = []
            score_of = {}
            top_score = -99999999
            for row in stats.get_qid_details(qid):
                docid = row['docid']
                tf = float(row['total_tf'])
                dl = float(row['doc_len'])
                is_rel = int(row['rel_score']) >= 1
                log_tf = math.log(tf + 1.0)
                score = log_tf / (log_tf + math.log(dl))
                score_of[docid] = score
                if top_score < score:
                    top_score = score
                if score not in per_score:
                    per_score[score] = [0, 0, [docid, score, is_rel, len(judged[qid])]]
                entry = per_score[score]
                if is_rel:
                    entry[0] += 1
                entry[1] += 1

            # Collapse the distinct scores into equally wide bins.
            bin_width = top_score*1.0/numbins
            bins = [[0.0, 0.0, []] for _ in np.arange(0, top_score+1e-10, bin_width)]
            for value, entry in per_score.items():
                target = bins[int(value / bin_width)]
                target[0] += entry[0]
                target[1] += entry[1]
                target[2].append(entry[2])

            # Bins with the highest ratio of relevant documents come first;
            # the sort is stable, so ties keep their bin order.
            ordered_bins = sorted(
                [(b[0]*1.0/b[1], b[2]) if b[1] != 0 else (0, []) for b in bins],
                key=itemgetter(0), reverse=True)

            # A large, steadily decreasing base keeps the bin order intact
            # once the real score is added.  The base drops for every
            # document, even after the 1000-document cap is reached.
            sbase = 1e9
            for _, docs in ordered_bins:
                for doc in docs:
                    docid = doc[0]
                    if len(ranked) < 1000:
                        ranked.append((docid, sbase+score_of[docid]))
                    sbase -= 100

        method = 'hypothesis_stq_tf_ln_upperbound'
        self.output_results(rankings, method)
        self.eval(method)
    def plot_rel_prob(self, query_length, x_func, _method, plot_ratio=True, 
            plot_total_or_avg=True, plot_rel_or_all=True, 
            performance_as_legend=True, drawline=True, numbins=60, xlimit=0, 
            ylimit=0, zoom_x=0, compact_x=False, curve_fitting=False, 
            draw_individual=False, draw_all=True, oformat='eps'):
        """
        plot the P(D=1|TF=x)

        Input:
        @query_length: only plot the queries of length, 0 for all queries.
        @x_func: how to get the x-axis of the figure. By default, this should 
            be TF values. But we are flexible with other options, e.g. tf/dl
        @_method: Which method is going to be plot. The parameters should also be 
            attached, e.g. dir,mu:2500
        @plot_ratio: When this is false, plot the y-axis as the number of relevant 
            documents; When this is true, plot the y-axis as the #rel_docs/#docs
        @plot_total_or_avg: When this is true, plot the y-axis as the collection 
            total ; When this is false, plot the collection average. 
            Only available when plot_ratio is false is only available for collection-wise
        @plot_rel_or_all: When this is true, plot the y-axis as the number of 
            relevant docs ; When this is false, plot the number of all docs. 
            Only available when plot_ratio is false is only available for collection-wise
        @performance_as_legend: whether to add performance(e.g. MAP) 
            as part of the legend
        @drawline: draw the data points as line(true) or dots(false)
        @numbins: the number of bins if we choose to plot x points as bins, 0 for no bins
        @xlimit: the limit of xaxis, any value larger than this value would not 
            be plotted. default 0, meaning plot all data.
        @ylimit: the limit of yaxis, any value larger than this value would not 
            be plotted. default 0, meaning plot all data.
        @zoom: whether zoom part of the plot
        @zoom_x: the zoom start x point, 0 for no zoom.
        @compact_x: map the x to continuous integers, e.g. 1,2,3,4,....
        @oformat: output format, eps or png
        """
        collection_name = self.collection_name
        cs = CollectionStats(self.collection_path)
        doc_details = GenDocDetails(self.collection_path)
        output_root = os.path.join('collection_figures', str(query_length))
        if not os.path.exists(os.path.join(self.all_results_root, output_root)):
            os.makedirs(os.path.join(self.all_results_root, output_root))
        if query_length == 0:
            queries = Query(self.collection_path).get_queries()
        else:
            queries = Query(self.collection_path).get_queries_of_length(query_length)
        queries = {ele['num']:ele['title'] for ele in queries}
        #print qids
        rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
        #print np.mean([len(rel_docs[qid]) for qid in rel_docs])
        eval_class = Evaluation(self.collection_path)
        print _method
        p = eval_class.get_all_performance_of_some_queries(
            method=_method,
            qids=queries.keys(), 
            return_all_metrics=False, 
            metrics=['map']
        )
        collection_x_dict = {}
        collection_level_maxX = 0.0
        num_cols = min(4, len(queries))
        num_rows = int(math.ceil(len(rel_docs)*1.0/num_cols))
        fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, sharex=False, sharey=False, figsize=(2*num_cols, 2*num_rows))
        font = {'size' : 5}
        plt.rc('font', **font)
        row_idx = 0
        col_idx = 0
        #idfs = [(qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs]
        #idfs.sort(key=itemgetter(1))
        all_expected_maps = []
        if curve_fitting:
            all_fitting_results = [{'sr': [], 'ap':[], 'ap_diff':[]} for i in range(FittingModels().size())]
            all_fitting_performances = {}
        for qid in sorted(queries):
            if num_rows > 1:
                ax = axs[row_idx][col_idx]
            else:
                if num_cols > 1:
                    ax = axs[col_idx]
                else:
                    ax = axs
            col_idx += 1
            if col_idx >= num_cols:
                row_idx += 1
                col_idx = 0
            query_term = queries[qid]
            maxTF = cs.get_term_maxTF(query_term)
            #idf = math.log(cs.get_term_IDF1(query_term))
            #legend = 'idf:%.2f'%idf
            if performance_as_legend:
                legend = '\nAP:%.4f' % (p[qid]['map'] if p[qid] else 0)
            x_dict = {}
            qid_docs_len = 0
            #for row in cs.get_qid_details(qid):
            for row in doc_details.get_qid_details(qid):
                qid_docs_len += 1
                x = x_func(cs, row)
                if x > collection_level_maxX:
                    collection_level_maxX = x
                rel = (int(row['rel_score'])>=1)
                if x not in x_dict:
                    x_dict[x] = [0, 0] # [rel_docs, total_docs]
                if rel:
                    x_dict[x][0] += 1
                x_dict[x][1] += 1
                if x not in collection_x_dict:
                    collection_x_dict[x] = [0, 0] # [rel_docs, total_docs]
                if rel:
                    collection_x_dict[x][0] += 1
                collection_x_dict[x][1] += 1
            xaxis = x_dict.keys()
            xaxis.sort()
            if plot_ratio:
                yaxis = [x_dict[x][0]*1./x_dict[x][1] for x in xaxis]
            else:
                yaxis = [(x_dict[x][0]) if plot_rel_or_all else (x_dict[x][1]) for x in xaxis]
            ranking_list = [(x_dict[x][0], x_dict[x][1]) for x in xaxis]
            all_expected_maps.append(EMAP().cal_expected_map(ranking_list))

            if draw_individual:
                if np.sum(xaxis) == 0 or np.sum(yaxis) == 0:
                    continue
                raw_xaxis = copy.deepcopy(xaxis)
                xaxis = np.array(xaxis, dtype=np.float32)
                yaxis = np.array(yaxis, dtype=np.float32)
                if compact_x:
                    xaxis = range(1, len(xaxis)+1)
                if curve_fitting and not plot_ratio:
                    sum_yaxis = np.sum(yaxis)
                    yaxis /= sum_yaxis
                query_stat = cs.get_term_stats(query_term)
                zoom_xaxis = xaxis[zoom_x:]
                zoom_yaxis = yaxis[zoom_x:]
                ax, zoom_ax = self.plot_figure(ax, xaxis, yaxis, qid+'-'+query_term, legend,
                    drawline=drawline,
                    xlimit=xlimit,
                    ylimit=ylimit,
                    zoom=zoom_x > 0,
                    zoom_xaxis=zoom_xaxis,
                    zoom_yaxis=zoom_yaxis,
                    legend_markscale=0.5)
                if curve_fitting:
                    all_fittings = []
                    fitting_xaxis = []
                    fitting_yaxis = []
                    for i, ele in enumerate(yaxis):
                    #if ele != 0:
                        fitting_xaxis.append(xaxis[i])
                        fitting_yaxis.append(ele)
                    for j in range(1, FittingModels().size()+1):
                        fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                        if not fitting is None:
                            fitting_func_name = fitting[1]
                            all_fitting_results[j-1]['name'] = fitting_func_name
                            all_fitting_results[j-1]['sr'].append(fitting[4]) # sum of squared error
                            if re.search(r'^tf\d+$', _method):
                                estimated_map = CalEstMAP().cal_map(
                                    rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                    all_docs = [x_dict[x][1] for x in raw_xaxis],
                                    mode=1
                                )
                            else:
                                estimated_map = CalEstMAP().cal_map(
                                    rel_docs = np.rint(fitting[3]*sum_yaxis).astype(int),
                                    all_docs = [x_dict[x][1] for x in raw_xaxis],
                                    mode=1
                                )
                            all_fitting_results[j-1]['ap'].append(estimated_map) # average precision
                            actual_map = p[qid]['map'] if p[qid] else 0
                            all_fitting_results[j-1]['ap_diff'].append(math.fabs(estimated_map-actual_map))    
                            fitting.append(estimated_map)
                            fitting.append(math.fabs(estimated_map-actual_map))
                            all_fittings.append(fitting)
                            if fitting_func_name not in all_fitting_performances:
                                all_fitting_performances[fitting_func_name] = {}
                            all_fitting_performances[fitting_func_name][qid] = estimated_map
                            #print fitting[0], fitting[1], fitting[3]
                        else:
                            #print j, 'None'
                            pass
                    all_fittings.sort(key=itemgetter(4))
                    try:
                        print qid, query_term, all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                    except:
                        continue
                    fitted_y = [0 for i in range(len(xaxis))]
                    for x in xaxis:
                        if x in fitting_xaxis:
                            idx = fitting_xaxis.index(x)
                            fitted_y[idx] = all_fittings[0][3][idx]
                    best_fit_func_name = all_fittings[0][1]
                    all_fittings.sort(key=itemgetter(-1))
                    zoom_yaxis_fitting = fitted_y[zoom_x:]
                    self.plot_figure(ax, xaxis, fitted_y, qid+'-'+query_term, 
                        '%s\n%s(%.4f)' % (best_fit_func_name, all_fittings[0][1], all_fittings[0][-2]), 
                        drawline=True, 
                        linestyle='--',
                        zoom=zoom_x > 0,
                        zoom_ax = zoom_ax,
                        zoom_xaxis=zoom_xaxis,
                        zoom_yaxis=zoom_yaxis_fitting,
                        legend_pos='best',
                        xlimit=xlimit,
                        ylimit=ylimit,
                        legend_markscale=0.5)
        if draw_individual:
            output_fn = os.path.join(self.all_results_root, output_root, 
                '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-individual.%s' % (
                    collection_name, 
                    _method, 
                    'ratio' if plot_ratio else 'abscnt', 
                    'total' if plot_total_or_avg else 'avg',
                    'rel' if plot_rel_or_all else 'all',
                    'line' if drawline else 'dots', 
                    numbins, 
                    xlimit,
                    ylimit,
                    zoom_x, 
                    'compact' if compact_x else 'raw',
                    'fit' if curve_fitting else 'plain',
                    oformat) )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

            if curve_fitting:
                # plot the goodness of fit
                all_fitting_results = [ele for ele in all_fitting_results if 'name' in ele and ele['name'] not in ['AD']]
                goodness_fit_data = [ele['sr'] for ele in all_fitting_results if 'sr' in ele]
                labels = [ele['name'] for ele in all_fitting_results if 'sr' in ele and 'name' in ele]
                fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
                font = {'size' : 8}
                plt.rc('font', **font)
                ax.boxplot(goodness_fit_data, labels=labels)
                output_fn = os.path.join(self.all_results_root, output_root, 
                    '%s-%s-fitting.%s' % (collection_name, _method, oformat) )
                plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)
                # plot the AP diff
                ap_diff_data = [ele['ap_diff'] for ele in all_fitting_results if 'ap_diff' in ele]
                labels = [ele['name'] for ele in all_fitting_results if 'ap_diff' in ele and 'name' in ele]
                fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
                font = {'size' : 8}
                plt.rc('font', **font)
                ax.boxplot(ap_diff_data, labels=labels)
                output_fn = os.path.join(self.all_results_root, output_root, 
                    '%s-%s-apdiff.%s' % (collection_name, _method, oformat) )
                plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

        # draw the figure for the whole collection
        collection_vocablulary_stat = cs.get_vocabulary_stats()
        collection_vocablulary_stat_str = ''
        idx = 1
        for k,v in collection_vocablulary_stat.items():
            collection_vocablulary_stat_str += k+'='+'%.2f'%v+' '
            if idx == 3:
                collection_vocablulary_stat_str += '\n'
                idx = 1
            idx += 1

        fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, figsize=(6, 3.*1))
        font = {'size' : 8}
        plt.rc('font', **font)
        xaxis = collection_x_dict.keys()
        xaxis.sort()
        if plot_ratio:
            yaxis = [collection_x_dict[x][0]*1./collection_x_dict[x][1] for x in xaxis]
        else:
            if plot_total_or_avg:
                yaxis = [(collection_x_dict[x][0]) if plot_rel_or_all else (collection_x_dict[x][1]) for x in xaxis] 
            else:
                yaxis = [(collection_x_dict[x][0]/len(queries)) if plot_rel_or_all else (collection_x_dict[x][1]/len(queries)) for x in xaxis]
            #print np.sum(yaxis[20:]), np.sum(yaxis[20:])
        if numbins > 0:
            interval = collection_level_maxX*1.0/numbins
            newxaxis = [i for i in np.arange(0, collection_level_maxX+1e-10, interval)]
            newyaxis = [[0.0, 0.0] for x in newxaxis]
            for x in xaxis:
                newx = int(x / interval)
                newyaxis[newx][0] += collection_x_dict[x][0]
                newyaxis[newx][1] += collection_x_dict[x][1]
            xaxis = newxaxis
            if plot_ratio:
                yaxis = [ele[0]/ele[1] if ele[1] != 0 else 0.0 for ele in newyaxis]
            else:
                if plot_total_or_avg:
                    yaxis = [(ele[0]) if plot_rel_or_all else (ele[1]) for ele in newyaxis] 
                else:
                    yaxis = [(ele[0]/len(queries)) if plot_rel_or_all else (ele[1]/len(queries)) for ele in newyaxis]

        # we do not care about the actual values of x
        # so we just map the actual values to integer values
        return_data = copy.deepcopy(collection_x_dict)

        if curve_fitting:
            #### calculate the stats
            for fitting_func_name in all_fitting_performances:
                actual_maps = [p[qid]['map'] if p[qid] else 0 for qid in queries]
                estimated_maps = [all_fitting_performances[fitting_func_name][qid] if qid in all_fitting_performances[fitting_func_name] else 0 for qid in queries]
                print fitting_func_name, 
                print scipy.stats.pearsonr(actual_maps, estimated_maps),
                print scipy.stats.kendalltau(actual_maps, estimated_maps)
                print '-'*30

        if draw_all:
            if compact_x:
                xaxis = range(1, len(xaxis)+1)

            xaxis = np.array(xaxis, dtype=np.float32)
            yaxis = np.array(yaxis, dtype=np.float32)
            if curve_fitting and not plot_ratio:
                yaxis /= np.sum(yaxis)

            collection_legend = ''
            if performance_as_legend:
                collection_legend = '$MAP:%.4f$' % (np.mean([p[qid]['map'] if p[qid] else 0 for qid in queries]))
                #collection_legend += '\n$MAP_E:%.4f$' % (np.mean(all_expected_maps))

            zoom_xaxis = xaxis[zoom_x:]
            zoom_yaxis = yaxis[zoom_x:]
            axs, zoom_axs = self.plot_figure(axs, xaxis, yaxis, collection_name, collection_legend, 
                drawline=drawline,
                xlimit=xlimit,
                ylimit=ylimit,
                zoom=zoom_x > 0,
                zoom_xaxis=zoom_xaxis,
                zoom_yaxis=zoom_yaxis)

            if curve_fitting:
                all_fittings = []
                fitting_xaxis = []
                fitting_yaxis = []
                for i, ele in enumerate(yaxis):
                    if ele != 0:
                        fitting_xaxis.append(xaxis[i])
                        fitting_yaxis.append(ele)
                for j in range(1, FittingModels().size()+1):
                    fitting = FittingModels().cal_curve_fit(fitting_xaxis, fitting_yaxis, j)
                    if not fitting is None:
                        all_fittings.append(fitting)
                        #print fitting[0], fitting[1], fitting[3]
                    else:
                        #print j, 'None'
                        pass
                all_fittings.sort(key=itemgetter(4))
                if all_fittings:
                    print all_fittings[0][0], all_fittings[0][1], all_fittings[0][2], all_fittings[0][4]
                    fitted_y = [0 for i in range(len(xaxis))]
                    for x in xaxis:
                        if x in fitting_xaxis:
                            idx = fitting_xaxis.index(x)
                            fitted_y[idx] = all_fittings[0][3][idx]

                    zoom_yaxis_fitting = fitted_y[zoom_x:]
                    self.plot_figure(axs, xaxis, fitted_y, collection_name, all_fittings[0][1], 
                        drawline=True, 
                        linestyle='--',
                        zoom=zoom_x > 0,
                        zoom_ax = zoom_axs,
                        zoom_xaxis=zoom_xaxis,
                        zoom_yaxis=zoom_yaxis_fitting,
                        legend_pos='best',
                        xlimit=xlimit,
                        ylimit=ylimit)

            output_fn = os.path.join(self.all_results_root, output_root, 
                '%s-%s-%s-%s-%s-%s-%d-%.1f-%.1f-zoom%d-%s-%s-all.%s' % (
                    collection_name, 
                    _method, 
                    'ratio' if plot_ratio else 'abscnt', 
                    'total' if plot_total_or_avg else 'avg',
                    'rel' if plot_rel_or_all else 'all',
                    'line' if drawline else 'dots', 
                    numbins, 
                    xlimit,
                    ylimit,
                    zoom_x, 
                    'compact' if compact_x else 'raw',
                    'fit' if curve_fitting else 'plain',
                    oformat) )
            plt.savefig(output_fn, format=oformat, bbox_inches='tight', dpi=400)

        return collection_name, return_data