Ejemplo n.º 1
0
def test_train(test, train, y_test, y_train, labels=None, bins=25, node=0, plot_dir=None, weight_test=None, weight_train=None):
    """Compare train and test score distributions of one output node, per class.

    For every class a two-sample Kolmogorov-Smirnov test is run between the
    train and test scores of column ``node``; both distributions are drawn as
    normalized step histograms (train solid, test dashed) on a new figure.

    Args:
        test, train: score arrays, indexed as ``[:, node]``.
        y_test, y_train: per-row class index; rows with value ``i`` belong to
            ``labels[i]``.
        labels: class names; ``None`` (default) is treated as no classes.
        bins: binning forwarded to ``Hist1D``.
        node: column of the score arrays to compare.
        plot_dir: if truthy, the figure is written there as
            ``score_node_<node>.png`` and ``score_node_<node>.pdf``.
        weight_test, weight_train: optional per-row weights. ``None`` means
            unweighted (previously the default ``None`` raised a TypeError
            because it was indexed unconditionally).

    Returns:
        dict mapping each label to ``(p_value, ks_statistic)``.
    """
    labels = [] if labels is None else labels
    ks = {}

    fig, ax = plt.subplots(1,1,figsize=(10,10))

    for i, label in enumerate(labels):
        sel_test = (y_test==i)
        sel_train = (y_train==i)
        score_test = test[:,node][sel_test]
        score_train = train[:,node][sel_train]

        # Passing an array as second argument makes kstest a two-sample test.
        _ks, _p = scipy.stats.kstest(score_train, score_test)

        ks[label] = (_p, _ks)

        # Only forward weights when the caller supplied them.
        kw_test = {} if weight_test is None else {'weights': weight_test[sel_test]}
        kw_train = {} if weight_train is None else {'weights': weight_train[sel_train]}

        h_test = Hist1D(score_test, bins=bins, **kw_test).normalize()
        h_train = Hist1D(score_train, bins=bins, label=label+' (p=%.2f, KS=%.2f)'%(_p, _ks), **kw_train).normalize()

        h_test.plot(color=colors[i], histtype="step", ls='--', linewidth=2)
        h_train.plot(color=colors[i], histtype="step", linewidth=2)

    if plot_dir:
        finalizePlotDir(plot_dir)
        fig.savefig("{}/score_node_{}.png".format(plot_dir, node))
        fig.savefig("{}/score_node_{}.pdf".format(plot_dir, node))

    return ks
Ejemplo n.º 2
0
def test_train_cat(test, train, y_test, y_train, labels=None, n_cat=5, plot_dir=None, weight_test=None, weight_train=None):
    """Compare train and test predicted-category distributions, per class.

    The predicted category of each row is ``argmax(axis=1)`` of the score
    array. For every class a two-sample Kolmogorov-Smirnov test is run between
    the train and test categories, and both are drawn as normalized step
    histograms (train solid, test dashed) with one bin per category.

    Args:
        test, train: score arrays providing ``argmax(axis=1)``.
        y_test, y_train: per-row class index; rows with value ``i`` belong to
            ``labels[i]``.
        labels: class names; ``None`` (default) is treated as no classes.
        n_cat: number of categories; bins are centred on ``0 .. n_cat-1``.
        plot_dir: if truthy, the figure is written there as
            ``categories.png`` and ``categories.pdf``.
        weight_test, weight_train: optional per-row weights. ``None`` means
            unweighted (previously the default ``None`` raised a TypeError
            because it was indexed unconditionally).

    Returns:
        dict mapping each label to ``(p_value, ks_statistic)``.
    """
    labels = [] if labels is None else labels
    ks = {}
    bins = [x-0.5 for x in range(n_cat+1)]

    fig, ax = plt.subplots(1,1,figsize=(10,10))

    # The predicted category does not depend on the class, so compute it once.
    cat_test = test.argmax(axis=1)
    cat_train = train.argmax(axis=1)

    for i, label in enumerate(labels):
        sel_test = (y_test==i)
        sel_train = (y_train==i)

        # Passing an array as second argument makes kstest a two-sample test.
        _ks, _p = scipy.stats.kstest(cat_train[sel_train], cat_test[sel_test])

        ks[label] = (_p, _ks)

        # Only forward weights when the caller supplied them.
        kw_test = {} if weight_test is None else {'weights': weight_test[sel_test]}
        kw_train = {} if weight_train is None else {'weights': weight_train[sel_train]}

        h_test = Hist1D(cat_test[sel_test], bins=bins, **kw_test).normalize()
        h_train = Hist1D(cat_train[sel_train], bins=bins, label=label+' (p=%.2f, KS=%.2f)'%(_p, _ks), **kw_train).normalize()

        h_test.plot(color=colors[i], histtype="step", ls='--', linewidth=2)
        h_train.plot(color=colors[i], histtype="step", linewidth=2)

    ax.set_ylabel('a.u.')
    ax.set_xlabel('category')

    ax.set_ylim(0,1/n_cat*5)

    if plot_dir:
        finalizePlotDir(plot_dir)
        fig.savefig("{}/categories.png".format(plot_dir))
        fig.savefig("{}/categories.pdf".format(plot_dir))

    return ks
Ejemplo n.º 3
0
def saveFig( fig, ax, rax, path, name, scale='linear', shape=False, y_max=-1 ):
    """Finalize the styling of a figure and save it as PDF and PNG.

    The files are written to ``<path>/<scale>/<name>.pdf`` and ``.png``.

    Args:
        fig: figure to annotate and save.
        ax: main axes; gets the y-scale, y-label, legend and header text.
        rax: ratio axes or a falsy value; when truthy it is labelled
            'Obs./Pred.' and limited to (0.5, 1.5).
        path: base output directory.
        name: file name without extension.
        scale: y-axis scale passed to ``ax.set_yscale``; also the name of the
            output sub-directory.
        shape: if True, use the fixed y-range for shape plots on non-linear
            scales.
        y_max: maximum used to derive the upper y-limit on non-linear scales;
            a negative value (default) leaves the range untouched unless
            ``shape`` is set.
    """
    outdir = os.path.join(path,scale)
    finalizePlotDir(outdir)
    ax.set_yscale(scale)
    ax.set_ylabel('Events')

    # On a linear scale the y-range is left to matplotlib; the explicit limit
    # that used to live here was disabled (unreachable) in the original code.
    if scale != 'linear' and not (y_max<0 and not shape):
        ax.set_ylim(0.000005 if shape else 0.05, 3 if shape else 300*y_max)

    handles, labels = ax.get_legend_handles_labels()
    new_labels = []
    WHcount = 0
    for handle, label in zip(handles, labels):
        # A second (and any further) 'WH' entry is the alternative signal point.
        repeated_WH = (label=='WH' and WHcount>0)
        if label=='WH':
            WHcount+=1

        if repeated_WH:
            text = 'WH (1000,0)'
        else:
            try:
                text = my_labels[label]
            except KeyError:
                # Fall back to the raw label so that handles and labels stay
                # paired one-to-one; dropping the label would shift every
                # following legend entry onto the wrong handle.
                text = label
        new_labels.append(text)

        # Recoloring is best-effort: unknown labels and handles without a
        # set_color method keep their current color.
        try:
            if repeated_WH:
                handle.set_color('#000000')
            elif not label=='pseudodata':
                handle.set_color(colors[label])
        except (KeyError, AttributeError):
            pass

    if rax:
        plt.subplots_adjust(hspace=0)
        rax.set_ylabel('Obs./Pred.')
        rax.set_ylim(0.5,1.5)

    ax.legend(title='',ncol=2,handles=handles, labels=new_labels, frameon=False)

    fig.text(0.15, 0.995, '$\\bf{CMS}$', fontsize=20,  horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )
    fig.text(0.30, 1., '$\\it{Simulation}$', fontsize=14, horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )
    fig.text(0.8, 1., '13 TeV', fontsize=14, horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )

    fig.savefig(os.path.join(outdir, "{}.pdf".format(name)))
    fig.savefig(os.path.join(outdir, "{}.png".format(name)))
Ejemplo n.º 4
0
def get_cat_plot(X, y, labels=None, n_cat=5, plot_dir=None, weight=None):
    """Plot the (un-normalized) predicted-category distribution per class.

    The predicted category of each row is ``X.argmax(axis=1)``. One step
    histogram per class is drawn on a new figure with a fixed y-range of
    (0, 200).

    Args:
        X: score array providing ``argmax(axis=1)``.
        y: per-row class index; rows with value ``i`` belong to ``labels[i]``.
        labels: class names; ``None`` (default) is treated as no classes.
        n_cat: number of categories; bins are centred on ``0 .. n_cat-1``.
        plot_dir: if truthy, the figure is written there as
            ``abs_categories.png`` and ``abs_categories.pdf``.
        weight: optional per-row weights. ``None`` means unweighted
            (previously the default ``None`` raised a TypeError because it was
            indexed unconditionally).
    """
    labels = [] if labels is None else labels
    bins = [x-0.5 for x in range(n_cat+1)]

    fig, ax = plt.subplots(1,1,figsize=(10,10))

    # The predicted category does not depend on the class, so compute it once.
    categories = X.argmax(axis=1)

    for i, label in enumerate(labels):
        selection = (y==i)

        # Only forward weights when the caller supplied them.
        kw = {} if weight is None else {'weights': weight[selection]}

        h_cat = Hist1D(categories[selection], bins=bins, label=label, **kw)
        h_cat.plot(color=colors[i], histtype="step", linewidth=2)

    ax.set_ylabel('a.u.')
    ax.set_xlabel('category')

    ax.set_ylim(0,200)

    if plot_dir:
        finalizePlotDir(plot_dir)
        fig.savefig("{}/abs_categories.png".format(plot_dir))
        fig.savefig("{}/abs_categories.pdf".format(plot_dir))
Ejemplo n.º 5
0
def makePlot(output,
             histo,
             axis,
             bins=None,
             data=[],
             normalize=True,
             log=False,
             save=False,
             axis_label=None,
             ratio_range=None,
             upHists=[],
             downHists=[],
             shape=False,
             ymax=False,
             new_colors=colors,
             new_labels=my_labels,
             order=None,
             signals=[],
             omit=[],
             lumi=60.0,
             binwnorm=None,
             overlay=None,
             use_label=True,
             y_axis_label='Events'):
    """Draw a 1D histogram of backgrounds with optional data, signals and ratio.

    Backgrounds are every dataset not listed in ``data``, ``signals`` or
    ``omit``. They are drawn stacked, or as unit-area outlines when ``shape``
    is set. If ``data`` is given, the summed data is overlaid and a
    data/prediction ratio panel is added below.

    Args:
        output: mapping of histogram names to histograms, or the histogram
            itself when ``histo`` is None.
        histo: key into ``output``, or None to use ``output`` directly.
        axis: name of the axis to project on (together with 'dataset').
        bins: optional new binning passed to ``rebin``.
        data: dataset names treated as data (list is never mutated).
        normalize: scale every non-data process by Data/MC. Only applied when
            ``data`` is non-empty.
        log: use a logarithmic y-axis.
        save: output path without extension, or a falsy value to not save.
        axis_label: x-axis label.
        ratio_range: ``(low, high)`` for the ratio panel; default (0.1, 1.9).
        upHists, downHists: suffixes of systematic variations, looked up as
            ``output[histo + '_' + suffix]``.
        shape: normalize every process to unit area and draw unstacked.
        ymax: explicit upper y-limit; falsy to derive it from the histogram.
        new_colors, new_labels: mappings from dataset name to color / legend
            text.
        order: stacking order; defaults to the list of processes.
        signals: dataset names drawn as unstacked lines.
        omit: dataset names excluded from the background selection.
        lumi: luminosity shown in the header, in fb^-1.
        binwnorm: forwarded to ``hist.plot1d``; when truthy no uncertainty
            bands are added.
        overlay: optional extra histogram drawn as unstacked lines.
        use_label: draw the CMS / luminosity header.
        y_axis_label: y-axis label.
    """

    if save:
        finalizePlotDir('/'.join(save.split('/')[:-1]))

    data_sel = re.compile('|'.join(data))
    # Negative lookahead: select every dataset that is not data/signal/omitted.
    excluded = data + signals + omit
    bkg_sel = re.compile('(?!(%s))' % ('|'.join(excluded))) if excluded else re.compile('')

    source = output if histo is None else output[histo]
    processes = [p[0] for p in source.values().keys() if not p[0] in data]
    histogram = source.copy()

    histogram = histogram.project(axis, 'dataset')
    if overlay: overlay = overlay.project(axis, 'dataset')
    if bins:
        histogram = histogram.rebin(axis, bins)
        if overlay: overlay = overlay.rebin(axis, bins)

    bkg_values = histogram[bkg_sel].sum("dataset").values(overflow='over')[()]
    y_max = bkg_values.max()
    print(y_max)
    MC_total = bkg_values.sum()
    Data_total = 0
    if data:
        Data_total = histogram[data_sel].sum("dataset").values(
            overflow='over')[()].sum()

    print("Data:", round(Data_total, 0), "MC:", round(MC_total, 2))

    # Only normalize to data when there is data. The original tested the
    # compiled regex `data_sel`, which is always truthy, so without data every
    # process was scaled by 0 / MC_total = 0.
    if normalize and data:
        scales = {process: Data_total / MC_total for process in processes}
        histogram.scale(scales, axis='dataset')
    else:
        scales = {}

    if shape:
        scales = {
            process: 1 / histogram[process].sum("dataset").values(
                overflow='over')[()].sum()
            for process in processes
        }
        histogram.scale(scales, axis='dataset')

    if data:
        fig, (ax, rax) = plt.subplots(2,
                                      1,
                                      figsize=(10, 10),
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)
    else:
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))

    # Options shared by everything that is drawn as unstacked lines.
    line_kwargs = dict(overlay="dataset",
                       stack=False,
                       overflow='over',
                       clear=False,
                       line_opts=line_opts,
                       fill_opts=None,
                       binwnorm=binwnorm)

    for sig in signals:
        ax = hist.plot1d(histogram[sig], ax=ax, **line_kwargs)
    if overlay:
        ax = hist.plot1d(overlay, ax=ax, **line_kwargs)

    if shape:
        ax = hist.plot1d(histogram[bkg_sel], ax=ax, **line_kwargs)
    else:
        ax = hist.plot1d(histogram[bkg_sel],
                         overlay="dataset",
                         ax=ax,
                         stack=True,
                         overflow='over',
                         clear=False,
                         line_opts=None,
                         fill_opts=fill_opts,
                         order=(order if order else processes),
                         binwnorm=binwnorm)
    if data:
        ax = hist.plot1d(histogram[data_sel].sum("dataset"),
                         ax=ax,
                         overflow='over',
                         error_opts=data_err_opts,
                         clear=False,
                         binwnorm=binwnorm)

        hist.plotratio(
            num=histogram[data_sel].sum("dataset"),
            denom=histogram[bkg_sel].sum("dataset"),
            ax=rax,
            error_opts=data_err_opts,
            denom_fill_opts=
            None,  # triggers this: https://github.com/CoffeaTeam/coffea/blob/master/coffea/hist/plot.py#L376
            guide_opts={},
            unc='num',
            overflow='over')

    handles, labels = ax.get_legend_handles_labels()
    updated_labels = []
    for handle, label in zip(handles, labels):
        # The summed data histogram has no dataset name and shows up as None.
        is_observation = label is None or label == 'None'
        if is_observation:
            text = "Observation"
        else:
            try:
                text = new_labels[label]
            except KeyError:
                # Fall back to the raw label so that handles and labels stay
                # paired one-to-one; dropping the label would shift every
                # following legend entry onto the wrong handle.
                text = label
        updated_labels.append(text)

        # Recoloring is best-effort: unknown labels and handles without a
        # set_color method keep their current color.
        try:
            handle.set_color('#000000' if is_observation else new_colors[label])
        except (KeyError, AttributeError):
            pass

    if data:
        if ratio_range:
            rax.set_ylim(*ratio_range)
        else:
            rax.set_ylim(0.1, 1.9)
        rax.set_ylabel('Obs./Pred.')
        if axis_label:
            rax.set_xlabel(axis_label)

    ax.set_xlabel(axis_label)
    ax.set_ylabel(y_axis_label)

    if not binwnorm:
        # Main panel gets a band unless plotting shapes; ratio panel whenever
        # it exists.
        panels = []
        if not shape:
            panels.append((ax, False))
        if data:
            panels.append((rax, True))
        for panel, is_ratio in panels:
            addUncertainties(panel,
                             axis,
                             histogram,
                             bkg_sel,
                             [output[histo + '_' + x] for x in upHists],
                             [output[histo + '_' + x] for x in downHists],
                             overflow='over',
                             rebin=bins,
                             ratio=is_ratio,
                             scales=scales)

    if log:
        ax.set_yscale('log')

    # Head-room factor above the tallest bin (much larger on a log axis).
    y_mult = 1.7 if not log else 100

    if ymax:
        ax.set_ylim(0.01, ymax)
    else:
        y_max = y_max * y_mult * (Data_total /
                                  MC_total) if data else y_max * y_mult
        ax.set_ylim(0.01, y_max if not shape else 2)

    ax.legend(
        loc='upper right',
        ncol=2,
        borderaxespad=0.0,
        labels=updated_labels,
        handles=handles,
    )
    plt.subplots_adjust(hspace=0)

    if use_label:
        header = '$\\bf{CMS}$ Preliminary' if len(data) > 0 else '$\\bf{CMS}$ Simulation'
        fig.text(0.0,
                 0.995,
                 header,
                 fontsize=25,
                 horizontalalignment='left',
                 verticalalignment='bottom',
                 transform=ax.transAxes)
        fig.text(0.6,
                 0.995,
                 r'$%.1f\ fb^{-1}$ (13 TeV)' % (lumi),
                 fontsize=25,
                 horizontalalignment='left',
                 verticalalignment='bottom',
                 transform=ax.transAxes)

    # The Data/MC ratio is only meaningful when data was actually provided.
    if normalize and data:
        fig.text(0.55,
                 0.65,
                 'Data/MC = %s' % round(Data_total / MC_total, 2),
                 fontsize=20,
                 horizontalalignment='left',
                 verticalalignment='bottom',
                 transform=ax.transAxes)

    if save:
        fig.savefig("{}.pdf".format(save))
        fig.savefig("{}.png".format(save))
        print("Figure saved in:", save)
Ejemplo n.º 6
0
def makePlot(output, histo, axis, bins=None, mc_sel=bkgonly, data_sel='MuonEG', normalize=True, log=False, save=False, axis_label=None, ratio_range=None, upHists=[], downHists=[], shape=False, ymax=False, new_colors=colors, new_labels=my_labels, order=None):
    """Draw a 1D histogram of simulation with optional data and ratio panel.

    The datasets selected by ``mc_sel`` are drawn stacked, or as unit-area
    outlines when ``shape`` is set. If ``data_sel`` is truthy the selected
    data is overlaid and a data/prediction ratio panel is added below.

    Args:
        output: mapping of histogram names to histograms, or the histogram
            itself when ``histo`` is None.
        histo: key into ``output``, or None to use ``output`` directly.
        axis: name of the axis to project on (together with 'dataset').
        bins: optional new binning passed to ``rebin``.
        mc_sel: selection used to index the simulated datasets.
        data_sel: selection used to index the data; falsy to plot without data.
        normalize: scale every process by Data/MC (only when ``data_sel`` is
            truthy).
        log: use a logarithmic y-axis.
        save: output path without extension, or a falsy value to not save.
        axis_label: x-axis label.
        ratio_range: ``(low, high)`` for the ratio panel; default (0.1, 1.9).
        upHists, downHists: suffixes of systematic variations, looked up as
            ``output[histo + '_' + suffix]`` (lists are never mutated).
        shape: normalize every process to unit area and draw unstacked.
        ymax: explicit upper y-limit; falsy to derive it from the histogram.
        new_colors, new_labels: mappings from dataset name to color / legend
            text.
        order: stacking order; defaults to the list of processes.
    """

    if save:
        finalizePlotDir( '/'.join(save.split('/')[:-1]) )

    source = output if histo is None else output[histo]
    # NOTE: the data dataset name is hard-coded here, independent of data_sel.
    processes = [ p[0] for p in source.values().keys() if not p[0]=='MuonEG' ]
    histogram = source.copy()

    histogram = histogram.project(axis, 'dataset')
    if bins:
        histogram = histogram.rebin(axis, bins)

    mc_values = histogram[mc_sel].sum("dataset").values(overflow='over')[()]
    y_max = mc_values.max()

    MC_total = mc_values.sum()
    Data_total = 0
    if data_sel:
        Data_total = histogram[data_sel].sum("dataset").values(overflow='over')[()].sum()

    print ("Data:", round(Data_total,0), "MC:", round(MC_total,2))

    if normalize and data_sel:
        scales = { process: Data_total/MC_total for process in processes }
        histogram.scale(scales, axis='dataset')
    else:
        scales = {}

    if shape:
        scales = { process: 1/histogram[process].sum("dataset").values(overflow='over')[()].sum() for process in processes }
        histogram.scale(scales, axis='dataset')

    if data_sel:
        fig, (ax, rax) = plt.subplots(2,1,figsize=(10,10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True)
    else:
        fig, ax  = plt.subplots(1,1,figsize=(10,10) )

    if shape:
        ax = hist.plot1d(histogram[mc_sel], overlay="dataset", ax=ax, stack=False, overflow='over', clear=False, line_opts=line_opts, fill_opts=None)
    else:
        ax = hist.plot1d(histogram[mc_sel], overlay="dataset", ax=ax, stack=True, overflow='over', clear=False, line_opts=None, fill_opts=fill_opts, order=(order if order else processes))
    if data_sel:
        ax = hist.plot1d(histogram[data_sel], overlay="dataset", ax=ax, overflow='over', error_opts=data_err_opts, clear=False)

        hist.plotratio(
                num=histogram[data_sel].sum("dataset"),
                denom=histogram[mc_sel].sum("dataset"),
                ax=rax,
                error_opts=data_err_opts,
                denom_fill_opts=None, # triggers this: https://github.com/CoffeaTeam/coffea/blob/master/coffea/hist/plot.py#L376
                guide_opts={},
                unc='num',
                overflow='over'
        )

    handles, labels = ax.get_legend_handles_labels()
    updated_labels = []
    for handle, label in zip(handles, labels):
        try:
            text = new_labels[label]
        except KeyError:
            # Fall back to the raw label so that handles and labels stay
            # paired one-to-one; dropping the label would shift every
            # following legend entry onto the wrong handle.
            text = label
        updated_labels.append(text)

        # Data keeps its own style; everything else is recolored best-effort
        # (unknown labels and handles without set_color are left unchanged).
        if not label=='MuonEG':
            try:
                handle.set_color(new_colors[label])
            except (KeyError, AttributeError):
                pass

    if data_sel:
        if ratio_range:
            rax.set_ylim(*ratio_range)
        else:
            rax.set_ylim(0.1,1.9)
        rax.set_ylabel('Obs./Pred.')
        if axis_label:
            rax.set_xlabel(axis_label)

    ax.set_xlabel(axis_label)
    ax.set_ylabel('Events')

    if not shape:
        addUncertainties(ax, axis, histogram, mc_sel, [output[histo+'_'+x] for x in upHists], [output[histo+'_'+x] for x in downHists], overflow='over', rebin=bins, ratio=False, scales=scales)

    if data_sel:
        addUncertainties(rax, axis, histogram, mc_sel, [output[histo+'_'+x] for x in upHists], [output[histo+'_'+x] for x in downHists], overflow='over', rebin=bins, ratio=True, scales=scales)

    if log:
        ax.set_yscale('log')

    # Head-room factor above the tallest bin (much larger on a log axis).
    y_mult = 1.3 if not log else 100
    if ymax:
        ax.set_ylim(0.01, ymax)
    else:
        ax.set_ylim(0.01,y_max*y_mult if not shape else 2)

    ax.legend(
        loc='upper right',
        ncol=2,
        borderaxespad=0.0,
        labels=updated_labels,
        handles=handles,
    )
    plt.subplots_adjust(hspace=0)

    fig.text(0.0, 0.995, '$\\bf{CMS}$ Preliminary', fontsize=20,  horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )
    fig.text(0.8, 0.995, '13 TeV', fontsize=20,  horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )

    # The Data/MC ratio is only meaningful when data was actually selected;
    # without data Data_total is 0 and the annotation would always read 0.
    if normalize and data_sel:
        fig.text(0.55, 0.65, 'Data/MC = %s'%round(Data_total/MC_total,2), fontsize=20,  horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )

    if save:
        fig.savefig("{}.pdf".format(save))
        fig.savefig("{}.png".format(save))
        print ("Figure saved in:", save)