Ejemplo n.º 1
0
def prepare(baseDir,
            particle,
            probe,
            resonance,
            era,
            config,
            num,
            denom,
            variableLabels,
            skipPlots=False,
            cutAndCount=False):
    """Build scale-factor and efficiency histograms for one efficiency.

    For every bin of the N-dimensional binning (1 to 3 variables) the
    scale factor, data efficiency and MC efficiency are obtained from
    ``getSF``/``getSyst`` (or their ``_cutAndCount`` variants) and stored
    in ROOT histograms carrying the total, statistical-only and
    systematic-only uncertainties, plus one pair of histograms per
    configured systematic.  The scale factors are also collected in a
    nested dict that is dumped as JSON.

    Outputs are written below ``baseDir``:

    * ``efficiencies/<particle>/<probe>/<resonance>/<era>/<effName>/``
      receives ``<extEffName>.json`` and ``<extEffName>.root``;
    * ``plots/<particle>/<probe>/<resonance>/<era>/<effName>/efficiency/``
      receives the png/pdf/root canvases (unless ``skipPlots`` is set).

    Args:
        baseDir: Root directory of the input files (``fits_data``,
            ``flat``) and of the outputs listed above.
        particle, probe, resonance, era: Used as path components and
            forwarded to ``registry.luminosity`` /
            ``get_data_mc_sub_eras``.
        config: Object providing ``binning()``, ``variables()`` and
            ``get('systematics', default)``.
        num, denom: Numerator and denominator labels; forwarded to the
            naming helpers and shown as ``'num / denom'`` in the legends.
        variableLabels: Sequence of one to three keys of
            ``config.binning()``; one histogram axis per entry.
        skipPlots: When true no canvas is drawn or saved; only the JSON
            and ROOT files are produced.
        cutAndCount: When true use ``getSF_cutAndCount`` and
            ``getSyst_cutAndCount`` on the ``flat`` files instead of the
            fit results.

    Raises:
        NotImplementedError: If more than three variables are requested.
        OSError: If the output ROOT file cannot be opened for writing.
    """
    subEra = era.split('_')[0]  # data subera is beginning of era
    lumi = registry.luminosity(particle, probe, resonance, era, subEra)
    hists = {}

    effName = get_eff_name(num, denom)
    extEffName = get_extended_eff_name(num, denom, variableLabels)
    binning = config.binning()
    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    kinds = ['SF', 'dataEff', 'mcEff']
    systList = config.get('systematics', {
        x: {
            'fitTypes': [],
            'shiftTypes': []
        }
        for x in kinds
    })

    def get_variable_name_pretty(variableLabel):
        # fall back to the raw label when no 'pretty' name is configured
        variables = config.variables()
        return variables.get(variableLabel, {}).get('pretty', variableLabel)

    # create output histograms
    nVars = len(variableLabels)
    if nVars == 1:
        THX = ROOT.TH1F
    elif nVars == 2:
        THX = ROOT.TH2F
    elif nVars == 3:
        THX = ROOT.TH3F
    else:
        raise NotImplementedError(
            'More than 3 dimensions are not supported for scale factors')

    # names of all systematics (fit types first, then shift types) per kind
    systNames = {
        kind: list(
            itertools.chain(systList[kind]['fitTypes'],
                            systList[kind]['shiftTypes']))
        for kind in kinds
    }

    def set_value_title(h, title):
        # The axis that displays the bin content is Y for 1-D and Z for 2-D
        # histograms.  For 3-D histograms all three axes are variable axes,
        # so none of them may be relabelled.
        if nVars == 1:
            h.GetYaxis().SetTitle(title)
        elif nVars == 2:
            h.GetZaxis().SetTitle(title)

    hargs = [extEffName, extEffName]
    for variableLabel in variableLabels:
        hargs += [
            len(binning[variableLabel]) - 1,
            array('d', binning[variableLabel])
        ]
    hist = THX(*hargs)
    axes = [hist.GetXaxis(), hist.GetYaxis(), hist.GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):
        axes[vi].SetTitle(get_variable_name_pretty(variableLabel))
    if nVars == 2:
        hist.SetOption('colz')
    set_value_title(hist, 'Scalefactor')
    hist_stat = hist.Clone(extEffName + '_stat')
    hist_syst = hist.Clone(extEffName + '_syst')
    histList_syst = {
        'combined_syst': hist.Clone(extEffName + '_combined_syst'),
    }
    set_value_title(histList_syst['combined_syst'], 'Uncertainty')

    hist_dataEff = hist.Clone(extEffName + '_efficiencyData')
    set_value_title(hist_dataEff, 'Efficiency')
    hist_dataEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyData_stat')
    hist_dataEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyData_syst')
    histList_dataEff_syst = {
        'combined_syst':
        hist_dataEff.Clone(extEffName + '_efficiencyData_combined_syst'),
    }
    set_value_title(histList_dataEff_syst['combined_syst'], 'Uncertainty')
    hist_mcEff = hist_dataEff.Clone(extEffName + '_efficiencyMC')
    hist_mcEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyMC_stat')
    hist_mcEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyMC_syst')
    histList_mcEff_syst = {
        'combined_syst':
        hist_dataEff.Clone(extEffName + '_efficiencyMC_combined_syst'),
    }
    set_value_title(histList_mcEff_syst['combined_syst'], 'Uncertainty')

    # the individual systematics: for each one a histogram holding the
    # shifted value and a '<name>_syst' histogram holding the uncertainty
    for kind, template, histList in (
        ('SF', hist, histList_syst),
        ('dataEff', hist_dataEff, histList_dataEff_syst),
        ('mcEff', hist_mcEff, histList_mcEff_syst),
    ):
        for iSyst in systNames[kind]:
            histList[iSyst] = template.Clone(extEffName + '_' + iSyst)
            histList[iSyst + '_syst'] = template.Clone(extEffName + '_' +
                                                       iSyst + '_syst')
            set_value_title(histList[iSyst + '_syst'], 'Uncertainty')

    varName = get_variables_name(variableLabels)

    def set_bin(hist, index, val, err):
        # a negative error means "leave the bin error untouched"
        index = list(index)
        hist.SetBinContent(*(index + [val]))
        if err >= 0:
            hist.SetBinError(*(index + [err]))

    # key of the per-systematic result dict that holds the error of a kind
    errKeys = {'SF': 'err', 'dataEff': 'dataErr', 'mcEff': 'mcErr'}

    # the cut-and-count inputs do not depend on the bin
    dataFNameCNC = os.path.join(baseDir, 'flat', particle, probe, resonance,
                                era, dataSubEra, 'Nominal',
                                extEffName + '.root')
    mcFNameCNC = os.path.join(baseDir, 'flat', particle, probe, resonance,
                              era, mcSubEra, 'Nominal', extEffName + '.root')

    # iterate through the bin indices
    # this does nested for loops of the N-D binning (e.g. pt, eta)
    # binning starts at 1 (0 is underflow), same as ROOT
    indices = [
        list(range(1, len(binning[variableLabel])))
        for variableLabel in variableLabels
    ]
    # the binning definitions are stored next to the per-bin results
    output = {
        effName: {
            varName: {
                'binning': [{
                    'variable': vl,
                    'binning': binning[vl].tolist(),
                } for vl in variableLabels],
            }
        }
    }
    for index in itertools.product(*indices):
        binName = get_full_name(num, denom, variableLabels, index)

        # descend into (creating on the way) one nested dict per variable,
        # keyed by 'variable:[low,high]'
        _out = output[effName][varName]
        for i, ind in enumerate(index):
            subVarKey = '{}:[{},{}]'.format(variableLabels[i],
                                            binning[variableLabels[i]][ind - 1],
                                            binning[variableLabels[i]][ind])
            if subVarKey not in _out:
                _out[subVarKey] = {}
            _out = _out[subVarKey]

        # the fitted distributions
        fitType = 'Nominal'
        dataFNameFit = os.path.join(baseDir, 'fits_data', particle, probe,
                                    resonance, era, fitType, effName,
                                    binName + '.root')
        fitTypes = set(systList['SF']['fitTypes'] +
                       systList['dataEff']['fitTypes'] +
                       systList['mcEff']['fitTypes'])
        shiftTypes = set(systList['SF']['shiftTypes'] +
                         systList['dataEff']['shiftTypes'] +
                         systList['mcEff']['shiftTypes'])
        if cutAndCount:
            sf, sf_stat, dataEff, dataStat, mcEff, mcStat = getSF_cutAndCount(
                binName, dataFNameCNC, mcFNameCNC)
            sf_syst = getSyst_cutAndCount(binName, dataFNameCNC, mcFNameCNC,
                                          fitTypes, shiftTypes)
        else:
            sf, sf_stat, dataEff, dataStat, mcEff, mcStat = getSF(
                binName, dataFNameFit)
            sf_syst = getSyst(binName, dataFNameFit, fitTypes, shiftTypes)

        # quadrature sum of the individual systematics of each kind
        combined_syst = {
            kind: sum(sf_syst[t][errKeys[kind]]**2
                      for t in systNames[kind])**0.5
            for kind in kinds
        }

        sf_err = (sf_stat**2 + combined_syst['SF']**2)**0.5
        dataErr = (dataStat**2 + combined_syst['dataEff']**2)**0.5
        mcErr = (mcStat**2 + combined_syst['mcEff']**2)**0.5
        _out['value'] = sf
        _out['stat'] = sf_stat
        _out['syst'] = combined_syst['SF']
        for s in systNames['SF']:
            _out[s] = sf_syst[s]['err']

        set_bin(hist, index, sf, sf_err)
        set_bin(hist_stat, index, sf, sf_stat)
        set_bin(hist_syst, index, sf, combined_syst['SF'])
        set_bin(histList_syst['combined_syst'], index, combined_syst['SF'], -1)
        set_bin(hist_dataEff, index, dataEff, dataErr)
        set_bin(hist_dataEff_stat, index, dataEff, dataStat)
        set_bin(hist_dataEff_syst, index, dataEff, combined_syst['dataEff'])
        set_bin(histList_dataEff_syst['combined_syst'], index,
                combined_syst['dataEff'], -1)
        set_bin(hist_mcEff, index, mcEff, mcErr)
        set_bin(hist_mcEff_stat, index, mcEff, mcStat)
        set_bin(hist_mcEff_syst, index, mcEff, combined_syst['mcEff'])
        set_bin(histList_mcEff_syst['combined_syst'], index,
                combined_syst['mcEff'], -1)
        for iKey in sf_syst:
            if iKey in histList_syst:
                set_bin(histList_syst[iKey], index, sf_syst[iKey]['sf'],
                        sf_syst[iKey]['err'])
                set_bin(histList_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['err'], -1)

            if iKey in histList_dataEff_syst:
                set_bin(histList_dataEff_syst[iKey], index,
                        sf_syst[iKey]['dataEff'], sf_syst[iKey]['dataErr'])
                set_bin(histList_dataEff_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['dataErr'], -1)

            if iKey in histList_mcEff_syst:
                set_bin(histList_mcEff_syst[iKey], index,
                        sf_syst[iKey]['mcEff'], sf_syst[iKey]['mcErr'])
                set_bin(histList_mcEff_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['mcErr'], -1)

    # collect everything under the key it is written with
    hists[extEffName] = hist
    hists[extEffName + '_stat'] = hist_stat
    hists[extEffName + '_syst'] = hist_syst
    hists[extEffName + '_efficiencyData'] = hist_dataEff
    hists[extEffName + '_efficiencyData_stat'] = hist_dataEff_stat
    hists[extEffName + '_efficiencyData_syst'] = hist_dataEff_syst
    hists[extEffName + '_efficiencyMC'] = hist_mcEff
    hists[extEffName + '_efficiencyMC_stat'] = hist_mcEff_stat
    hists[extEffName + '_efficiencyMC_syst'] = hist_mcEff_syst
    for prefix, histList in (
        (extEffName + '_', histList_syst),
        (extEffName + '_efficiencyData_', histList_dataEff_syst),
        (extEffName + '_efficiencyMC_', histList_mcEff_syst),
    ):
        for iKey in histList:
            hists[prefix + iKey] = histList[iKey]

    # save the efficiency
    plotDir = os.path.join(baseDir, 'plots', particle, probe, resonance, era,
                           effName, 'efficiency')
    os.makedirs(plotDir, exist_ok=True)

    effDir = os.path.join(baseDir, 'efficiencies', particle, probe, resonance,
                          era, effName)
    os.makedirs(effDir, exist_ok=True)
    effPath = os.path.join(effDir, extEffName)

    # JSON format
    with open('{}.json'.format(effPath), 'w') as f:
        f.write(json.dumps(output, indent=4, sort_keys=True))

    def draw_cms_label(canvas, iPosX):
        # CMS_lumi is configured through module attributes before the call
        CMS_lumi.cmsText = 'CMS'
        CMS_lumi.writeExtraText = True
        CMS_lumi.extraText = 'Preliminary'
        CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (lumi)
        CMS_lumi.CMS_lumi(canvas, 4, iPosX)

    # ROOT histogram format
    rootPath = '{}.root'.format(effPath)
    tfile = ROOT.TFile.Open(rootPath, 'recreate')
    if not tfile or tfile.IsZombie():
        raise OSError('Could not open {} for writing'.format(rootPath))
    try:
        for h in sorted(hists):
            hists[h].Write(h)

            # 2-D maps are additionally drawn as annotated colour plots
            if nVars == 2 and not skipPlots:
                cName = 'c' + h
                canvas = ROOT.TCanvas(cName, cName, 1000, 800)
                ROOT.gStyle.SetPaintTextFormat("5.3f")
                canvas.SetRightMargin(0.24)
                hists[h].Draw('colz text')
                plotPath = os.path.join(plotDir, h)
                canvas.Modified()
                canvas.Update()

                draw_cms_label(canvas, 0)

                canvas.Print('{}.png'.format(plotPath))
                canvas.Print('{}.pdf'.format(plotPath))
                canvas.Print('{}.root'.format(plotPath))
    finally:
        # close the file even when writing or plotting fails
        tfile.Close()

    if skipPlots:
        return

    # gets a graph projection of an ND histogram for a given axis
    # with axis index (ie x,y,z = 0,1,2) and other dimensions ind
    def get_graph(hist, axis, axis_ind, *ind):
        ind = list(ind)
        ni = axis.GetNbins()

        def bin_args(i):
            # splice the running bin number into the fixed other indices
            return ind[:axis_ind] + [i + 1] + ind[axis_ind:]

        xvals = [axis.GetBinCenter(i + 1) for i in range(ni)]
        xvals_errLow = [
            xvals[i] - axis.GetBinLowEdge(i + 1) for i in range(ni)
        ]
        xvals_errHigh = [
            axis.GetBinUpEdge(i + 1) - xvals[i] for i in range(ni)
        ]
        yvals = [hist.GetBinContent(*bin_args(i)) for i in range(ni)]
        yvals_err = [hist.GetBinError(*bin_args(i)) for i in range(ni)]
        graph = ROOT.TGraphAsymmErrors(
            ni,
            array('d', xvals),
            array('d', yvals),
            array('d', xvals_errLow),
            array('d', xvals_errHigh),
            array('d', yvals_err),
            array('d', yvals_err),
        )
        return graph

    # plot the efficiencies
    # some default colors for plots
    defaultColors = [
        ROOT.kBlack, ROOT.kBlue, ROOT.kRed, ROOT.kGreen + 2, ROOT.kMagenta + 1,
        ROOT.kOrange + 1, ROOT.kTeal - 1, ROOT.kRed - 3, ROOT.kCyan + 2
    ]

    def plot_1d_eff(savename,
                    graphs,
                    labels=None,
                    colors=None,
                    xlabel='',
                    ylabel='Efficiency',
                    xRange=None,
                    additional_text=None):
        # Overlay the graphs on one canvas and save it as
        # <savename>.png/.pdf/.root.  xRange (low, high) restricts the
        # x axis when given; additional_text lines go into a text box.
        if labels is None:
            labels = ['Data', 'Simulation']
        if colors is None:
            colors = defaultColors

        mg = ROOT.TMultiGraph()
        for gi, graph in enumerate(graphs):
            # cycle through the palette when there are more graphs than
            # colours instead of failing
            color = colors[gi % len(colors)]
            graph.SetLineColor(color)
            graph.SetMarkerColor(color)
            mg.Add(graph)

        canvas = ROOT.TCanvas(savename, savename, 800, 800)
        # the axes of a TMultiGraph only exist once it has been drawn
        mg.Draw('AP0')
        mg.GetXaxis().SetTitle(xlabel)
        if xRange:
            mg.GetXaxis().SetRangeUser(*xRange)
        mg.GetYaxis().SetTitle(ylabel)
        mg.GetYaxis().SetRangeUser(0.8, 1.10)
        legend = ROOT.TLegend(0.5, 0.70, 0.92, 0.92)
        legend.SetTextFont(42)
        legend.SetBorderSize(0)
        legend.SetFillColor(0)
        for graph, label in zip(graphs, labels):
            legend.AddEntry(graph, label, 'l')
        legend.SetHeader('{} / {}'.format(num, denom))
        legend.Draw()

        if additional_text:
            # the box grows downwards by 0.04 (NDC) per line of text
            nother = len(additional_text)
            dims = [0.18, 0.84 - nother * 0.04 - 0.02, 0.35, 0.84]
            text = ROOT.TPaveText(*dims + ['NB NDC'])
            text.SetTextFont(42)
            text.SetBorderSize(0)
            text.SetFillColor(0)
            text.SetTextAlign(11)
            text.SetTextSize(0.03)
            for rtext in additional_text:
                text.AddText(rtext)
            text.Draw()

        draw_cms_label(canvas, 11)

        canvas.Print('{}.png'.format(savename))
        canvas.Print('{}.pdf'.format(savename))
        canvas.Print('{}.root'.format(savename))

    # enumerate over the axis/variable to plot
    axes = [
        hists[extEffName].GetXaxis(), hists[extEffName].GetYaxis(),
        hists[extEffName].GetZaxis()
    ]
    for vi, variableLabel in enumerate(variableLabels):
        xlabel = get_variable_name_pretty(variableLabel)
        ylabel = 'Efficiency'
        # full extent of the plotted axis
        xRange = [
            axes[vi].GetBinLowEdge(1),
            axes[vi].GetBinUpEdge(axes[vi].GetNbins())
        ]

        # iterate over the other axis indices
        otherVariableLabels = [
            ovl for ovl in variableLabels if ovl != variableLabel
        ]
        otherVariableIndices = [
            ovi for ovi, ovl in enumerate(variableLabels)
            if ovl != variableLabel
        ]
        indices = [
            list(range(1, len(binning[vl]))) for vl in otherVariableLabels
        ]

        # if no indices, easier, just itself
        if not indices:
            graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                   axes[vi], vi)
            graph_mc = get_graph(hists[extEffName + '_efficiencyMC'], axes[vi],
                                 vi)
            plotName = '{}_vs_{}'.format(effName, variableLabel)
            plotPath = os.path.join(plotDir, plotName)
            plot_1d_eff(plotPath, [graph_data, graph_mc],
                        xlabel=xlabel,
                        ylabel=ylabel,
                        xRange=xRange)
            continue

        for index in itertools.product(*indices):
            # one text line per fixed variable: 'low < name < high'
            additional_text = []
            for novi, (ovi, ovl) in enumerate(
                    zip(otherVariableIndices, otherVariableLabels)):
                xlow = axes[ovi].GetBinLowEdge(index[novi])
                xhigh = axes[ovi].GetBinUpEdge(index[novi])
                additional_text.append('{} < {} < {}'.format(
                    xlow, get_variable_name_pretty(ovl), xhigh))
            otherVariableLabel = get_bin_name(otherVariableLabels, index)

            # data versus simulation
            graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                   axes[vi], vi, *index)
            graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                 axes[vi], vi, *index)
            plotName = '{}_{}_vs_{}'.format(effName, otherVariableLabel,
                                            variableLabel)
            plotPath = os.path.join(plotDir, plotName)
            plot_1d_eff(plotPath, [graph_data, graph_mc],
                        xlabel=xlabel,
                        ylabel=ylabel,
                        xRange=xRange,
                        additional_text=additional_text)

            # nominal versus each systematic, for data and for simulation
            for kind, tag in (('dataEff', 'efficiencyData'),
                              ('mcEff', 'efficiencyMC')):
                # fresh graphs: the ones above were already styled and
                # handed to another TMultiGraph
                graphs = [
                    get_graph(hists[extEffName + '_' + tag], axes[vi], vi,
                              *index)
                ]
                labels = ['Nominal']
                for iSyst in systNames[kind]:
                    graphs.append(
                        get_graph(hists[extEffName + '_' + tag + '_' + iSyst],
                                  axes[vi], vi, *index))
                    labels.append(iSyst)
                plotName = '{}_{}_vs_{}_{}_syst'.format(
                    effName,
                    otherVariableLabel,
                    variableLabel,
                    tag,
                )
                plotPath = os.path.join(plotDir, plotName)
                plot_1d_eff(plotPath,
                            graphs,
                            labels=labels,
                            xlabel=xlabel,
                            ylabel=ylabel,
                            xRange=xRange,
                            additional_text=additional_text)
Ejemplo n.º 2
0
def prepare(baseDir, particle, probe, resonance, era, config, num, denom,
            variableLabels):
    hists = {}

    effName = get_eff_name(num, denom)
    extEffName = get_extended_eff_name(num, denom, variableLabels)
    binning = config.binning()
    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    systList = config.get('systematics', {
        x: {
            'fitTypes': [],
            'shittTypes': []
        }
        for x in ['SF', 'dataEff', 'mcEff']
    })

    def get_variable_name_pretty(variableLabel):
        variables = config.variables()
        return variables.get(variableLabel, {}).get('pretty', variableLabel)

    # create output histograms
    nVars = len(variableLabels)
    if nVars == 1:
        THX = ROOT.TH1F
    elif nVars == 2:
        THX = ROOT.TH2F
    elif nVars == 3:
        THX = ROOT.TH3F
    else:
        raise NotImplementedError(
            'More than 3 dimensions are not supported for scale factors')

    hargs = [extEffName, extEffName]
    for variableLabel in variableLabels:
        hargs += [
            len(binning[variableLabel]) - 1,
            array('d', binning[variableLabel])
        ]
    hist = THX(*hargs)
    axes = [hist.GetXaxis(), hist.GetYaxis(), hist.GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):
        axes[vi].SetTitle(get_variable_name_pretty(variableLabel))
    if nVars == 1:
        hist.GetYaxis().SetTitle('Scalefactor')
    if nVars == 2:
        hist.SetOption('colz')
        hist.GetZaxis().SetTitle('Scalefactor')
    hist_stat = hist.Clone(extEffName + '_stat')
    hist_syst = hist.Clone(extEffName + '_syst')
    histList_syst = {'combined': hist.Clone(effName + '_combinedSyst')}

    hist_dataEff = hist.Clone(extEffName + '_efficiencyData')
    if nVars == 1:
        hist_dataEff.GetYaxis().SetTitle('Efficiency')
    if nVars == 2:
        hist_dataEff.GetZaxis().SetTitle('Efficiency')
    hist_dataEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyData_stat')
    hist_dataEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyData_syst')
    histList_dataEff_syst = {
        'combined':
        hist_dataEff.Clone(effName + '_efficiencyData_combinedSyst')
    }
    hist_mcEff = hist_dataEff.Clone(extEffName + '_efficiencyMC')
    hist_mcEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyMC_stat')
    hist_mcEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyMC_syst')
    histList_mcEff_syst = {
        'combined': hist_dataEff.Clone(effName + '_efficiencyMC_combinedSyst')
    }
    for iSyst in itertools.chain(systList['SF']['fitTypes'],
                                 systList['SF']['shiftTypes']):
        histList_syst.update({iSyst: hist.Clone(effName + '_' + iSyst)})
    for iSyst in itertools.chain(systList['dataEff']['fitTypes'],
                                 systList['dataEff']['shiftTypes']):
        histList_dataEff_syst.update(
            {iSyst: hist.Clone(effName + '_' + iSyst)})
    for iSyst in itertools.chain(systList['mcEff']['fitTypes'],
                                 systList['mcEff']['shiftTypes']):
        histList_mcEff_syst.update({iSyst: hist.Clone(effName + '_' + iSyst)})

    varName = get_variables_name(variableLabels)

    # iterate through the bin indices
    # this does nested for loops of the N-D binning (e.g. pt, eta)
    # binning starts at 1 (0 is underflow), same as ROOT
    indices = [
        list(range(1, len(binning[variableLabel])))
        for variableLabel in variableLabels
    ]
    output = {effName: {varName: {}}}
    for index in itertools.product(*indices):
        binName = get_full_name(num, denom, variableLabels, index)
        subVarKeys = [
            '{}:[{},{}]'.format(variableLabels[i],
                                binning[variableLabels[i]][ind - 1],
                                binning[variableLabels[i]][ind])
            for i, ind in enumerate(index)
        ]
        _out = output[effName][varName]

        # add binning definitions
        _out['binning'] = [{
            'variable': vl,
            'binning': binning[vl].tolist(),
        } for vl in variableLabels]

        for subVarKey in subVarKeys:
            if subVarKey not in _out:
                _out[subVarKey] = {}
            _out = _out[subVarKey]

        # the fitted distributions
        fitType = 'Nominal'
        dataFNameFit = os.path.join(baseDir, 'fits_data', particle, probe,
                                    resonance, era, fitType, effName,
                                    binName + '.root')
        sf, sf_stat, dataEff, dataStat, mcEff, mcStat = getSF(
            binName, dataFNameFit)
        sf_syst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                          systList['SF']['fitTypes'],
                          systList['SF']['shiftTypes'])
        dataSyst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                           systList['dataEff']['fitTypes'],
                           systList['dataEff']['shiftTypes'])
        mcSyst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                         systList['mcEff']['fitTypes'],
                         systList['mcEff']['shiftTypes'])
        sf_err = (sf_stat**2 + sf_syst['combined']**2)**0.5
        dataErr = (dataStat**2 + dataSyst['combined']**2)**0.5
        mcErr = (mcStat**2 + mcSyst['combined']**2)**0.5
        _out['value'] = sf
        _out['stat'] = sf_stat
        _out['syst'] = sf_syst['combined']
        for s in itertools.chain(systList['SF']['fitTypes'],
                                 systList['SF']['shiftTypes']):
            _out[s] = sf_syst[s]

        def set_bin(hist, index, val, err):
            index = list(index)
            val_args = index + [val]
            err_args = index + [err]
            hist.SetBinContent(*val_args)
            if err >= 0:
                hist.SetBinError(*err_args)

        set_bin(hist, index, sf, sf_err)
        set_bin(hist_stat, index, sf, sf_stat)
        set_bin(hist_syst, index, sf, sf_syst['combined'])
        for iKey in sf_syst.keys():
            set_bin(histList_syst[iKey], index, sf_syst[iKey], -1)

        set_bin(hist_dataEff, index, dataEff, dataErr)
        set_bin(hist_dataEff_stat, index, dataEff, dataStat)
        set_bin(hist_dataEff_syst, index, dataEff, dataSyst['combined'])
        for iKey in dataSyst.keys():
            set_bin(histList_dataEff_syst[iKey], index, dataSyst[iKey], -1)

        set_bin(hist_mcEff, index, mcEff, mcErr)
        set_bin(hist_mcEff_stat, index, mcEff, mcStat)
        set_bin(hist_mcEff_syst, index, mcEff, mcSyst['combined'])
        for iKey in mcSyst.keys():
            set_bin(histList_mcEff_syst[iKey], index, mcSyst[iKey], -1)

    hists[extEffName] = hist
    hists[extEffName + '_stat'] = hist_stat
    hists[extEffName + '_syst'] = hist_syst
    hists[extEffName + '_efficiencyData'] = hist_dataEff
    hists[extEffName + '_efficiencyData_stat'] = hist_dataEff_stat
    hists[extEffName + '_efficiencyData_syst'] = hist_dataEff_syst
    hists[extEffName + '_efficiencyMC'] = hist_mcEff
    hists[extEffName + '_efficiencyMC_stat'] = hist_mcEff_stat
    hists[extEffName + '_efficiencyMC_syst'] = hist_mcEff_syst
    for iKey in histList_syst.keys():
        hists[effName + '_' + iKey] = histList_syst[iKey]
    for iKey in histList_dataEff_syst.keys():
        hists[effName + '_efficiencyData_' +
              iKey] = histList_dataEff_syst[iKey]
    for iKey in histList_mcEff_syst.keys():
        hists[effName + '_efficiencyMC_' + iKey] = histList_mcEff_syst[iKey]

    # save the efficiency
    plotDir = os.path.join(baseDir, 'plots', particle, probe, resonance, era,
                           effName, 'efficiency')
    os.makedirs(plotDir, exist_ok=True)

    effDir = os.path.join(baseDir, 'efficiencies', particle, probe, resonance,
                          era, effName)
    os.makedirs(effDir, exist_ok=True)
    effPath = os.path.join(effDir, extEffName)

    # JSON format
    with open('{}.json'.format(effPath), 'w') as f:
        f.write(json.dumps(output, indent=4, sort_keys=True))

    # ROOT histogram format
    tfile = ROOT.TFile.Open('{}.root'.format(effPath), 'recreate')
    for h in sorted(hists):
        hists[h].Write(h)

        if nVars == 2:
            cName = 'c' + h
            canvas = ROOT.TCanvas(cName, cName, 1000, 800)
            ROOT.gStyle.SetPaintTextFormat("5.3f")
            canvas.SetRightMargin(0.24)
            hists[h].Draw('colz text')
            plotPath = os.path.join(plotDir, h)
            canvas.Modified()
            canvas.Update()
            canvas.Print('{}.png'.format(plotPath))
            canvas.Print('{}.pdf'.format(plotPath))

    tfile.Close()

    # gets a graph projection of an ND histogram for a given axis
    # with axis index (ie x,y,z = 0,1,2) and other dimensions ind
    def get_graph(hist, axis, axis_ind, *ind):
        ind = list(ind)
        ni = axis.GetNbins()
        xvals = [axis.GetBinCenter(i + 1) for i in range(ni)]
        xvals_errLow = [
            xvals[i] - axis.GetBinLowEdge(i + 1) for i in range(ni)
        ]
        xvals_errHigh = [
            axis.GetBinUpEdge(i + 1) - xvals[i] for i in range(ni)
        ]
        yvals = [
            hist.GetBinContent(*ind[:axis_ind] + [i + 1] + ind[axis_ind:])
            for i in range(ni)
        ]
        yvals_err = [
            hist.GetBinError(*ind[:axis_ind] + [i + 1] + ind[axis_ind:])
            for i in range(ni)
        ]
        graph = ROOT.TGraphAsymmErrors(
            ni,
            array('d', xvals),
            array('d', yvals),
            array('d', xvals_errLow),
            array('d', xvals_errHigh),
            array('d', yvals_err),
            array('d', yvals_err),
        )
        return graph

    # plot the efficiencies
    # enumerate over the axis/variable to plot
    axes = [
        hists[extEffName].GetXaxis(), hists[extEffName].GetYaxis(),
        hists[extEffName].GetZaxis()
    ]
    for vi, variableLabel in enumerate(variableLabels):

        # iterate over the other axis indices
        otherVariableLabels = [
            ovl for ovl in variableLabels if ovl != variableLabel
        ]
        otherVariableIndices = [
            ovi for ovi, ovl in enumerate(variableLabels)
            if ovl != variableLabel
        ]
        indices = [
            list(range(1, len(binning[vl]))) for vl in otherVariableLabels
        ]
        if indices:
            for index in itertools.product(*indices):
                graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                       axes[vi], vi, *index)
                graph_data.SetLineColor(ROOT.kBlack)
                graph_data.SetMarkerColor(ROOT.kBlack)
                graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                     axes[vi], vi, *index)
                graph_mc.SetLineColor(ROOT.kBlue)
                graph_mc.SetMarkerColor(ROOT.kBlue)
                mg = ROOT.TMultiGraph()
                mg.Add(graph_data)
                mg.Add(graph_mc)

                cName = 'c' + extEffName + '_'.join([str(i) for i in index])\
                    + variableLabel
                canvas = ROOT.TCanvas(cName, cName, 800, 800)
                mg.Draw('AP0')
                mg.GetXaxis().SetTitle(get_variable_name_pretty(variableLabel))
                xRange = [
                    axes[vi].GetBinLowEdge(1),
                    axes[vi].GetBinUpEdge(axes[vi].GetNbins())
                ]
                mg.GetXaxis().SetRangeUser(*xRange)
                mg.GetYaxis().SetTitle('Efficiency')
                mg.GetYaxis().SetRangeUser(0.8, 1.10)
                legend = ROOT.TLegend(0.5, 0.70, 0.92, 0.92)
                legend.SetTextFont(42)
                legend.SetBorderSize(0)
                legend.SetFillColor(0)
                legend.AddEntry(graph_data, 'Data', 'l')
                legend.AddEntry(graph_mc, 'Simulation', 'l')
                legend.SetHeader('{} / {}'.format(num, denom))
                legend.Draw()

                nother = len(indices)
                dims = [0.18, 0.84 - nother * 0.04 - 0.02, 0.35, 0.84]
                text = ROOT.TPaveText(*dims + ['NB NDC'])
                text.SetTextFont(42)
                text.SetBorderSize(0)
                text.SetFillColor(0)
                text.SetTextAlign(11)
                text.SetTextSize(0.03)
                for novi, (ovi, ovl) in enumerate(
                        zip(otherVariableIndices, otherVariableLabels)):
                    xlow = axes[ovi].GetBinLowEdge(index[novi])
                    xhigh = axes[ovi].GetBinUpEdge(index[novi])
                    rtext = '{} < {} < {}'.format(
                        xlow, get_variable_name_pretty(ovl), xhigh)
                    text.AddText(rtext)
                text.Draw()
                CMS_lumi.cmsText = 'CMS'
                CMS_lumi.writeExtraText = True
                CMS_lumi.extraText = 'Preliminary'
                CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (41.5)
                CMS_lumi.CMS_lumi(canvas, 4, 11)
                plotDir = os.path.join(baseDir, 'plots', particle, probe,
                                       resonance, era, effName, 'efficiency')
                os.makedirs(plotDir, exist_ok=True)
                otherVariableLabel = get_bin_name(otherVariableLabels, index)
                plotName = '{}_{}_vs_{}'.format(effName, otherVariableLabel,
                                                variableLabel)
                plotPath = os.path.join(plotDir, plotName)
                canvas.Print('{}.png'.format(plotPath))
                canvas.Print('{}.pdf'.format(plotPath))

        # if no indices, easier, just itself
        else:
            graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                   axes[vi], vi)
            graph_data.SetLineColor(ROOT.kBlack)
            graph_data.SetMarkerColor(ROOT.kBlack)
            graph_mc = get_graph(hists[extEffName + '_efficiencyMC'], axes[vi],
                                 vi)
            graph_mc.SetLineColor(ROOT.kBlue)
            graph_mc.SetMarkerColor(ROOT.kBlue)
            mg = ROOT.TMultiGraph()
            mg.Add(graph_data)
            mg.Add(graph_mc)

            canvas = ROOT.TCanvas('c' + extEffName, 'c', 800, 800)
            mg.Draw('AP0')
            mg.GetXaxis().SetTitle(get_variable_name_pretty(variableLabel))
            mg.GetYaxis().SetTitle('Efficiency')
            mg.GetYaxis().SetRangeUser(0.8, 1.10)
            legend = ROOT.TLegend(0.5, 0.70, 0.92, 0.92)
            legend.SetTextFont(42)
            legend.SetBorderSize(0)
            legend.SetFillColor(0)
            legend.AddEntry(graph_data, 'Data', 'l')
            legend.AddEntry(graph_mc, 'Simulation', 'l')
            legend.SetHeader('{} / {}'.format(num, denom))
            legend.Draw()

            CMS_lumi.cmsText = 'CMS'
            CMS_lumi.writeExtraText = True
            CMS_lumi.extraText = 'Preliminary'
            CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (41.5)
            CMS_lumi.CMS_lumi(canvas, 4, 11)
            plotDir = os.path.join(baseDir, 'plots', particle, probe,
                                   resonance, era, effName, 'efficiency')
            os.makedirs(plotDir, exist_ok=True)
            plotName = '{}_vs_{}'.format(effName, variableLabel)
            plotPath = os.path.join(plotDir, plotName)
            canvas.Print('{}.png'.format(plotPath))
            canvas.Print('{}.pdf'.format(plotPath))
Ejemplo n.º 3
0
def run_conversion(spark, particle, probe, resonance, era, subEra,
                   config, shift='Nominal', **kwargs):
    """Flatten the ntuples of one sub-era into pass/fail histograms.

    The input files are looked up in ``registry``, loaded through ``spark``,
    extended with the columns defined in ``config``, filtered, weighted and
    binned. For every (numerator, denominator, bin variables) combination
    one ROOT file is written that holds, per bin, a ``_Pass`` and a
    ``_Fail`` histogram of the fit variable (plus ``_Pass_Gen`` /
    ``_Fail_Gen`` for the ``DY_madgraph`` and ``DY_powheg`` sub-eras).

    Args:
        spark: object providing ``read.parquet`` / ``read.format``
            (presumably a SparkSession -- confirm against the caller).
        particle, probe, resonance, era, subEra: registry lookup keys; they
            also form the output directory.
        config: configuration object providing ``efficiencies``,
            ``definitions``, ``selection``, ``fitVariable``,
            ``fitVariableGen``, ``binVariables``, ``binning`` and
            ``variables``.
        shift: forwarded to ``get_weighted_dataframe``; when truthy it is
            also appended to the output directory.
        **kwargs: optional ``numerator`` / ``denominator`` (collections of
            labels to restrict the written efficiencies; empty means all)
            and ``baseDir`` (prefix of the output directory).

    Returns:
        None. The histograms are written to
        ``<baseDir>/flat/<particle>/<probe>/<resonance>/<era>/<subEra>/<shift>/``.
    """
    _numerator = kwargs.pop('numerator', [])
    _denominator = kwargs.pop('denominator', [])
    _baseDir = kwargs.pop('baseDir', '')

    testing = False
    print('Running conversion for', resonance, era, subEra, shift)

    if useParquet:
        fnames = list(registry.parquet(
            particle, probe, resonance, era, subEra))
    else:
        fnames = registry.root(particle, probe, resonance, era, subEra)
        fnames = ['root://eoscms.cern.ch/'+f for f in fnames]

    jobPath = os.path.join(particle, probe, resonance, era, subEra)
    if shift:
        jobPath = os.path.join(jobPath, shift)
    if testing:
        jobPath = os.path.join('testing', jobPath)
    else:
        jobPath = os.path.join('flat', jobPath)
    if _baseDir:
        jobPath = os.path.join(_baseDir, jobPath)
    os.makedirs(jobPath, exist_ok=True)

    doGen = subEra in ['DY_madgraph', 'DY_powheg']

    # default numerator/denominator definitions
    efficiencies = config.efficiencies()

    # get the dataframe
    if useParquet:
        print('Loading parquet files:', fnames)
        # fnames was materialized as a list above
        baseDF = spark.read.parquet(*fnames)
    else:
        treename = registry.treename(particle, probe, resonance, era, subEra)
        baseDF = spark.read.format("root")\
                      .option('tree', treename)\
                      .load(fnames)
    # create the miniIsolation columns
    miniIsoDF = get_miniIso_dataframe(baseDF)

    # create the definitions columns
    definitions = config.definitions()
    defDF = miniIsoDF

    for d in definitions:
        defDF = defDF.withColumn(d, F.expr(definitions[d]))

    # select tags
    tagsDF = defDF.filter(config.selection())

    # build the weights (pileup for MC)
    weightedDF = get_weighted_dataframe(
        tagsDF, doGen, resonance, era, subEra, shift=shift)

    # create the binning structure
    fitVariable = config.fitVariable()
    binningSet = set([fitVariable])
    if doGen:
        fitVariableGen = config.fitVariableGen()
        binningSet = binningSet.union(set([fitVariableGen]))
    binVariables = config.binVariables()
    for bvs in binVariables:
        binningSet = binningSet.union(set(bvs))

    binning = config.binning()
    variables = config.variables()
    binnedDF = weightedDF
    for bName in binningSet:
        binnedDF = get_binned_dataframe(
            binnedDF, bName+"Bin",
            variables[bName]['variable'],
            binning[bName])

    # build the unrealized yield dataframes
    # they are binned in the ID, bin variables, and fit variable
    yields = {}
    yields_gen = {}

    for numLabel, denLabel in efficiencies:
        den = binnedDF.filter(denLabel)
        for binVars in binVariables:
            key = (numLabel, denLabel, tuple(binVars))
            yields[key] = den.groupBy(
                numLabel, *[b+'Bin' for b in
                            binVars+[fitVariable]])\
                .agg({'weight2': 'sum', 'weight': 'sum'})
            if doGen:
                yields_gen[key] = den.groupBy(
                    numLabel, *[b+'Bin' for b in
                                binVars+[fitVariableGen]])\
                    .agg({'weight2': 'sum', 'weight': 'sum'})

    def get_values(df, mLabel, edges, binValues):
        """Return (sum of weights, sum of squared weights) per fit bin.

        ``df`` is restricted to the rows matching every column/value pair
        of ``binValues`` and then indexed by the fit-bin column ``mLabel``.
        ``edges`` are the bin edges of the fit variable being histogrammed;
        they set the array length so that it always matches the histogram
        built from the same edges.
        """
        for k, v in binValues.items():
            df = df[df[k] == v]
        df = df.set_index(mLabel)
        # fill empty bins with 0
        # includes underflow and overflow in the ROOT numbering scheme
        # (0 is underflow, len(edges) is overflow => len(edges)+1 slots)
        nSlots = len(edges) + 1
        values = pd.Series(np.zeros(nSlots))
        values[df.index] = df['sum(weight)']
        values = values.to_numpy()
        sumw2 = pd.Series(np.zeros(nSlots))
        if 'sum(weight2)' in df.columns:
            sumw2[df.index] = df['sum(weight2)']
        else:
            sumw2[df.index] = df['sum(weight)']  # no weights provided
        sumw2 = sumw2.to_numpy()
        return values, sumw2

    def get_hist(values, sumw2, edges, overflow=True):
        """Wrap bin contents and sum of squared weights into a TH1.

        With ``overflow`` the first and last entries of ``values`` are the
        underflow and overflow bins; otherwise ``values`` only covers the
        regular bins.
        """
        if overflow:
            hist = TH1.from_numpy((values[1:-1], edges))
            hist[0] = values[0]
            hist[-1] = values[-1]
            hist._fSumw2 = sumw2
        else:
            hist = TH1.from_numpy((values, edges))
            hist._fSumw2[1:-1] = sumw2
        return hist

    def fill_hists(hists, realized, num, den, binVars, fitVar, suffix=''):
        """Add the pass/fail histograms of every bin to ``hists``.

        ``fitVar`` names the fit variable whose bin column and edges are
        used; ``suffix`` is appended to the histogram names.
        """
        mLabel = fitVar + 'Bin'
        edges = binning[fitVar]
        for bins in itertools.product(
                *[range(1, len(binning[b])) for b in binVars]):
            binname = get_full_name(num, den, binVars, bins)
            binargs = {b+'Bin': v for b, v in zip(binVars, bins)}
            for passed, tag in ((True, '_Pass'), (False, '_Fail')):
                selection = {num: passed}
                selection.update(binargs)
                values, sumw2 = get_values(realized, mLabel, edges, selection)
                hists[binname + tag + suffix] = get_hist(values, sumw2, edges)

    # realize each of the yield tables
    # then produce the histograms and saves them
    # this is the first time things are put into memory
    for num_den_binVars in yields:
        num, den, binVars = num_den_binVars
        if _numerator and num not in _numerator:
            continue
        if _denominator and den not in _denominator:
            continue
        extended_eff_name = get_extended_eff_name(num, den, binVars)

        eff_outname = f'{jobPath}/{extended_eff_name}.root'
        hists = {}

        print('Processing', eff_outname)
        realized = yields[num_den_binVars].toPandas()
        fill_hists(hists, realized, num, den, binVars, fitVariable)

        if doGen:
            realized = yields_gen[num_den_binVars].toPandas()
            fill_hists(hists, realized, num, den, binVars, fitVariableGen,
                       suffix='_Gen')

        with uproot.recreate(eff_outname) as f:
            for h, hist in sorted(hists.items()):
                f[h] = hist
Ejemplo n.º 4
0
def build_fit_jobs(particle, probe, resonance, era, config, **kwargs):
    """Collect the argument tuples of all fit jobs for one resonance/era.

    Optional keyword arguments (an empty filter keeps everything):
      baseDir       -- directory prepended to every path
      numerator     -- numerators to keep
      denominator   -- denominators to keep
      fitType       -- fit shift names to keep
      shiftType     -- fit shift names to keep (a match in either the
                       fitType or the shiftType list is enough)
      sampleType    -- subset of 'data' / 'mc'; both are done when empty
      efficiencyBin -- full bin names to keep
      recover       -- with recoverMode 'simple', a job is only kept when
                       recover_simple(<output file>) is truthy
      recoverMode   -- 'simple' by default

    Returns a list of tuples
    (outFName, inFName, binName, templateFName, plotDir, fitType,
     'data' or 'mc', shiftType).
    """
    _baseDir = kwargs.pop('baseDir', '')
    _numerator = kwargs.pop('numerator', [])
    _denominator = kwargs.pop('denominator', [])
    _fitType = kwargs.pop('fitType', [])
    _shiftType = kwargs.pop('shiftType', [])
    _sampleType = kwargs.pop('sampleType', [])
    _efficiencyBin = kwargs.pop('efficiencyBin', [])
    _recover = kwargs.pop('recover', False)
    _recoverMode = kwargs.pop('recoverMode', 'simple')
    doData = ('data' in _sampleType) if _sampleType else True
    doMC = ('mc' in _sampleType) if _sampleType else True

    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    # there is no need to fit MC for templates
    # PDF based fits are:
    #   NominalOld, AltSigOld
    pdfFitTypes = ['NominalOld', 'AltSigOld']

    def needs_fit(outFName):
        # outside of simple recovery every job is (re)done
        if not _recover or _recoverMode != 'simple':
            return True
        return recover_simple(outFName)

    def shift_selected(fitShift):
        # no filter requested: keep every fit shift
        if not (_fitType or _shiftType):
            return True
        return ((_fitType and fitShift in _fitType) or
                (_shiftType and fitShift in _shiftType))

    def flat_file(subEra, inType, extEffName):
        # flattened histogram file of one sub-era
        return os.path.join(_baseDir, 'flat', particle, probe, resonance,
                            era, subEra, inType, extEffName + '.root')

    def jobs_for_bin(binName, effName, extEffName,
                     fitType, shiftType, inType, outType):
        # the template always comes from the MC sub-era
        templateFName = flat_file(mcSubEra, inType, extEffName)
        samples = (
            ('data', 'fits_data', dataSubEra, doData),
            ('mc', 'fits_mc', mcSubEra, doMC),
        )
        binJobs = []
        for sample, fitsDir, subEra, enabled in samples:
            outFName = os.path.join(_baseDir, fitsDir, particle, probe,
                                    resonance, era, outType, effName,
                                    binName + '.root')
            inFName = flat_file(subEra, inType, extEffName)
            plotDir = os.path.join(_baseDir, 'plots', particle, probe,
                                   resonance, era, fitsDir, outType,
                                   effName)
            if not (enabled and needs_fit(outFName)):
                continue
            if sample == 'mc' and fitType not in pdfFitTypes:
                continue
            binJobs.append((outFName, inFName, binName, templateFName,
                            plotDir, fitType, sample, shiftType))
        return binJobs

    jobs = []
    # iterate through the efficiencies
    efficiencies = config.efficiencies()
    binning = config.binning()
    for num, denom in efficiencies:
        numRejected = _numerator and num not in _numerator
        if numRejected or (_denominator and denom not in _denominator):
            continue

        # iterate through the output binning structure
        for variableLabels in config.binVariables():
            # bin numbers go from 1 to N for each variable; the product
            # walks the full N-D grid (e.g. pt, eta)
            binRanges = [
                range(1, len(binning[label])) for label in variableLabels
            ]
            for combo in itertools.product(*binRanges):
                index = list(combo)
                binName = get_full_name(num, denom, variableLabels, index)
                extEffName = get_extended_eff_name(num, denom, variableLabels)
                effName = get_eff_name(num, denom)
                if _efficiencyBin and binName not in _efficiencyBin:
                    continue

                for fitShift in config.fitShifts():
                    if not shift_selected(fitShift):
                        continue
                    params = config.fitShift(fitShift)
                    jobs.extend(jobs_for_bin(
                        binName, effName, extEffName,
                        params['fitType'], params['shiftType'],
                        params['inType'], fitShift))

    return jobs