def make_plot(acc,
              region,
              distribution,
              year,
              data,
              mc,
              signal=None,
              outdir='./output/stack/',
              integrate=None,
              ylim=None,
              xlim=None,
              rylim=None,
              tag=None,
              output_format='pdf',
              ratio=True):
    """Creates a data vs MC comparison plot and saves it to disk.

    :param acc: Accumulator (processor output), indexed by distribution name
    :type acc: coffea.processor.accumulator
    :param region: Region to select on the histogram's 'region' axis
    :param distribution: Name of the histogram in ``acc`` to plot
    :param year: Year; used for the luminosity label and the file name
    :param data: Dataset selector for data, or a falsy value to skip data
    :param mc: Dataset selector (e.g. compiled regex) for stacked background
    :param signal: Optional dataset selector for signal, overlaid unstacked
    :param outdir: Output directory; created if it does not exist
    :param integrate: Optional ``(axis, low, high)`` tuple; the histogram is
        integrated over that axis range before plotting
    :param ylim: ``"auto"`` to derive limits from the plotted values, or a
        ``(low, high)`` pair. Falls back to the style settings, then to
        ``(1e-1, 1e6)``
    :param xlim: Optional ``(low, high)`` pair; falls back to style settings
    :param rylim: Optional ``(low, high)`` pair for the ratio pad
    :param tag: Optional string inserted into the output file name
    :param output_format: Comma-separated list of file extensions to save
    :param ratio: If True, add a data / MC ratio pad below the main pad
    """
    # Rebin
    s = Style()
    h = copy.deepcopy(acc[distribution])
    assert (h)
    try:
        newax = s.get_binning(distribution, region)
        h = h.rebin(h.axis(newax.name), newax)
    except KeyError:
        # No custom binning available: keep the histogram as is
        pass

    # Integrate over an extra axis
    inte_tag = ""
    if integrate:
        (inte_axis, inte_low, inte_high) = integrate
        # can add an overflow option here
        h = h.integrate(inte_axis, slice(inte_low, inte_high))
        inte_tag += "_" + inte_axis + "_" + str(inte_low) + "_" + str(
            inte_high)

    # Pick the region we want to look at
    # E.g. cr_2m_j = Di-Muon control region with monojet selection
    h = h.integrate(h.axis('region'), region)

    # Plotting
    # Add ratio plot at the bottom if specified (default)
    # Otherwise just plot the histogram
    if ratio:
        fig, (ax, rax) = plt.subplots(2,
                                      1,
                                      figsize=(7, 7),
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)
    else:
        fig, ax = plt.subplots(1, 1, figsize=(7, 5))
        rax = None

    data_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'k',
        'elinewidth': 1,
    }
    signal_err_opts = {
        'linestyle': '-',
        'color': 'crimson',
        'elinewidth': 1,
    }

    # Plot single muon data
    # Note the syntax we use to pick the data set
    if data:
        hist.plot1d(h[data],
                    overlay='dataset',
                    error_opts=data_err_opts,
                    ax=ax,
                    overflow='all',
                    binwnorm=1)

    if signal:
        hist.plot1d(h[signal],
                    overlay='dataset',
                    error_opts=signal_err_opts,
                    ax=ax,
                    overflow='all',
                    binwnorm=1,
                    clear=False)

    # Plot MC background samples
    # Here we use a regular expression to match
    # data sets we want
    hist.plot1d(h[mc],
                overlay='dataset',
                stack=True,
                clear=False,
                overflow='all',
                ax=ax,
                binwnorm=1)

    # Pick the proper legend labels for the channel. This is done once,
    # on a copy, so the module-level label tables are never modified and
    # an unrecognised channel simply gets the common labels only.
    channel = channel_name(region)
    if channel == 'VBF':
        channel_labels = legend_labels['VBF']
    elif channel in ['Monojet', 'Mono-V']:
        channel_labels = legend_labels['Monojet/Mono-V']
    else:
        channel_labels = {}
    legend_labels_to_use = dict(channel_labels)
    # Add in the common labels
    legend_labels_to_use.update(legend_labels['Common'])

    # Apply correct colors to BG histograms
    handles, labels = ax.get_legend_handles_labels()
    new_labels = []
    for handle, label in zip(handles, labels):
        col = None
        for k, v in colors.items():
            if re.match(k, label):
                col = v
                break
        if col:
            handle.set_color(col)
            handle.set_linestyle('-')
            handle.set_edgecolor('k')

        # The last matching pattern determines the displayed label
        pretty_label = None
        for k, v in legend_labels_to_use.items():
            if re.match(k, label):
                pretty_label = v
        new_labels.append(pretty_label if pretty_label else label)

    # Update legend
    try:
        region_name = s.region_names[region]
    except KeyError:
        region_name = region
    ax.legend(title=region_name, ncol=2, handles=handles, labels=new_labels)

    # Ratio plot (the ratio pad only exists if it was requested)
    if data and ratio:
        hist.plotratio(h[data].integrate('dataset'),
                       h[mc].integrate('dataset'),
                       ax=rax,
                       denom_fill_opts={},
                       guide_opts={},
                       unc='num',
                       overflow='all',
                       error_opts=data_err_opts)

    ax.text(1.,
            0.,
            distribution,
            fontsize=10,
            horizontalalignment='right',
            verticalalignment='bottom',
            transform=ax.transAxes)
    fig.text(0.,
             1.,
             '$\\bf{CMS}$ internal',
             fontsize=14,
             horizontalalignment='left',
             verticalalignment='bottom',
             transform=ax.transAxes)
    fig.text(1.,
             1.,
             f'{channel_name(region)}, {lumi(year):.1f} fb$^{{-1}}$ ({year})',
             fontsize=14,
             horizontalalignment='right',
             verticalalignment='bottom',
             transform=ax.transAxes)

    # Aesthetics
    ax.set_yscale("log")
    ax.set_ylabel('Events / GeV')

    # Narrow the style settings down to this region / distribution,
    # as far as entries exist
    axis_settings = style.plot_settings()
    if region in axis_settings.keys():
        axis_settings = axis_settings[region]
    if distribution in axis_settings.keys():
        axis_settings = axis_settings[distribution]

    if ylim:
        if ylim == "auto":
            plot_axis = [x for x in h.axes() if "dataset" not in x.name][0]
            width = np.diff(plot_axis.edges())
            vmc = h[mc].integrate("dataset").values()[()] / width

            # Deliberate best effort: if the data values cannot be
            # retrieved, the limits are derived from MC alone.
            vdata = vmc
            if data:
                try:
                    vdata = h[data].integrate("dataset").values()[()] / width
                except Exception:
                    vdata = vmc

            if signal:
                vsig = h[signal].integrate("dataset").values()[()] / width
            else:
                vsig = vmc

            plotted = np.concatenate(
                [np.ravel(vmc), np.ravel(vdata), np.ravel(vsig)])
            positive = plotted[plotted > 0]
            if positive.size:
                # Leave head room above the largest and some space
                # below the smallest positive bin content
                ax.set_ylim(0.5 * np.min(positive), 1e2 * np.max(positive))
            else:
                # Nothing positive to derive limits from: use the defaults
                ax.set_ylim(1e-1, 1e6)
        else:
            ax.set_ylim(ylim[0], ylim[1])
    elif 'ylim' in axis_settings.keys():
        ax.set_ylim(axis_settings['ylim'])
    else:
        ax.set_ylim(1e-1, 1e6)

    if xlim:
        ax.set_xlim(xlim[0], xlim[1])
    elif 'xlim' in axis_settings.keys():
        ax.set_xlim(axis_settings['xlim'])

    if ratio:
        if rylim:
            rax.set_ylim(*rylim)
        else:
            rax.set_ylim(0.5, 1.5)
        loc1 = matplotlib.ticker.MultipleLocator(base=0.2)
        loc2 = matplotlib.ticker.MultipleLocator(base=0.1)
        rax.yaxis.set_major_locator(loc1)
        rax.yaxis.set_minor_locator(loc2)
        rax.grid(axis='y', which='minor', linestyle='--')
        rax.grid(axis='y', which='major', linestyle='--')
        rax.set_ylabel('Data / MC')

    os.makedirs(outdir, exist_ok=True)

    # One output file per requested format
    for form in output_format.split(','):
        outpath = pjoin(
            outdir,
            f"{region}_{distribution}{inte_tag}_{tag + '_' if tag else ''}{year}.{form}"
        )
        fig.savefig(outpath)
        print(f"Saved plot file in {outpath}")
    plt.close('all')
def plot(inpath):
    """Make CR ratio plots and LO / NLO data vs MC plots for all VBF regions.

    For each year (2017, 2018) and each region, every distribution listed
    in the plot settings is loaded, merged and scaled once, and then
    plotted twice via ``make_plot``: once with the LO and once with the NLO
    MC selection. Output goes to ``./output/<basename of inpath>/``.

    :param inpath: Directory holding the processor output (coffea files)
    """
    indir = os.path.abspath(inpath)

    # The processor output is stored in an
    # 'accumulator', which in our case is
    # just a dictionary holding all the histograms
    # Put all your *coffea files into 'indir' and
    # pass the directory as an argument here.
    # All input files in the directory will
    # automatically be found, merged and read.
    # The merging only happens the first time
    # you run over a specific set of inputs.
    acc = dir_archive(inpath, serialized=True, compression=0, memsize=1e3)

    # Get a settings dictionary that details
    # which plots to make for each region,
    # what the axis limits are, etc
    # Can add plots by extending the dictionary
    # Or modify axes ranges, etc
    settings = plot_settings()

    # Distributions that were already merged and scaled
    merged = set()

    # Separate plots per year
    for year in [2017, 2018]:
        # The data to be used for each region
        # Muon regions use MET,
        # electron+photon regions use EGamma
        # ( EGamma = SingleElectron+SinglePhoton for 2017)
        data = {
            'sr_vbf': None,
            'cr_1m_vbf': f'MET_{year}',
            'cr_2m_vbf': f'MET_{year}',
            'cr_1e_vbf': f'EGamma_{year}',
            'cr_2e_vbf': f'EGamma_{year}',
            'cr_g_vbf': f'EGamma_{year}',
        }

        # Same for MC selection
        # Match datasets by regular expressions
        # Here for LO V samples (HT binned)
        mc_lo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1m_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1e_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_2m_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
            ),
            'cr_2e_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
            ),
            'cr_g_vbf':
            re.compile(f'(GJets_(HT|SM).*|QCD_HT.*|WJetsToLNu.*HT.*).*{year}'),
        }

        # Want to compare LO and NLO,
        # so do same thing for NLO V samples
        # All non-V samples remain the same
        mc_nlo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*FXFX.*).*{year}'
            ),
            'cr_1m_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*|.*WJetsToLNu.*FXFX.*).*{year}'
            ),
            'cr_1e_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*|.*WJetsToLNu.*FXFX.*).*{year}'
            ),
            'cr_2m_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*).*{year}'
            ),
            'cr_2e_vbf':
            re.compile(
                f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*).*{year}'
            ),
            'cr_g_vbf':
            re.compile(f'(GJets_(HT|SM).*|QCD_HT.*|W.*FXFX.*).*{year}'),
        }

        regions = list(mc_lo.keys())
        # Remove signal region, no need in ratio plots
        regions.remove('sr_vbf')

        # Make control region ratio plots for both
        # LO and NLO. Can be skipped if you only
        # want data / MC agreement plots.
        outdir = f'./output/{os.path.basename(indir)}/ratios'

        # Load ingredients from cache
        acc.load('mjj')
        acc.load('sumw')
        acc.load('sumw_pileup')
        acc.load('nevents')
        cr_ratio_plot(acc,
                      year=year,
                      tag='losf',
                      outdir=outdir,
                      mc=mc_lo,
                      regions=regions,
                      distribution='mjj')
        cr_ratio_plot(acc,
                      year=year,
                      tag='nlo',
                      outdir=outdir,
                      mc=mc_nlo,
                      regions=regions,
                      distribution='mjj')

        # Data / MC plots are made here
        # Loop over all regions
        for region in mc_lo.keys():
            # No ratio pad in the signal region (no data is plotted there)
            ratio = region != 'sr_vbf'

            # Make separate output directory for each region
            outdir = f'./output/{os.path.basename(indir)}/{region}'

            # Settings for this region
            plotset = settings[region]

            # Loop over the distributions
            for distribution in plotset.keys():
                # Load from cache
                if distribution not in merged:
                    acc.load(distribution)

                    if distribution not in acc.keys():
                        print(
                            f"WARNING: Distribution {distribution} not found in input files."
                        )
                        continue
                    acc[distribution] = merge_extensions(
                        acc[distribution],
                        acc,
                        reweight_pu=not ('nopu' in distribution))
                    scale_xs_lumi(acc[distribution])
                    acc[distribution] = merge_datasets(acc[distribution])
                    acc[distribution].axis('dataset').sorting = 'integral'
                    merged.add(distribution)

                # The heavy lifting of making a plot is hidden
                # in make_plot. We call it once using the LO MC
                # and once using the NLO MC. The output files will
                # be named according to the 'tag' argument, so we
                # will be able to tell them apart.
                for tag, mc_by_region in (('losf', mc_lo), ('nlo', mc_nlo)):
                    try:
                        make_plot(acc,
                                  region=region,
                                  distribution=distribution,
                                  year=year,
                                  data=data[region],
                                  mc=mc_by_region[region],
                                  ylim=plotset[distribution].get('ylim', None),
                                  xlim=plotset[distribution].get('xlim', None),
                                  tag=tag,
                                  outdir=outdir,
                                  output_format='pdf',
                                  ratio=ratio)
                    except KeyError as exc:
                        # Best effort: a missing ingredient only costs
                        # this one plot, but say so instead of hiding it
                        print(
                            f"WARNING: Skipping '{tag}' plot of {distribution} in {region} ({year}): missing key {exc}"
                        )
                        continue
def plot(args):
    """Make data vs MC plots for the selected VBF regions and distributions.

    For each year (2017, 2018), every region whose name matches
    ``args.region`` and every distribution matching ``args.distribution``
    is loaded, merged and scaled once, and then plotted via ``make_plot``.
    Output goes to ``./output/<basename of args.inpath>/<region>``.

    :param args: Object providing the attributes ``inpath`` (input
        directory), ``region`` and ``distribution`` (regular expressions
        applied with ``re.match``)
    """
    indir = os.path.abspath(args.inpath)

    # The processor output is stored in an
    # 'accumulator', which in our case is
    # just a dictionary holding all the histograms
    # Put all your *coffea files into 'indir' and
    # pass the directory as an argument here.
    # All input files in the directory will
    # automatically be found, merged and read.
    # The merging only happens the first time
    # you run over a specific set of inputs.
    acc = dir_archive(args.inpath,
                      serialized=True,
                      compression=0,
                      memsize=1e3)

    # Get a settings dictionary that details
    # which plots to make for each region,
    # what the axis limits are, etc
    # Can add plots by extending the dictionary
    # Or modify axes ranges, etc
    settings = plot_settings()

    # Distributions that were already merged and scaled
    merged = set()

    # Separate plots per year
    for year in [2017, 2018]:
        # The data to be used for each region
        # Muon regions use MET,
        # electron+photon regions use EGamma
        # ( EGamma = SingleElectron+SinglePhoton for 2017)
        data = {
            'sr_vbf': f'MET_{year}',
            'cr_1m_vbf': f'MET_{year}',
            'cr_2m_vbf': f'MET_{year}',
            'cr_1e_vbf': f'EGamma_{year}',
            'cr_2e_vbf': f'EGamma_{year}',
            'cr_g_vbf': f'EGamma_{year}',
        }

        # Same for MC selection
        # Match datasets by regular expressions
        # Here for LO V samples (HT binned)
        mc_lo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1m_vbf':
            re.compile(
                f'(EWKW.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1e_vbf':
            re.compile(
                f'(EWKW.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_2m_vbf':
            re.compile(
                f'(EWKZ.*ZToLL.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
            ),
            'cr_2e_vbf':
            re.compile(
                f'(EWKZ.*ZToLL.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
            ),
            'cr_g_vbf':
            re.compile(
                f'(GJets_(DR-0p4|SM).*|QCD_data.*|WJetsToLNu.*HT.*).*{year}'),
        }

        # Load ingredients from cache
        acc.load('sumw')
        acc.load('sumw_pileup')
        acc.load('nevents')

        # Data / MC plots are made here
        # Loop over all regions
        for region in mc_lo.keys():
            if not re.match(args.region, region):
                continue

            # Plot ratio pads for all regions (now that we're unblinded)
            ratio = True

            # Make separate output directory for each region
            outdir = f'./output/{os.path.basename(indir)}/{region}'

            # Settings for this region
            plotset = settings[region]

            # Loop over the distributions
            for distribution in plotset.keys():
                if not re.match(args.distribution, distribution):
                    continue

                # Load from cache
                if distribution not in merged:
                    acc.load(distribution)

                    if distribution not in acc.keys():
                        print(
                            f"WARNING: Distribution {distribution} not found in input files."
                        )
                        continue
                    acc[distribution] = merge_extensions(
                        acc[distribution],
                        acc,
                        reweight_pu=not ('nopu' in distribution))
                    scale_xs_lumi(acc[distribution])
                    acc[distribution] = merge_datasets(acc[distribution])
                    acc[distribution].axis('dataset').sorting = 'integral'
                    merged.add(distribution)

                try:
                    # The heavy lifting of making a plot is hidden
                    # in make_plot. We call it once using the LO MC
                    imc = mc_lo[region]

                    # In the photon CR, the 'QCD_data' entry of the
                    # selection is swapped for 'QCD.*HT' for every
                    # distribution except recoil
                    if "cr_g" in region and distribution != "recoil":
                        imc = re.compile(
                            imc.pattern.replace('QCD_data', 'QCD.*HT'))

                    make_plot(acc,
                              region=region,
                              distribution=distribution,
                              year=year,
                              data=data[region],
                              mc=imc,
                              ylim=plotset[distribution].get('ylim', None),
                              xlim=plotset[distribution].get('xlim', None),
                              tag='losf',
                              outdir=outdir,
                              output_format='pdf',
                              ratio=ratio)
                except KeyError as exc:
                    # Best effort: a missing ingredient only costs
                    # this one plot, but say so instead of hiding it
                    print(
                        f"WARNING: Skipping plot of {distribution} in {region} ({year}): missing key {exc}"
                    )
                    continue
def make_plot(acc,
              region,
              distribution,
              year,
              data,
              mc,
              signal=None,
              outdir='./output/stack/',
              integrate=None,
              ylim=None,
              xlim=None,
              rylim=None,
              tag=None,
              output_format='pdf',
              ratio=True):
    """Creates a data vs MC comparison plot and saves it to disk.

    :param acc: Accumulator (processor output), indexed by distribution name
    :type acc: coffea.processor.accumulator
    :param region: Region to select on the histogram's 'region' axis
    :param distribution: Name of the histogram in ``acc`` to plot
    :param year: Year; used for the luminosity label and the file name
    :param data: Dataset selector for data, or a falsy value to skip data
    :param mc: Dataset selector (e.g. compiled regex) for stacked background
    :param signal: Optional dataset selector for signal, overlaid unstacked
    :param outdir: Output directory; created if it does not exist
    :param integrate: Optional ``(axis, low, high)`` tuple; the histogram is
        integrated over that axis range before plotting
    :param ylim: Optional ``(low, high)`` pair. Falls back to the style
        settings, then to ``(1e-1, 1e6)``
    :param xlim: Optional ``(low, high)`` pair; falls back to style settings
    :param rylim: Optional ``(low, high)`` pair for the ratio pad
    :param tag: Optional string inserted into the output file name
    :param output_format: File extension to save, or a comma-separated
        list of extensions
    :param ratio: If True, add a data / MC ratio pad below the main pad
    """
    # Rebin
    s = Style()
    h = copy.deepcopy(acc[distribution])
    assert (h)
    try:
        newax = s.get_binning(distribution, region)
        h = h.rebin(h.axis(newax.name), newax)
    except KeyError:
        # No custom binning available: keep the histogram as is
        pass

    # Integrate over an extra axis
    inte_tag = ""
    if integrate:
        (inte_axis, inte_low, inte_high) = integrate
        # can add an overflow option here
        h = h.integrate(inte_axis, slice(inte_low, inte_high))
        inte_tag += "_" + inte_axis + "_" + str(inte_low) + "_" + str(
            inte_high)

    # Pick the region we want to look at
    # E.g. cr_2m_j = Di-Muon control region with monojet selection
    h = h.integrate(h.axis('region'), region)

    # Plotting
    # Add ratio plot at the bottom if specified (default)
    # Otherwise just plot the histogram
    if ratio:
        fig, (ax, rax) = plt.subplots(2,
                                      1,
                                      figsize=(7, 7),
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)
    else:
        fig, ax = plt.subplots(1, 1, figsize=(7, 5))
        rax = None

    data_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'k',
        'elinewidth': 1,
        'emarker': '_'
    }
    signal_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'r',
        'elinewidth': 1,
        'emarker': '_'
    }

    # Plot single muon data
    # Note the syntax we use to pick the data set
    if data:
        fig, ax, _ = hist.plot1d(h[data],
                                 overlay='dataset',
                                 error_opts=data_err_opts,
                                 ax=ax,
                                 overflow='all',
                                 binwnorm=True)

    if signal:
        # clear=False so the data points drawn above are not wiped
        fig, ax, _ = hist.plot1d(h[signal],
                                 overlay='dataset',
                                 error_opts=signal_err_opts,
                                 ax=ax,
                                 overflow='all',
                                 binwnorm=True,
                                 clear=False)

    # Plot MC background samples
    # Here we use a regular expression to match
    # data sets we want
    _, _, primitives = hist.plot1d(h[mc],
                                   overlay='dataset',
                                   stack=True,
                                   clear=False,
                                   overflow='all',
                                   ax=ax,
                                   binwnorm=True)

    # Apply correct colors to BG histograms: the first color pattern
    # matching the dataset name wins
    for name, ps in primitives.items():
        name = str(name)
        col = None
        for k, v in colors.items():
            if re.match(k, name):
                col = v
                break
        for item in ps:
            if col:
                item.set_facecolor(col)
            item.set_linestyle('-')
            item.set_edgecolor('k')

    # Legend
    try:
        region_name = s.region_names[region]
    except KeyError:
        region_name = region
    ax.legend(title=region_name, ncol=1)

    # Ratio plot (the ratio pad only exists if it was requested)
    if data and ratio:
        hist.plotratio(h[data].integrate('dataset'),
                       h[mc].integrate('dataset'),
                       ax=rax,
                       denom_fill_opts={},
                       guide_opts={},
                       unc='num',
                       overflow='all',
                       error_opts=data_err_opts)

    ax.text(1.,
            0.,
            distribution,
            fontsize=10,
            horizontalalignment='right',
            verticalalignment='bottom',
            transform=ax.transAxes)
    fig.text(1.,
             1.,
             f'{lumi(year)} fb$^{{-1}}$ ({year})',
             fontsize=14,
             horizontalalignment='right',
             verticalalignment='bottom',
             transform=ax.transAxes)
    fig.text(0.,
             1.,
             '$\\bf{CMS}$ internal',
             fontsize=14,
             horizontalalignment='left',
             verticalalignment='bottom',
             transform=ax.transAxes)

    # Aesthetics
    ax.set_yscale("log")
    ax.set_ylabel('Events / Bin width')

    # Narrow the style settings down to this region / distribution,
    # as far as entries exist
    axis_settings = style.plot_settings()
    if region in axis_settings.keys():
        axis_settings = axis_settings[region]
    if distribution in axis_settings.keys():
        axis_settings = axis_settings[distribution]

    if ylim:
        ax.set_ylim(ylim[0], ylim[1])
    elif 'ylim' in axis_settings.keys():
        ax.set_ylim(axis_settings['ylim'])
    else:
        ax.set_ylim(1e-1, 1e6)

    if xlim:
        ax.set_xlim(xlim[0], xlim[1])
    elif 'xlim' in axis_settings.keys():
        ax.set_xlim(axis_settings['xlim'])

    if ratio:
        if rylim:
            rax.set_ylim(*rylim)
        else:
            rax.set_ylim(0.75, 1.25)
        loc1 = matplotlib.ticker.MultipleLocator(base=0.2)
        loc2 = matplotlib.ticker.MultipleLocator(base=0.1)
        rax.yaxis.set_major_locator(loc1)
        rax.yaxis.set_minor_locator(loc2)
        rax.grid(axis='y', which='minor', linestyle='--')
        rax.grid(axis='y', which='major', linestyle='--')
        rax.set_ylabel('Data / MC')

    os.makedirs(outdir, exist_ok=True)

    # One output file per requested format; a plain 'pdf' yields one file
    for form in output_format.split(','):
        outpath = pjoin(
            outdir,
            f"{region}_{distribution}{inte_tag}_{tag + '_' if tag else ''}{year}.{form}"
        )
        fig.savefig(outpath)
        print(f"Saved plot file in {outpath}")
    plt.close('all')