Ejemplo n.º 1
0
def make_plot(acc,
              region,
              distribution,
              year,
              data,
              mc,
              signal=None,
              outdir='./output/stack/',
              integrate=None,
              ylim=None,
              xlim=None,
              rylim=None,
              tag=None,
              output_format='pdf',
              ratio=True):
    """Create a data vs MC comparison plot and save it to disk.

    :param acc: Accumulator (processor output), indexed by ``distribution``
    :type acc: coffea.processor.accumulator
    :param region: Value selected on the histogram's 'region' axis
    :param distribution: Key of the histogram in ``acc`` to plot
    :param year: Year used for the luminosity label and the file name
    :param data: Selector for the data dataset; if falsy, neither the data
                 points nor the ratio are drawn
    :param mc: Selector (e.g. compiled regex) for the stacked MC datasets
    :param signal: Optional selector for datasets overlaid as signal
    :param outdir: Output directory; created if it does not exist
    :param integrate: Optional ``(axis, low, high)`` tuple; the histogram is
                      integrated over ``slice(low, high)`` of that axis
    :param ylim: ``(low, high)`` for the main pad, or ``"auto"`` to derive
                 the range from the plotted values. If not given, the
                 'ylim' entry of the plot settings or ``(1e-1, 1e6)`` is used
    :param xlim: ``(low, high)`` for the x axis. If not given, the 'xlim'
                 entry of the plot settings is used when present
    :param rylim: ``(low, high)`` for the ratio pad; default ``(0.5, 1.5)``
    :param tag: Optional string inserted into the output file name
    :param output_format: Comma-separated list of file extensions to save
    :param ratio: If True, add a ratio pad below the main pad
    """
    # Rebin
    s = Style()
    h = copy.deepcopy(acc[distribution])
    assert (h)
    try:
        newax = s.get_binning(distribution, region)
        h = h.rebin(h.axis(newax.name), newax)
    except KeyError:
        # A KeyError here means we keep the binning of the input histogram
        pass

    # Integrate over an extra axis
    inte_tag = ""
    if integrate:
        (inte_axis, inte_low, inte_high) = integrate
        h = h.integrate(inte_axis,
                        slice(inte_low,
                              inte_high))  #can add an overflow option here
        inte_tag += f"_{inte_axis}_{inte_low}_{inte_high}"

    # Pick the region we want to look at
    # E.g. cr_2m_j = Di-Muon control region with monojet selection
    h = h.integrate(h.axis('region'), region)

    # Plotting
    # Add ratio plot at the bottom if specified (default)
    # Otherwise just plot the histogram
    if ratio:
        fig, (ax, rax) = plt.subplots(2,
                                      1,
                                      figsize=(7, 7),
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)

    else:
        fig, ax = plt.subplots(1, 1, figsize=(7, 5))

    data_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'k',
        'elinewidth': 1,
    }
    signal_err_opts = {
        'linestyle': '-',
        'color': 'crimson',
        'elinewidth': 1,
    }

    # Plot data
    # Note the syntax we use to pick the data set
    if data:
        hist.plot1d(h[data],
                    overlay='dataset',
                    error_opts=data_err_opts,
                    ax=ax,
                    overflow='all',
                    binwnorm=1)

    if signal:
        hist.plot1d(h[signal],
                    overlay='dataset',
                    error_opts=signal_err_opts,
                    ax=ax,
                    overflow='all',
                    binwnorm=1,
                    clear=False)

    # Plot MC background samples
    # Here we use a regular expression to match
    # data sets we want
    hist.plot1d(h[mc],
                overlay='dataset',
                stack=True,
                clear=False,
                overflow='all',
                ax=ax,
                binwnorm=1)

    # Pick the proper legend labels for the channel.
    # This does not depend on the individual legend entry, so do it once.
    # Work on a copy so that the module-level legend_labels is not modified.
    channel = channel_name(region)
    if channel == 'VBF':
        legend_labels_to_use = dict(legend_labels['VBF'])
    elif channel in ['Monojet', 'Mono-V']:
        legend_labels_to_use = dict(legend_labels['Monojet/Mono-V'])
    else:
        # Unknown channel: only the common labels apply
        legend_labels_to_use = {}

    # Add in the common labels
    legend_labels_to_use.update(legend_labels['Common'])

    # Apply correct colors to BG histograms
    handles, labels = ax.get_legend_handles_labels()
    new_labels = []
    for handle, label in zip(handles, labels):
        # First matching color pattern wins
        col = None
        for k, v in colors.items():
            if re.match(k, label):
                col = v
                break
        if col:
            handle.set_color(col)
            handle.set_linestyle('-')
            handle.set_edgecolor('k')

        # Last matching label pattern wins (no break), so the common
        # labels take precedence over the channel specific ones
        pretty_label = None
        for k, v in legend_labels_to_use.items():
            if re.match(k, label):
                pretty_label = v
        new_labels.append(pretty_label if pretty_label else label)

    # Update legend
    try:
        region_name = s.region_names[region]
    except KeyError:
        region_name = region
    ax.legend(title=region_name, ncol=2, handles=handles, labels=new_labels)

    # Ratio plot
    # The ratio pad only exists if it was requested
    if data and ratio:
        hist.plotratio(h[data].integrate('dataset'),
                       h[mc].integrate('dataset'),
                       ax=rax,
                       denom_fill_opts={},
                       guide_opts={},
                       unc='num',
                       overflow='all',
                       error_opts=data_err_opts)

    ax.text(1.,
            0.,
            distribution,
            fontsize=10,
            horizontalalignment='right',
            verticalalignment='bottom',
            transform=ax.transAxes)
    fig.text(0.,
             1.,
             '$\\bf{CMS}$ internal',
             fontsize=14,
             horizontalalignment='left',
             verticalalignment='bottom',
             transform=ax.transAxes)

    fig.text(1.,
             1.,
             f'{channel}, {lumi(year):.1f} fb$^{{-1}}$ ({year})',
             fontsize=14,
             horizontalalignment='right',
             verticalalignment='bottom',
             transform=ax.transAxes)
    # Aesthetics
    ax.set_yscale("log")
    ax.set_ylabel('Events / GeV')
    plot_settings = style.plot_settings()
    if region in plot_settings.keys():
        plot_settings = plot_settings[region]
    if distribution in plot_settings.keys():
        plot_settings = plot_settings[distribution]
    if ylim:
        if ylim == "auto":
            # Bin widths of the first non-dataset axis, used to reproduce
            # the bin-width normalization of the plotted histograms
            width = np.diff([x for x in h.axes()
                             if "dataset" not in x.name][0].edges())
            vmc = h[mc].integrate("dataset").values()[()] / width
            try:
                vdata = h[data].integrate("dataset").values()[()] / width
            except Exception:
                # Data could not be retrieved: let MC alone drive the range
                vdata = vmc
            if signal:
                vsig = h[signal].integrate("dataset").values()[()] / width
            else:
                vsig = vmc

            contents = (vmc, vdata, vsig)
            # Smallest non-zero and largest bin content, with some headroom
            ax.set_ylim(
                0.5 * min(np.min(v[v > 0]) for v in contents),
                1e2 * max(np.max(v) for v in contents),
            )

        else:
            ax.set_ylim(ylim[0], ylim[1])
    elif 'ylim' in plot_settings.keys():
        ax.set_ylim(plot_settings['ylim'])
    else:
        ax.set_ylim(1e-1, 1e6)

    if xlim:
        ax.set_xlim(xlim[0], xlim[1])
    elif 'xlim' in plot_settings.keys():
        ax.set_xlim(plot_settings['xlim'])

    if ratio:
        if rylim:
            rax.set_ylim(*rylim)
        else:
            rax.set_ylim(0.5, 1.5)
        loc1 = matplotlib.ticker.MultipleLocator(base=0.2)
        loc2 = matplotlib.ticker.MultipleLocator(base=0.1)
        rax.yaxis.set_major_locator(loc1)
        rax.yaxis.set_minor_locator(loc2)
        rax.grid(axis='y', which='minor', linestyle='--')
        rax.grid(axis='y', which='major', linestyle='--')
        rax.set_ylabel('Data / MC')

    # exist_ok avoids failing if the directory appears in the meantime
    os.makedirs(outdir, exist_ok=True)

    for form in output_format.split(','):
        outpath = pjoin(
            outdir,
            f"{region}_{distribution}{inte_tag}_{tag + '_' if tag else ''}{year}.{form}"
        )
        fig.savefig(outpath)
        print(f"Saved plot file in {outpath}")
    plt.close('all')
Ejemplo n.º 2
0
def plot(inpath):
    """Make control region ratio plots and data / MC plots for VBF regions.

    For each of the years 2017 and 2018, the control region ratio plots
    are made for the 'mjj' distribution, followed by data / MC comparison
    plots for every region and every distribution listed in the plot
    settings. Everything is done twice: once with the LO and once with
    the NLO V samples.

    :param inpath: Directory holding the processor output. It is handed
                   to dir_archive, and its base name is used to build the
                   output directories.
    """
    run_name = os.path.basename(os.path.abspath(inpath))

    # The processor output is stored in an 'accumulator', which is read
    # here from the input directory through dir_archive.
    acc = dir_archive(inpath, serialized=True, compression=0, memsize=1e3)

    # Settings dictionary: which distributions to plot in each region,
    # and optionally the axis limits to use for them
    settings = plot_settings()

    # Distributions that have already been merged and scaled
    done = set()

    # The two MC variants are always processed in this order
    variant_tags = ('losf', 'nlo')

    # Separate plots per year
    for year in (2017, 2018):
        # The data to be used for each region
        # Muon regions use MET,
        # electron+photon regions use EGamma
        # ( EGamma = SingleElectron+SinglePhoton for 2017)
        met = f'MET_{year}'
        egamma = f'EGamma_{year}'
        data = {
            'sr_vbf': None,
            'cr_1m_vbf': met,
            'cr_2m_vbf': met,
            'cr_1e_vbf': egamma,
            'cr_2e_vbf': egamma,
            'cr_g_vbf': egamma,
        }

        # MC selection: datasets are matched by regular expressions.
        # LO V samples (HT binned). Single and double lepton regions
        # share their respective pattern.
        lo_1l = f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
        lo_2l = f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
        mc_lo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1m_vbf': re.compile(lo_1l),
            'cr_1e_vbf': re.compile(lo_1l),
            'cr_2m_vbf': re.compile(lo_2l),
            'cr_2e_vbf': re.compile(lo_2l),
            'cr_g_vbf':
            re.compile(f'(GJets_(HT|SM).*|QCD_HT.*|WJetsToLNu.*HT.*).*{year}'),
        }

        # Same thing for the NLO V samples, so LO and NLO can be compared
        nlo_1l = f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*|.*WJetsToLNu.*FXFX.*).*{year}'
        nlo_2l = f'(EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DY.*FXFX.*).*{year}'
        mc_nlo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|TTJets.*FXFX.*|Diboson.*|ST.*|QCD_HT.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*FXFX.*).*{year}'
            ),
            'cr_1m_vbf': re.compile(nlo_1l),
            'cr_1e_vbf': re.compile(nlo_1l),
            'cr_2m_vbf': re.compile(nlo_2l),
            'cr_2e_vbf': re.compile(nlo_2l),
            'cr_g_vbf':
            re.compile(f'(GJets_(HT|SM).*|QCD_HT.*|W.*FXFX.*).*{year}'),
        }
        variants = tuple(zip(variant_tags, (mc_lo, mc_nlo)))

        # The signal region is not used in the ratio plots
        regions = [name for name in mc_lo if name != 'sr_vbf']

        # Make control region ratio plots for both
        # LO and NLO. Can be skipped if you only
        # want data / MC agreement plots.
        outdir = f'./output/{run_name}/ratios'

        # Load ingredients from cache
        for ingredient in ('mjj', 'sumw', 'sumw_pileup', 'nevents'):
            acc.load(ingredient)

        for tag, mc_map in variants:
            cr_ratio_plot(acc,
                          year=year,
                          tag=tag,
                          outdir=outdir,
                          mc=mc_map,
                          regions=regions,
                          distribution='mjj')

        # Data / MC plots are made here
        # Loop over all regions
        for region in mc_lo:
            # No ratio pad in the signal region
            ratio = region != 'sr_vbf'
            # Separate output directory for each region
            outdir = f'./output/{run_name}/{region}'
            # Settings for this region
            plotset = settings[region]

            # Loop over the distributions
            for distribution in plotset.keys():
                # Load from cache, then merge and scale once per distribution
                if distribution not in done:
                    acc.load(distribution)

                    if distribution not in acc.keys():
                        print(
                            f"WARNING: Distribution {distribution} not found in input files."
                        )
                        continue
                    acc[distribution] = merge_extensions(
                        acc[distribution],
                        acc,
                        reweight_pu='nopu' not in distribution)
                    scale_xs_lumi(acc[distribution])
                    acc[distribution] = merge_datasets(acc[distribution])
                    acc[distribution].axis('dataset').sorting = 'integral'
                    done.add(distribution)

                try:
                    # The heavy lifting of making a plot is hidden
                    # in make_plot. It is called first with the LO MC and
                    # then with the NLO MC. The output files are named
                    # according to the 'tag' argument, so they can be
                    # told apart.
                    for tag, mc_map in variants:
                        make_plot(
                            acc,
                            region=region,
                            distribution=distribution,
                            year=year,
                            data=data[region],
                            mc=mc_map[region],
                            ylim=plotset[distribution].get('ylim', None),
                            xlim=plotset[distribution].get('xlim', None),
                            tag=tag,
                            outdir=outdir,
                            output_format='pdf',
                            ratio=ratio)
                except KeyError:
                    # A KeyError skips the remaining plots of this distribution
                    continue
Ejemplo n.º 3
0
def plot(args):
    """Make data / MC plots for the VBF regions selected on the command line.

    For each of the years 2017 and 2018, loops over all regions and over
    the distributions listed for them in the plot settings, and calls
    make_plot with the LO MC samples.

    :param args: Object with the attributes ``inpath`` (input directory),
                 ``region`` and ``distribution``. The latter two are
                 regular expressions; only regions and distributions
                 matching them (re.match) are plotted.
    """
    run_name = os.path.basename(os.path.abspath(args.inpath))

    # The processor output is stored in an 'accumulator', which is read
    # here from the input directory through dir_archive.
    acc = dir_archive(args.inpath, serialized=True, compression=0, memsize=1e3)

    # Settings dictionary: which distributions to plot in each region,
    # and optionally the axis limits to use for them
    settings = plot_settings()

    # Distributions that have already been merged and scaled
    done = set()

    # Separate plots per year
    for year in (2017, 2018):
        # The data to be used for each region
        # Muon regions use MET,
        # electron+photon regions use EGamma
        # ( EGamma = SingleElectron+SinglePhoton for 2017)
        met = f'MET_{year}'
        egamma = f'EGamma_{year}'
        data = {
            'sr_vbf': met,
            'cr_1m_vbf': met,
            'cr_2m_vbf': met,
            'cr_1e_vbf': egamma,
            'cr_2e_vbf': egamma,
            'cr_g_vbf': egamma,
        }

        # MC selection: datasets are matched by regular expressions.
        # LO V samples (HT binned). Single and double lepton regions
        # share their respective pattern.
        lo_1l = f'(EWKW.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
        lo_2l = f'(EWKZ.*ZToLL.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*).*{year}'
        mc_lo = {
            'sr_vbf':
            re.compile(
                f'(ZJetsToNuNu.*|EW.*|Top_FXFX.*|Diboson.*|.*DYJetsToLL_M-50_HT_MLM.*|.*WJetsToLNu.*HT.*).*{year}'
            ),
            'cr_1m_vbf': re.compile(lo_1l),
            'cr_1e_vbf': re.compile(lo_1l),
            'cr_2m_vbf': re.compile(lo_2l),
            'cr_2e_vbf': re.compile(lo_2l),
            'cr_g_vbf':
            re.compile(
                f'(GJets_(DR-0p4|SM).*|QCD_data.*|WJetsToLNu.*HT.*).*{year}'),
        }

        # Load ingredients from cache
        for ingredient in ('sumw', 'sumw_pileup', 'nevents'):
            acc.load(ingredient)

        # Data / MC plots are made here
        # Loop over all regions
        for region, region_mc in mc_lo.items():
            if re.match(args.region, region) is None:
                continue
            # Separate output directory for each region
            outdir = f'./output/{run_name}/{region}'
            # Settings for this region
            plotset = settings[region]

            # Loop over the distributions
            for distribution in plotset.keys():
                if re.match(args.distribution, distribution) is None:
                    continue

                # Load from cache, then merge and scale once per distribution
                if distribution not in done:
                    acc.load(distribution)

                    if distribution not in acc.keys():
                        print(
                            f"WARNING: Distribution {distribution} not found in input files."
                        )
                        continue
                    acc[distribution] = merge_extensions(
                        acc[distribution],
                        acc,
                        reweight_pu='nopu' not in distribution)
                    scale_xs_lumi(acc[distribution])
                    acc[distribution] = merge_datasets(acc[distribution])
                    acc[distribution].axis('dataset').sorting = 'integral'
                    done.add(distribution)

                try:
                    # In the photon region, 'QCD_data' is replaced by
                    # 'QCD.*HT' in the MC pattern for every distribution
                    # other than 'recoil'
                    swap_qcd = "cr_g" in region and distribution != "recoil"
                    if swap_qcd:
                        imc = re.compile(
                            region_mc.pattern.replace('QCD_data', 'QCD.*HT'))
                    else:
                        imc = region_mc

                    # The heavy lifting of making a plot is hidden
                    # in make_plot. A ratio pad is always requested.
                    make_plot(
                        acc,
                        region=region,
                        distribution=distribution,
                        year=year,
                        data=data[region],
                        mc=imc,
                        ylim=plotset[distribution].get('ylim', None),
                        xlim=plotset[distribution].get('xlim', None),
                        tag='losf',
                        outdir=outdir,
                        output_format='pdf',
                        ratio=True)
                except KeyError:
                    # A KeyError skips the plot of this distribution
                    continue
Ejemplo n.º 4
0
def make_plot(acc,
              region,
              distribution,
              year,
              data,
              mc,
              signal=None,
              outdir='./output/stack/',
              integrate=None,
              ylim=None,
              xlim=None,
              rylim=None,
              tag=None,
              output_format='pdf',
              ratio=True):
    """Create a data vs MC comparison plot and save it to disk.

    :param acc: Accumulator (processor output), indexed by ``distribution``
    :type acc: coffea.processor.accumulator
    :param region: Value selected on the histogram's 'region' axis
    :param distribution: Key of the histogram in ``acc`` to plot
    :param year: Year used for the luminosity label and the file name
    :param data: Selector for the data dataset; if falsy, neither the data
                 points nor the ratio are drawn
    :param mc: Selector (e.g. compiled regex) for the stacked MC datasets
    :param signal: Optional selector for datasets overlaid as signal
    :param outdir: Output directory; created if it does not exist
    :param integrate: Optional ``(axis, low, high)`` tuple; the histogram is
                      integrated over ``slice(low, high)`` of that axis
    :param ylim: ``(low, high)`` for the main pad. If not given, the 'ylim'
                 entry of the plot settings or ``(1e-1, 1e6)`` is used
    :param xlim: ``(low, high)`` for the x axis. If not given, the 'xlim'
                 entry of the plot settings is used when present
    :param rylim: ``(low, high)`` for the ratio pad; default ``(0.75, 1.25)``
    :param tag: Optional string inserted into the output file name
    :param output_format: File extension to save, or a comma-separated
                          list of extensions
    :param ratio: If True, add a ratio pad below the main pad
    """
    # Rebin
    s = Style()
    h = copy.deepcopy(acc[distribution])
    assert (h)
    try:
        newax = s.get_binning(distribution, region)
        h = h.rebin(h.axis(newax.name), newax)
    except KeyError:
        # A KeyError here means we keep the binning of the input histogram
        pass

    # Integrate over an extra axis
    inte_tag = ""
    if integrate:
        (inte_axis, inte_low, inte_high) = integrate
        h = h.integrate(inte_axis,
                        slice(inte_low,
                              inte_high))  #can add an overflow option here
        inte_tag += f"_{inte_axis}_{inte_low}_{inte_high}"

    # Pick the region we want to look at
    # E.g. cr_2m_j = Di-Muon control region with monojet selection
    h = h.integrate(h.axis('region'), region)

    # Plotting
    # Add ratio plot at the bottom if specified (default)
    # Otherwise just plot the histogram
    if ratio:
        fig, (ax, rax) = plt.subplots(2,
                                      1,
                                      figsize=(7, 7),
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)

    else:
        fig, ax = plt.subplots(1, 1, figsize=(7, 5))

    data_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'k',
        'elinewidth': 1,
        'emarker': '_'
    }
    signal_err_opts = {
        'linestyle': 'none',
        'marker': '.',
        'markersize': 10.,
        'color': 'r',
        'elinewidth': 1,
        'emarker': '_'
    }

    # Plot data
    # Note the syntax we use to pick the data set
    if data:
        fig, ax, _ = hist.plot1d(h[data],
                                 overlay='dataset',
                                 error_opts=data_err_opts,
                                 ax=ax,
                                 overflow='all',
                                 binwnorm=True)

    if signal:
        # clear=False: do not wipe the data points drawn above
        fig, ax, _ = hist.plot1d(h[signal],
                                 overlay='dataset',
                                 error_opts=signal_err_opts,
                                 ax=ax,
                                 overflow='all',
                                 binwnorm=True,
                                 clear=False)

    # Plot MC background samples
    # Here we use a regular expression to match
    # data sets we want
    _, _, primitives = hist.plot1d(h[mc],
                                   overlay='dataset',
                                   stack=True,
                                   clear=False,
                                   overflow='all',
                                   ax=ax,
                                   binwnorm=True)

    # Apply colors to the drawn MC primitives
    for name, ps in primitives.items():
        name = str(name)
        # First matching color pattern wins
        col = None
        for k, v in colors.items():
            if re.match(k, name):
                col = v
                break
        for item in ps:
            if col:
                item.set_facecolor(col)
            item.set_linestyle('-')
            item.set_edgecolor('k')
    # Legend
    try:
        region_name = s.region_names[region]
    except KeyError:
        region_name = region
    ax.legend(title=region_name, ncol=1)

    # Ratio plot
    # The ratio pad only exists if it was requested
    if data and ratio:
        hist.plotratio(h[data].integrate('dataset'),
                       h[mc].integrate('dataset'),
                       ax=rax,
                       denom_fill_opts={},
                       guide_opts={},
                       unc='num',
                       overflow='all',
                       error_opts=data_err_opts)

    ax.text(1.,
            0.,
            distribution,
            fontsize=10,
            horizontalalignment='right',
            verticalalignment='bottom',
            transform=ax.transAxes)
    fig.text(1.,
             1.,
             f'{lumi(year)} fb$^{{-1}}$ ({year})',
             fontsize=14,
             horizontalalignment='right',
             verticalalignment='bottom',
             transform=ax.transAxes)
    fig.text(0.,
             1.,
             '$\\bf{CMS}$ internal',
             fontsize=14,
             horizontalalignment='left',
             verticalalignment='bottom',
             transform=ax.transAxes)
    # Aesthetics
    ax.set_yscale("log")
    ax.set_ylabel('Events / Bin width')
    plot_settings = style.plot_settings()
    if region in plot_settings.keys():
        plot_settings = plot_settings[region]
    if distribution in plot_settings.keys():
        plot_settings = plot_settings[distribution]
    if ylim:
        ax.set_ylim(ylim[0], ylim[1])
    elif 'ylim' in plot_settings.keys():
        ax.set_ylim(plot_settings['ylim'])
    else:
        ax.set_ylim(1e-1, 1e6)

    if xlim:
        ax.set_xlim(xlim[0], xlim[1])
    elif 'xlim' in plot_settings.keys():
        ax.set_xlim(plot_settings['xlim'])

    if ratio:
        if rylim:
            rax.set_ylim(*rylim)
        else:
            rax.set_ylim(0.75, 1.25)
        loc1 = matplotlib.ticker.MultipleLocator(base=0.2)
        loc2 = matplotlib.ticker.MultipleLocator(base=0.1)
        rax.yaxis.set_major_locator(loc1)
        rax.yaxis.set_minor_locator(loc2)
        rax.grid(axis='y', which='minor', linestyle='--')
        rax.grid(axis='y', which='major', linestyle='--')
        rax.set_ylabel('Data / MC')

    # exist_ok avoids failing if the directory appears in the meantime
    os.makedirs(outdir, exist_ok=True)

    # One output file per requested format
    for form in output_format.split(','):
        outpath = pjoin(
            outdir,
            f"{region}_{distribution}{inte_tag}_{tag + '_' if tag else ''}{year}.{form}"
        )
        fig.savefig(outpath)
        print(f"Saved plot file in {outpath}")
    plt.close('all')