def distribution(data_, args, feat, pt_range, mass_range, title=None):
    """
    Plot the distribution of a substructure variable in a kinematic region.

    Saves the plot below `figures/distribution/` as
    `distribution_[feat][__pT<lo>_<hi>][__mass<lo>_<hi>].pdf`.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to plot signal- and background distributions.
        pt_range: Exclusive (lower, upper) bounds on `pt`, or None to skip
            the pT selection.
        mass_range: Exclusive (lower, upper) bounds on `m`, or None to skip
            the mass selection.
        title: Accepted for interface compatibility; not used here.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Restrict the data frame to the requested kinematic region
    data = data_
    if pt_range is not None:
        above = data_['pt'] > pt_range[0]
        below = data_['pt'] < pt_range[1]
        data = data_[above & below]

    if mass_range is not None:
        in_window = (data['m'] > mass_range[0]) & (data['m'] < mass_range[1])
        data = data[in_window]

    # Default axis range: weighted 1st and 99th percentiles of the feature
    values = data[feat].values
    weights = data['weight_test'].values
    xmin = wpercentile(values, 1, weights=weights)
    xmax = wpercentile(values, 99, weights=weights)

    # Fixed axis ranges for known feature families override the percentiles
    lowered = feat.lower()
    if feat == 'D2-k#minusNN':
        print("distribution: kNN feature '{}'".format(feat))
        xmin, xmax = -1., 2.
    elif lowered.startswith('d2'):
        print("distribution: D2  feature '{}'".format(feat))
        xmin, xmax = 0., 3.
    elif 'tau21' in lowered:
        xmin, xmax = 0., 1.

    # Widen the range outwards to the nearest multiple of `snap`
    snap = 0.5
    xmin = np.floor(xmin / snap) * snap
    xmax = np.ceil(xmax / snap) * snap

    # 50 equal-width bins, i.e. 51 edges
    bins = np.linspace(xmin, xmax, 51, endpoint=True)

    # Perform plotting
    canvas = plot(args, data, feat, bins, pt_range, mass_range)

    # Build the output path; a suffix is added only for an active selection
    mkdir('figures/distribution/')
    feat_tag = standardise(feat)
    pt_tag = ''
    if pt_range is not None:
        pt_tag = '__pT{:.0f}_{:.0f}'.format(pt_range[0], pt_range[1])
    mass_tag = ''
    if mass_range is not None:
        mass_tag = '__mass{:.0f}_{:.0f}'.format(mass_range[0], mass_range[1])
    path = 'figures/distribution/distribution_{}{}{}.pdf'.format(
        feat_tag, pt_tag, mass_tag)

    canvas.save(path=path)

    return canvas, args, path
Example #2
0
def jetmass(data, args, feat, eff_sig=50):
    """
    Study jet mass distributions before and after a substructure cut.

    The cut value is the weighted percentile of `feat` among rows with
    `signal == 1`, chosen according to `eff_sig` and to the cut direction
    reported by `signal_low(feat)`. This function only builds the output path
    `figures/jetmass_[feat]__eff_sig_[eff_sig].pdf`; it does not save the
    canvas itself.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature on which to impose the cut.
        eff_sig: Signal efficiency, in percent, at which to impose the cut.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Row masks for the two classes
    is_signal = data['signal'] == 1
    is_background = ~is_signal

    # Percentile of the signal distribution at which the cut is placed
    if signal_low(feat):
        percentile = eff_sig
    else:
        percentile = 100 - eff_sig

    signal_values = data.loc[is_signal, feat].values
    signal_weights = data.loc[is_signal, 'weight_test'].values
    cut = wpercentile(signal_values, percentile, weights=signal_weights)

    # Rows passing the cut; flipped when `signal_low` reports low-side signal
    passes = data[feat] > cut
    if signal_low(feat):
        passes = ~passes

    # Perform plotting
    canvas = plot(data, args, feat, passes, is_background, eff_sig)

    # Output
    path = 'figures/jetmass_{}__eff_sig_{:d}.pdf'.format(
        standardise(feat), int(eff_sig))

    return canvas, args, path
Example #3
0
def efficiency(data, args, feat, title=None):
    """
    Study the cut efficiency versus mass for several inclusive efficiency
    working points, using rows with `signal == 0`.

    Saves the plot as `figures/efficiency_[feat].pdf`, or as
    `figures/[title]_efficiency_[feat].pdf` when `title` is given.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to study efficiencies.
        title: Optional prefix for the output file name.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Rows used for the study, and the inclusive efficiencies (in percent)
    is_bkg = data['signal'] == 0
    effs = [5, 10, 20, 40, 80]

    # One cut value per target efficiency
    cuts = [
        wpercentile(data.loc[is_bkg, feat].values,
                    eff if signal_low(feat) else 100 - eff,
                    weights=data.loc[is_bkg, 'weight_test'].values)
        for eff in effs
    ]

    # One efficiency-vs-mass profile per cut value
    profiles = []
    for cut in cuts:

        # Pass mask, flipped when `signal_low` reports low-side signal
        passes = data[feat] > cut
        if signal_low(feat):
            passes = ~passes

        profile = ROOT.TProfile('profile_{}_{}'.format(feat, cut), "",
                                len(MASSBINS) - 1, MASSBINS)

        # Two columns per selected row: mass and pass flag
        mass = data.loc[is_bkg, 'm'].values
        points = np.vstack((mass, passes[is_bkg])).T
        point_weights = data.loc[is_bkg, 'weight_test'].values

        root_numpy.fill_profile(profile, points, weights=point_weights)
        profiles.append(profile)

    # Perform plotting
    canvas = plot(args, data, feat, profiles, cuts, effs)

    # Output
    if title is not None:
        path = 'figures/' + title + '_efficiency_{}.pdf'.format(
            standardise(feat))
    else:
        path = 'figures/efficiency_{}.pdf'.format(standardise(feat))

    canvas.save(path=path)
    return canvas, args, path
Example #4
0
def fill_profile(data):
    """Fill a ROOT.TH2F with the weighted `(100 - EFF)`-percentile of `VAR`.

    The percentile is measured separately in every cell of the 2D grid that
    `AXIS` defines for the variables in `VARS`. Cells with 20 or fewer entries
    get no percentile: their histogram bin is left untouched and they are
    dropped from the returned arrays.

    Arguments:
        data: Pandas data frame holding the columns in `VARS`, `VAR` and
            `TotalEventWeight`.

    Returns:
        Tuple `(profile, (x, y, z))` where `profile` is the filled ROOT.TH2F
        and `x`, `y`, `z` are flat arrays with the standardised cell centres
        and the measured percentile of every cell that has one.
    """

    # Define arrays
    shape = (AXIS[VARX][0], AXIS[VARY][0])
    bins = [
        np.linspace(AXIS[var][1],
                    AXIS[var][2],
                    AXIS[var][0] + 1,
                    endpoint=True) for var in VARS
    ]
    x, y, z = (np.zeros(shape) for _ in range(3))

    # Create `profile` histogram
    profile = ROOT.TH2F('profile', "",
                        len(bins[0]) - 1, bins[0].flatten('C'),
                        len(bins[1]) - 1, bins[1].flatten('C'))

    # Fill profile
    for i, j in itertools.product(*map(range, shape)):

        # Bin edges in x and y
        edges = [axis_bins[idx:idx + 2]
                 for idx, axis_bins in zip([i, j], bins)]

        # Masks: lower edge exclusive, upper edge inclusive
        msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                for dim, var in enumerate(VARS)]
        msk = reduce(lambda lhs, rhs: lhs & rhs, msks)

        # Percentile; stays NaN without sufficient statistics
        perc = np.nan
        if np.sum(msk) > 20:
            perc = wpercentile(
                data=data.loc[msk, VAR].values,
                percents=100 - EFF,
                weights=data.loc[msk, 'TotalEventWeight'].values)

        x[i, j] = np.mean(edges[0])
        y[i, j] = np.mean(edges[1])
        z[i, j] = perc

        # Set bin content only for a measured percentile. NaN never compares
        # equal to anything, so `perc != np.nan` would always be True.
        if not np.isnan(perc):
            profile.SetBinContent(i + 1, j + 1, perc)

    # Normalise arrays
    x, y = standardise(x, y, rank=None)

    # Filter out NaNs
    msk = ~np.isnan(z)
    x, y, z = x[msk], y[msk], z[msk]

    return profile, (x, y, z)
Example #5
0
def distribution(data_, args, feat, pt_range, mass_range):
    """
    Plot the distribution of a substructure variable in a kinematic region.

    Builds the output path
    `figures/distribution_[feat][__pT<lo>_<hi>][__mass<lo>_<hi>].pdf`; this
    function does not save the canvas itself.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to plot signal- and background distributions.
        pt_range: Exclusive (lower, upper) bounds on `pt`, or None to skip
            the pT selection.
        mass_range: Exclusive (lower, upper) bounds on `m`, or None to skip
            the mass selection.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Restrict the data frame to the requested kinematic region
    data = data_
    if pt_range is not None:
        pt = data_['pt']
        data = data_[(pt > pt_range[0]) & (pt < pt_range[1])]

    if mass_range is not None:
        mass = data['m']
        data = data[(mass > mass_range[0]) & (mass < mass_range[1])]

    # Axis range: weighted 1st and 99th percentiles of the feature
    values = data[feat].values
    weights = data['weight_test'].values
    xmin = wpercentile(values, 1, weights=weights)
    xmax = wpercentile(values, 99, weights=weights)

    # Widen the range outwards to the nearest multiple of `snap`
    snap = 0.5
    xmin = np.floor(xmin / snap) * snap
    xmax = np.ceil(xmax / snap) * snap

    # 50 equal-width bins, i.e. 51 edges
    bins = np.linspace(xmin, xmax, 51, endpoint=True)

    # Perform plotting
    canvas = plot(args, data, feat, bins, pt_range, mass_range)

    # Build the output path; a suffix is added only for an active selection
    feat_tag = standardise(feat)
    pt_tag = ''
    if pt_range is not None:
        pt_tag = '__pT{:.0f}_{:.0f}'.format(pt_range[0], pt_range[1])
    mass_tag = ''
    if mass_range is not None:
        mass_tag = '__mass{:.0f}_{:.0f}'.format(mass_range[0], mass_range[1])
    path = 'figures/distribution_{}{}{}.pdf'.format(feat_tag, pt_tag, mass_tag)

    return canvas, args, path
Example #6
0
def fill_profile (data, variable, bg_eff, signal_above=False):
    """Fill a ROOT.TH2F with the weighted `bg_eff`-percentile of `variable`.

    The percentile is measured separately in every cell of the 2D grid that
    `AXIS` defines for the variables in `VARS`. Cells with 20 or fewer entries
    get no percentile: their histogram bin is left untouched and they are
    dropped from the returned arrays.

    Arguments:
        data: Pandas data frame holding the columns in `VARS`, `variable` and
            `weight_test`.
        variable: Name of the column whose percentile is measured.
        bg_eff: Percentile, in percent, to measure.
        signal_above: If True, measure the `(100 - bg_eff)`-percentile
            instead, so that the region above the cut counts as signal.

    Returns:
        Tuple `(profile, (x, y, z))` where `profile` is the filled ROOT.TH2F
        and `x`, `y`, `z` are flat arrays with the standardised cell centres
        and the measured percentile of every cell that has one.
    """

    if signal_above:
        # Ensures that the region above the cut is counted as signal
        bg_eff = 100. - bg_eff

    # Define arrays
    shape = (AXIS[VARX][0], AXIS[VARY][0])
    bins = [np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1,
                        endpoint=True) for var in VARS]
    x, y, z = (np.zeros(shape) for _ in range(3))

    # Create `profile` histogram
    profile = ROOT.TH2F('profile', "",
                        len(bins[0]) - 1, bins[0].flatten('C'),
                        len(bins[1]) - 1, bins[1].flatten('C'))

    # Fill profile
    for i, j in itertools.product(*map(range, shape)):

        # Bin edges in x and y
        edges = [axis_bins[idx:idx + 2]
                 for idx, axis_bins in zip([i, j], bins)]

        # Masks: lower edge exclusive, upper edge inclusive
        msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                for dim, var in enumerate(VARS)]
        msk = reduce(lambda lhs, rhs: lhs & rhs, msks)

        # Percentile; stays NaN without sufficient statistics
        perc = np.nan
        if np.sum(msk) > 20:
            perc = wpercentile(data=data.loc[msk, variable].values,
                               percents=bg_eff,
                               weights=data.loc[msk, 'weight_test'].values)

        x[i, j] = np.mean(edges[0])
        y[i, j] = np.mean(edges[1])
        z[i, j] = perc

        # Set bin content only for a measured percentile. NaN never compares
        # equal to anything, so `perc != np.nan` would always be True.
        if not np.isnan(perc):
            profile.SetBinContent(i + 1, j + 1, perc)

    # Normalise arrays
    x, y = standardise(x, y)

    # Filter out NaNs
    msk = ~np.isnan(z)
    x, y, z = x[msk], y[msk], z[msk]

    return profile, (x, y, z)
Example #7
0
def jetmasscomparison(data, args, features, eff_sig=50):
    """
    Study jet mass distributions before and after a substructure cut, for
    several substructure taggers.

    Builds the output path `figures/jetmasscomparison__eff_sig_[eff_sig].pdf`;
    this function does not save the canvas itself.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        features: Features on which to impose a cut, one at a time.
        eff_sig: Signal efficiency, in percent, at which to impose each cut.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Per-feature pass masks, keyed by feature name
    is_signal = data['signal'] == 1
    msks_pass = {}
    for feat in features:

        # Percentile of the signal distribution at which the cut is placed
        if signal_low(feat):
            percentile = eff_sig
        else:
            percentile = 100 - eff_sig

        signal_values = data.loc[is_signal, feat].values
        signal_weights = data.loc[is_signal, 'weight_test'].values
        cut = wpercentile(signal_values, percentile, weights=signal_weights)

        # Flip the mask when `signal_low` reports low-side signal
        passes = data[feat] > cut
        if signal_low(feat):
            passes = ~passes
        msks_pass[feat] = passes

    # Perform plotting
    canvas = plot(data, args, features, msks_pass, eff_sig)

    # Perform plotting on individual figures
    plot_individual(data, args, features, msks_pass, eff_sig)

    # Output
    path = 'figures/jetmasscomparison__eff_sig_{:d}.pdf'.format(int(eff_sig))

    return canvas, args, path
def main(args):
    """
    Plot a mass-decorrelation metric for a fixed `Tau21` cut in two pT bins.

    For each pT bin, the cut is placed at the weighted median of `Tau21` among
    rows with `signal == 1`. The remaining rows are split into those passing
    (`Tau21` below the cut) and failing it, and the normalised mass
    distributions of the two groups are compared through their KL divergences
    to the average distribution and the resulting Jensen-Shannon divergence.

    Saves one plot per pT bin as
    `figures/massdecorrelationmetric_[var]__pT[lo]_[hi]GeV.pdf`.

    Arguments:
        args: Namespace holding command-line arguments; `args.input` is the
            path prefix from which `data.h5` is loaded.

    Returns:
        0.
    """

    # ...

    # Load data
    data_, features, _ = load_data(args.input + 'data.h5', train=True)

    for pt_bin in [(200., 500.), (500., 1000.)]:

        # Impose pT-cut
        data = data_[(data_['pt'] >= pt_bin[0]) & (data_['pt'] < pt_bin[1])]

        var = 'Tau21'
        msk_sig = (data['signal'] == 1)
        x = data[var].values
        m = data['m'].values
        w = data['weight_test'].values

        # Get cut value. The weights must be restricted to the same rows as
        # the values, otherwise the two arrays are not aligned.
        cut = wpercentile(x[msk_sig], 50., weights=w[msk_sig])
        print("Cut value: {:.2f}".format(cut))

        # Discard signal
        x = x[~msk_sig]
        m = m[~msk_sig]
        w = w[~msk_sig]

        # Get pass mask
        msk_pass = x < cut
        print("Background efficiency: {:.1f}%".format(
            100. * w[msk_pass].sum() / w.sum()))

        # Canvas
        offset = 0.06
        margin = 0.3
        # @NOTE
        #   A = Height of pad 0
        #   B = Height of pads 1,2
        #   C = Height of pad 3
        # -->
        #   A = 0.5
        #
        #   (1. - 2 * offset) * B = (1. - 2*offset - margin) * C
        #   ==>
        #   B = C * (1. - 2*offset - margin) / (1. - 2 * offset)
        #   ==>
        #   B = C * (1 - margin / (1. - 2 * offset))
        #
        #   A + 2 * B + C = 1
        #   ==>
        #   A + 2 * C * (1 - margin / (1. - 2 * offset)) + C = 1
        #   ==>
        #   C = (1 - A) / (1 + 2 * (1 - margin / (1. - 2 * offset)))

        A = 0.5
        C = (1 - A) / (1 + 2 * (1 - margin / (1. - 2 * offset)))
        B = C * (1 - margin / (1. - 2 * offset))

        c = rp.canvas(batch=True,
                      num_pads=4,
                      fraction=(A, B, B, C),
                      size=(600, 700))

        # Set pad margins
        c.pad(0)._bare().SetBottomMargin(offset)
        c.pad(1)._bare().SetTopMargin(offset)
        c.pad(1)._bare().SetBottomMargin(offset)
        c.pad(2)._bare().SetTopMargin(offset)
        c.pad(2)._bare().SetBottomMargin(offset)
        c.pad(3)._bare().SetTopMargin(offset)
        c.pad(3)._bare().SetBottomMargin(offset + margin)

        # Styling. The templates hold no replacement fields, so `.format` only
        # collapses the doubled braces.
        HISTSTYLE[True]['label'] = 'Passing cut, #it{{P}}'.format(
            latex(var, ROOT=True))
        HISTSTYLE[False]['label'] = 'Failing cut, #it{{F}}'.format(
            latex(var, ROOT=True))

        # Histograms
        F = c.hist(m[~msk_pass],
                   bins=MASSBINS,
                   weights=w[~msk_pass],
                   normalise=True,
                   **HISTSTYLE[False])
        P = c.hist(m[msk_pass],
                   bins=MASSBINS,
                   weights=w[msk_pass],
                   normalise=True,
                   **HISTSTYLE[True])

        P, F = map(root_numpy.hist2array, [P, F])
        M = (P + F) / 2
        c.hist(M,
               bins=MASSBINS,
               normalise=True,
               linewidth=3,
               linecolor=ROOT.kViolet,
               linestyle=2,
               label='Average, #it{M}')

        # Compute divergences
        # NOTE(review): a mass bin that is empty in P or F gives a division by
        # zero here -- confirm that such bins cannot occur.
        KL_PM = -P * np.log2(M / P)
        KL_FM = -F * np.log2(M / F)
        JSD = (KL_PM + KL_FM) / 2.
        JSDsum = np.cumsum(JSD)

        opts = dict(bins=MASSBINS, fillcolor=ROOT.kGray, alpha=0.5)

        # Draw divergences
        c.pad(1).hist(KL_PM, **opts)
        c.pad(1).ylim(-0.12, 0.05)
        c.pad(1).yline(0.)

        c.pad(2).hist(KL_FM, **opts)
        c.pad(2).ylim(-0.05, 0.12)
        c.pad(2).yline(0.)

        c.pad(3).hist(JSD, **opts)
        c.pad(3).ylim(0., 0.03)
        c.pad(3).yline(0.)

        # Cumulative JSD on a secondary axis of the bottom pad
        o = rp.overlay(c.pad(3), color=ROOT.kViolet, ndiv=502)
        o.hist(JSDsum, bins=MASSBINS, linecolor=ROOT.kViolet)
        o.label("#sum_{i #leq n} JSD(P #parallel F)")
        o.lim(0, 0.2)

        # Styling axes: only the bottom pad keeps its x-axis title and labels
        c.pad(0)._xaxis().SetTitleOffset(999.)
        c.pad(1)._xaxis().SetTitleOffset(999.)
        c.pad(2)._xaxis().SetTitleOffset(999.)
        c.pad(3)._xaxis().SetTitleOffset(5.)
        c.pad(0)._xaxis().SetLabelOffset(999.)
        c.pad(1)._xaxis().SetLabelOffset(999.)
        c.pad(2)._xaxis().SetLabelOffset(999.)

        c.pad(0)._yaxis().SetNdivisions(505)
        c.pad(1)._yaxis().SetNdivisions(502)
        c.pad(2)._yaxis().SetNdivisions(502)
        c.pad(3)._yaxis().SetNdivisions(502)

        c.pad(0).ylim(0, 0.20)
        c.pad(0).cd()
        c.pad(0)._get_first_primitive().Draw('SAME AXIS')

        # Decorations
        c.text(TEXT + [
            "Multijets, training dataset",
            "Cut on {:s} at #varepsilon_{{sig}}^{{rel}} = 50%".format(
                latex(var, ROOT=True)),
            "p_{{T}} #in  [{:.0f}, {:.0f}] GeV".format(*pt_bin)
        ],
               qualifier='Simulation Internal')
        c.legend(width=0.25)
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Fraction of jets")
        c.pad(1).ylabel('KL(P #parallel M)')
        c.pad(2).ylabel('KL(F #parallel M)')
        c.pad(3).ylabel('JSD(P #parallel F)')

        # Save
        c.save('figures/massdecorrelationmetric_{:s}__pT{:.0f}_{:.0f}GeV.pdf'.
               format(var, *pt_bin))

    return 0
Example #9
0
def jetmasscomparison(data_, args, features, pt_range, eff_sig=50, title=None):
    """
    Study jet mass distributions before and after a substructure cut, for
    several substructure taggers, optionally within a pT range.

    Builds the output path
    `figures/[title_]jetmasscomparison[_pT<lo>to<hi>]__eff_sig_[eff_sig].pdf`;
    this function does not save the canvas itself.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        features: Features on which to impose a cut, one at a time.
        pt_range: Exclusive (lower, upper) bounds on `pt`, or None to skip
            the pT selection.
        eff_sig: Signal efficiency, in percent, at which to impose each cut.
        title: Optional prefix for the output file name; also forwarded to
            `plot_individual`.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Select pT-range
    if pt_range is None:
        data = data_
    else:
        in_range = (data_['pt'] > pt_range[0]) & (data_['pt'] < pt_range[1])
        data = data_[in_range]

    # Per-feature pass masks, keyed by feature name
    is_signal = data['signal'] == 1
    msks_pass = {}
    for feat in features:

        # Percentile of the signal distribution at which the cut is placed
        if signal_low(feat):
            percentile = eff_sig
        else:
            percentile = 100 - eff_sig

        signal_values = data.loc[is_signal, feat].values
        signal_weights = data.loc[is_signal, 'weight_test'].values
        cut = wpercentile(signal_values, percentile, weights=signal_weights)

        # Flip the mask when `signal_low` reports low-side signal
        passes = data[feat] > cut
        if signal_low(feat):
            passes = ~passes
        msks_pass[feat] = passes

    # Perform plotting
    canvas = plot(data, args, features, msks_pass, eff_sig, pt_range)

    # Perform plotting on individual figures
    plot_individual(data, args, features, msks_pass, eff_sig, pt_range, title)

    # Output: four name variants, with or without title and pT range
    has_pt = pt_range is not None
    if title is None and has_pt:
        path = 'figures/jetmasscomparison_pT{}to{}__eff_sig_{:d}.pdf'.format(
            pt_range[0], pt_range[1], int(eff_sig))
    elif title is None:
        path = 'figures/jetmasscomparison__eff_sig_{:d}.pdf'.format(
            int(eff_sig))
    elif has_pt:
        path = 'figures/' + title + '_jetmasscomparison_pT{}to{}__eff_sig_{:d}.pdf'.format(
            pt_range[0], pt_range[1], int(eff_sig))
    else:
        path = 'figures/' + title + '_jetmasscomparison__eff_sig_{:d}.pdf'.format(
            int(eff_sig))

    return canvas, args, path
Example #10
0
def jsd(data_, args, feature_dict, pt_range, title=None):
    """
    Study the Jensen-Shannon divergence (JSD) between the mass distributions
    of rows passing and failing a cut, as a function of the cut efficiency.

    Only rows with `signal == 0` are used. For every feature, cuts are placed
    at a range of efficiencies and the JSD between the normalised pass and
    fail mass histograms is recorded.

    Saves the plot as `figures/[title_]jsd[__pT<lo>_<hi>].pdf`.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments; `args.show` controls
            whether canvases are created in batch mode.
        feature_dict: Mapping from base variable name to a list of suffixes;
            every `basevar + suffix` is one feature to study.
        pt_range: Exclusive (lower, upper) bounds on `pt`, or None to skip
            the pT selection.
        title: Optional prefix for the output file name.

    Returns:
        Tuple of the canvas returned by `plot`, `args` and the output path.
    """

    # Extract features and count appearance of each base variable
    features = []
    appearances = []
    for basevar in feature_dict.keys():
        for suffix in feature_dict[basevar]:
            features.append(basevar + suffix)
        appearances.append(len(feature_dict[basevar]))

    # Select data
    if pt_range is not None:
        data = data_[(data_['pt'] > pt_range[0]) & (data_['pt'] < pt_range[1])]
    else:
        data = data_

    # Create local histogram style dict. The nested per-class dicts are copied
    # as well, so that setting the labels below does not modify the shared
    # `HISTSTYLE` entries.
    histstyle = {key: dict(style) for key, style in HISTSTYLE.items()}
    histstyle[True]['label'] = "Pass"
    histstyle[False]['label'] = "Fail"

    # Define common variables
    msk = data['signal'] == 0
    effs = np.linspace(0, 100, 10 * 2, endpoint=False)[1:].astype(int)

    # Loop tagger features
    jsd_values = {feat: [] for feat in features}
    for feat in features:

        if len(jsd_values[feat]):
            continue  # Duplicate feature.

        # Define cuts
        cuts = list()
        for eff in effs:
            cut = wpercentile(data.loc[msk, feat].values,
                              eff if signal_low(feat) else 100 - eff,
                              weights=data.loc[msk, 'weight_test'].values)
            cuts.append(cut)

        # Compute JSD for successive cuts
        for cut in cuts:

            # Pass mask, flipped when `signal_low` reports low-side signal
            msk_pass = data[feat] > cut
            if signal_low(feat):
                msk_pass = ~msk_pass

            # Get histograms. The canvas only serves to build them; it is
            # neither decorated nor saved.
            c = rp.canvas(batch=not args.show)
            h_pass = c.hist(data.loc[msk_pass & msk, 'm'].values,
                            bins=MASSBINS,
                            weights=data.loc[msk_pass & msk,
                                             'weight_test'].values,
                            normalise=True,
                            **histstyle[True])
            h_fail = c.hist(data.loc[~msk_pass & msk, 'm'].values,
                            bins=MASSBINS,
                            weights=data.loc[~msk_pass & msk,
                                             'weight_test'].values,
                            normalise=True,
                            **histstyle[False])

            # Convert to numpy arrays
            p = root_numpy.hist2array(h_pass)
            f = root_numpy.hist2array(h_fail)

            # Compute Jensen-Shannon divergence
            jsd_values[feat].append(JSD(p, f, base=2))

    # Compute meaningful limit on JSD
    jsd_limits = list()
    sigmoid = lambda x: 1. / (1. + np.exp(-x))
    for eff in sigmoid(np.linspace(-5, 5, 20 + 1, endpoint=True)):
        limits = jsd_limit(data[msk], eff, num_bootstrap=5)
        jsd_limits.append((eff, np.mean(limits), np.std(limits)))

    # Perform plotting
    c = plot(args, data, effs, jsd_values, jsd_limits, features, pt_range,
             appearances)

    # Output
    if title is None:
        path = 'figures/jsd{}.pdf'.format(
            '' if pt_range is None else '__pT{:.0f}_{:.0f}'.format(*pt_range))
    else:
        path = 'figures/' + title + '_jsd{}.pdf'.format(
            '' if pt_range is None else '__pT{:.0f}_{:.0f}'.format(*pt_range))
    c.save(path=path)
    return c, args, path
Example #11
0
def fill_profile_1D(data):
    """Fill ROOT.TH2F with the measured, weighted values of the `EFF`-percentile
    of the background `VAR`. """

    # Define arrays
    #bins    = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0] + 1, endpoint=True)
    # Make variable sized bins
    #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True)
    #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000])

    # Build bin structure with at least ?50 event in each bin
    # and bin widths of at least AXIS[VARX][0]

    minBinSize = 100  #AXIS[VARX][0]
    binEdge = AXIS[VARX][2]
    binList = []
    binList.append(binEdge)
    k = 1
    while binEdge - k * minBinSize > AXIS[VARX][1]:
        msk = (data[VARX] > binEdge - k * minBinSize) & (data[VARX] <= binEdge)
        if (np.sum(msk) * EFF / 100. > MIN_STAT):
            binEdge -= k * minBinSize
            binList.append(binEdge)
            k = 1
        else:
            k += 1

    binList.append(AXIS[VARX][1])
    binList.reverse()
    bins = np.array(binList)
    print "Bins: ", len(bins), bins

    shape = len(bins) - 1  #AXIS[VARX][0] #
    x, y, e = (np.zeros(shape) for _ in range(3))

    # Create `profile` histogram
    profile = ROOT.TH1F('profile', "", len(bins) - 1, bins)

    #if INPUT == "mc":
    #    data.loc[:,'TotalEventWeight'] /=  139000000.

    # Fill profile
    for i in (range(shape)):

        # Masks
        msk = (data[VARX] > bins[i]) & (data[VARX] <= bins[i + 1])

        # Percentile
        #perc = np.nan
        #if np.sum(msk) > 20:  # Ensure sufficient statistics for meaningful percentile. Was 20
        perc = wpercentile(
            data=data.loc[msk, VAR].values,
            percents=100 - EFF,
            weights=data.loc[msk, 'TotalEventWeight'].values)  #wpercentile
        #   pass

        x[i] = np.mean([bins[i], bins[i + 1]])
        y[i] = perc
        if np.sum(msk) > 0:
            e[i] = np.sqrt(np.sum(msk)) / np.sum(msk)
        else:
            print "Bin ", i, " has np.sum(msk) < 20. Weird."
            e[i] = 0

        # Set non-zero bin content
        if perc != np.nan:
            profile.SetBinContent(i + 1, perc)
            pass
        pass

    # Normalise array
    # x = standardise(x, rank=None)

    # Filter out NaNs
    msk = ~np.isnan(y)
    x, y, e = x[msk], y[msk], y[msk]

    return profile, (x, y, e)
Example #12
0
def main (args):
    """
    Fit a k-nearest-neighbours regressor to a binned percentile of
    `jet_ungrtrk500` versus `dijetmass`, draw the fitted curve and write it
    to a ROOT file.

    Arguments:
        args: Command-line arguments; passed through `initialise`, after
            which `args.input` names the file to load from `data/`.
    """

    # Definitions
    # NOTE(review): `histstyle` is not used in the visible part of this function.
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input) #, test=True) # 

    # NOTE(review): `knn_eff` is not defined in this function; presumably a
    # module-level name -- confirm, otherwise this line raises NameError.
    outFile = ROOT.TFile.Open("figures/knn_jet_ungrtrk500_eff{}_data.root".format(knn_eff),"RECREATE")


    # With EFF = 0.5 the percentile measured below is the 99.5th
    EFF = 0.5
    VAR = 'jet_ungrtrk500'
    VARX = 'dijetmass'
    # NOTE(review): `FIT_RANGE` is not used in the visible part of this function.
    FIT_RANGE = (0, 6000) # Necessary?

    #eff_sig = 0.50
    #fpr, tpr, thresholds = roc_curve(data['signal'], data[kNN_basevar], sample_weight=data['weight'])
    #idx = np.argmin(np.abs(tpr - eff_sig))
    #print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., (fpr[idx]) * 100., kNN_basevar, thresholds[idx]) #changed from 1-fpr[idx]
    #print "Chosen target efficiency: {:.2f}%".format(kNN_eff)


    weight = 'weight'  # 'weight_test' / 'weight'
    # 20 edges, i.e. 19 coarse bins in which the percentile is measured
    bins_mjj = np.linspace(100, 8000, 20)
    # Fine grid on which the fitted regressor is evaluated
    fineBins = np.linspace(100, 8000, 7900)
    fineBinsRe = fineBins.reshape(-1,1)

    # One percentile per coarse bin, using rows with `signal == 0` only;
    # bins with 20 or fewer entries get 0 instead
    percs = []
    for i in range(1, len(bins_mjj)):
        
        msk = (data[VARX] > bins_mjj[i-1]) & (data[VARX] <= bins_mjj[i]) & (data['signal']==0) 

        if np.sum(msk) > 20:  # Ensure sufficient statistics for meaningful percentile. Was 20
            percs.append( wpercentile(data=data.loc[msk, VAR].values, percents=100-EFF, weights=data.loc[msk, weight].values) )#wpercentile
            
        else:
            percs.append(0)

    print "Length of percs: ", len(percs), percs

    # Drop the last bin, then pair each remaining percentile with the upper
    # edge of its bin: `X` holds edges 1..18, `percs` holds bins 0..17
    percs = percs[0:-1]
    bins_mjj = bins_mjj[0:-1]
    
    X = bins_mjj.reshape(-1,1)
    X = X[1:len(bins_mjj)]


    print len(X), len(percs)

    # Fit parameters
    # NOTE(review): these three settings are not used in the visible part of
    # this function; the regressor below is built with literal arguments.
    knn_neighbors = 2
    knn_weights = 'uniform'
    fit_deg = 1

    # NOTE(review): the fit uses weights='distance' while the plot label below
    # says "uniform" -- confirm which one is intended.
    knn = KNeighborsRegressor(n_neighbors=5, weights='distance') 
    y_knn = knn.fit(X, percs).predict(fineBinsRe)
    
    c = rp.canvas(batch=True)
    knnFit = c.plot(y_knn, bins=fineBins, linecolor=ROOT.kRed+2, linewidth=2, linestyle=1, label="knn fit, uniform", option='L')

    # The path holds no replacement fields, so the `.format` arguments are ignored
    c.save('figures/distributions/percentile_test.pdf'.format(EFF, args.input))           

    # Store the fitted curve in the ROOT file under the name "kNNfit"
    outFile.cd()
    knnFit.SetName("kNNfit")
    knnFit.Write()
    outFile.Close()

    # NOTE(review): the string literal opened below is cut off in this
    # excerpt; the rest of the function is not visible.
    """
Example #13
0
def jetmasscomparison(data, args, features, eff_sig=25):
    """
    Perform study of jet mass distributions before and after subtructure cut for
    different substructure taggers.

    Saves plot `figures/jetmasscomparison__eff_sig_[eff_sig].pdf`

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        features: Features for which to plot signal- and background distributions.
        eff_sig: Signal efficiency at which to impose cut.
    """

    # Define masks and direction-dependent cut value
    msk_sig = data['sigType'] == 1
    cuts, msks_pass = dict(), dict()
    lead_features = []

    print "Features: ", features

    for feat in features:
        eff_cut = eff_sig if signal_low(feat) else 100 - eff_sig

        if (not 'lead' in feat) and (not 'sub' in feat):
            print "hej"

            cut = wpercentile(data.loc[msk_sig, feat].values,
                              eff_cut,
                              weights=data.loc[msk_sig, 'weight'].values)
            msk = (data[feat] > cut)

            fpr, tpr, thresholds = roc_curve(data['signal'],
                                             data[feat],
                                             sample_weight=data['weight'])
            idx = np.argmin(np.abs(tpr - eff_sig / 100.))

            print "Pass criteria:", feat, " > ", cut
            print "Background acceptance @ {:.2f}% sig. eff.: {:.5f}% ({} > {:.2f})".format(
                eff_sig, (fpr[idx]) * 100., feat, thresholds[idx])

            msks_pass[feat] = msk
            lead_features.append(feat)

        else:

            if 'lead' in feat:
                cut1 = wpercentile(data.loc[msk_sig, feat].values,
                                   eff_cut,
                                   weights=data.loc[msk_sig, 'weight'].values)
                msk1 = (data[feat] > cut1)

                fpr, tpr, thresholds = roc_curve(data['signal'],
                                                 data[feat],
                                                 sample_weight=data['weight'])
                idx = np.argmin(np.abs(tpr - eff_sig / 100.))

                print "H Pass criteria:", feat, " > ", cut1
                print "H Background acceptance @ {:.2f}% sig. eff.: {:.6f}% ({} > {:.2f})".format(
                    eff_sig, (fpr[idx]) * 100., feat, thresholds[idx])

                lead_features.append(feat)

                subfeat = feat.replace("lead", "sub")
                data1 = data[msk1]
                cut2 = wpercentile(data1.loc[msk_sig, subfeat].values,
                                   eff_cut,
                                   weights=data1.loc[msk_sig, 'weight'].values)
                fpr, tpr, thresholds = roc_curve(data1['signal'],
                                                 data1[subfeat],
                                                 sample_weight=data1['weight'])

                idx = np.argmin(np.abs(tpr - eff_sig / 100.))
                idy = np.argmin(np.abs(thresholds - cut1))

                print "H Pass criteria:", subfeat, " > ", cut2, idy, len(
                    thresholds)
                print "H Background acceptance @ {:.5f}% sig. eff.: {:.5f}% ({} > {:.5f})".format(
                    (tpr[idy]) * 100, (fpr[idy]) * 100., subfeat,
                    thresholds[idy])

                #msks_pass[feat]=(data[feat]>cut1) | (data[subfeat]>cut1)
                msks_pass[feat] = (data[feat] > cut1) & (data[subfeat] > cut1)

        # Ensure correct cut direction
        if signal_low(feat):
            msks_pass[feat] = ~msks_pass[feat]
            pass
        pass

    # Perform plotting
    #c = plot(data, args, features, msks_pass, eff_sig)

    # Perform plotting on individual figures
    c = plot_individual(data, args, lead_features, msks_pass, eff_sig)

    # Output
    path = 'figures/jetmasscomparison__eff_sig_{:d}_{}.pdf'.format(
        int(eff_sig), MODEL)
    path = 'figures/jetmasscomparison__eff_sig_{:d}_{}.eps'.format(
        int(eff_sig), MODEL)

    return c, args, path