Beispiel #1
0
def main(args):
    """
    Evaluate AdaBoost/uBoost classifiers in parallel and plot learning curves.

    The staged outputs returned by `compute` for each classifier are stored
    in `data` as columns named `<classifier name>__<stage>`, after which
    `plot` is called on the augmented dataset.

    Arguments:
        args: Command-line arguments, passed through `initialise`.

    Returns:
        0 on completion.
    """

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', sample=0.01)  # @TEMP

    # Define classifier configuration(s) as (title, name) pairs, one per
    # uniforming rate. A rate of zero is labelled as plain AdaBoost.
    pattern = 'uboost_ur_{:4.2f}_te_92_rel21_fixed'
    urs = sorted([0.0, 0.01, 0.1, 0.3])
    classifiers = list()
    for ur in urs:
        if ur == 0:
            title = 'AdaBoost'
        else:
            title = 'uBoost (#alpha={:4.2f})'.format(ur)
        classifiers.append((title, pattern.format(ur).replace('.', 'p')))

    # Compute classifiers variables in parallel
    njobs = min(7, len(classifiers))
    with Profile("Run tests in parallel"):
        tasks = (delayed(compute)(data, name) for _, name in classifiers)
        ret = Parallel(n_jobs=njobs)(tasks)

    # Add classifier variables to data, one column per boosting stage
    for name, staged_series in ret:
        for stage, series in enumerate(staged_series):
            column = '{:s}__{:d}'.format(name, stage)
            data[column] = series

    # Plot learning curves
    plot(data, urs, classifiers)

    return 0
Beispiel #2
0
def plot(data, urs, classifiers):
    """
    Plot the per-epoch log-loss of each uBoost classifier next to AdaBoost.

    For every entry in `classifiers` not titled 'AdaBoost', the weighted
    log-loss is computed for each staged output column, separately for the
    training and testing parts of `data`, both for that classifier and for
    the reference classifier. The four curves are drawn on one canvas and
    saved as `figures/loss_uboost__alpha<rate>.pdf`.

    Arguments:
        data: Dataset (indexed like a pandas.DataFrame) holding the columns
            `train`, `signal` and `weight_adv`, and the staged classifier
            outputs in columns named `<name>__<epoch>`.
        urs: Uniforming rates, one for each entry in `classifiers`.
        classifiers: List of (title, name) pairs. The first entry is assumed
            to be the AdaBoost reference.
    """

    # Plotting learning process
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    with Profile("Plotting learning process"):

        # The training/test split masks, targets and weights are the same for
        # all classifiers, so they are computed once rather than per loop.
        msk_train = data['train'] == 1
        msk_test = data['train'] == 0

        y_train = data.loc[msk_train, 'signal'].values.flatten()
        y_test = data.loc[msk_test, 'signal'].values.flatten()
        w_train = data.loc[msk_train, 'weight_adv'].values.flatten()
        w_test = data.loc[msk_test, 'weight_adv'].values.flatten()

        def staged_losses(prefix, nb_epochs):
            """
            Return the per-epoch (training, testing) log-loss lists for the
            classifier whose staged outputs are in `<prefix>__<epoch>`.
            """
            ll_train, ll_test = list(), list()
            for epoch in range(nb_epochs):
                col = '{:s}__{:d}'.format(prefix, epoch)
                ll_train.append(
                    log_loss(y_train,
                             data.loc[msk_train, col],
                             sample_weight=w_train))
                ll_test.append(
                    log_loss(y_test,
                             data.loc[msk_test, col],
                             sample_weight=w_test))
            return ll_train, ll_test

        for alpha, (title, name) in zip(urs, classifiers):

            # Compare by value: identity (`is`) of equal strings is an
            # implementation detail and must not be relied upon.
            if title == 'AdaBoost':
                continue
            print("=== {} {}".format(name, title))

            # Number of epochs = number of staged output columns for this
            # classifier. Matching on the `__` separator as well avoids
            # counting unrelated columns that merely share the name prefix.
            nb_epochs = sum(1 for col in data.columns
                            if col.startswith(name + '__'))
            x = np.arange(nb_epochs)

            # Compute log-loss for each epoch
            name_ab = classifiers[0][1]  # Assuming `AdaBoost` is first classifier
            ll_ab_train, ll_ab_test = staged_losses(name_ab, nb_epochs)
            ll_ub_train, ll_ub_test = staged_losses(name, nb_epochs)

            # Plot log-loss curves
            c = rp.canvas(batch=True)

            # -- Common plotting options
            opts = dict(linewidth=2, legend_option='L')
            c.graph(ll_ab_train,
                    bins=x,
                    linecolor=rp.colours[5],
                    linestyle=1,
                    option='AL',
                    label='AdaBoost',
                    **opts)
            c.graph(ll_ab_test,
                    bins=x,
                    linecolor=rp.colours[5],
                    linestyle=2,
                    option='L',
                    **opts)
            c.graph(ll_ub_train,
                    bins=x,
                    linecolor=rp.colours[1],
                    linestyle=1,
                    option='L',
                    label='uBoost',
                    **opts)
            c.graph(ll_ub_test,
                    bins=x,
                    linecolor=rp.colours[1],
                    linestyle=2,
                    option='L',
                    **opts)

            # -- Decorations
            c.pad()._yaxis().SetNdivisions(505)
            c.xlabel("Training epoch")
            c.ylabel("BDT classifier loss")
            c.xlim(0, len(x))
            c.ylim(0.3, 1.4)
            c.legend(width=0.28)
            c.legend(header='Dataset:',
                     categories=[('Training', {
                         'linestyle': 1
                     }), ('Testing', {
                         'linestyle': 2
                     })],
                     width=0.28,
                     ymax=0.69)

            for leg in c.pad()._legends:
                leg.SetFillStyle(0)

            # Two decimals, as in the file name below: with one decimal a
            # uniforming rate of 0.01 would be shown as 0.0.
            c.text([
                "#sqrt{s} = 13 TeV", "#it{W} jet tagging",
                "Uniforming rate #alpha = {:4.2f}".format(alpha)
            ],
                   qualifier="Simulation Internal")

            # -- Save
            c.save('figures/loss_uboost__alpha{:4.2f}'.format(alpha).replace(
                '.', 'p') + '.pdf')

    return
def main(args):
    """
    Add NN, ANN and MV2c10 tagger variables to the test data and run the
    performance studies.

    Arguments:
        args: Command-line arguments, passed through `initialise`.

    Returns:
        0 on completion.
    """

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    # Imported here rather than at module level, i.e. only after
    # `initialise_backend` has been called.
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    def meaningful_digits(number):
        """
        Format `number` in fixed-point notation with ceil(-log10(number))
        decimals, and with no decimals for numbers >= 1 or <= 0.
        """
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    # `lambda_reg` selects the entry of `lambda_regs` used as the nominal ANN
    # tagger variable; it has to be one of the scanned values, since it is
    # looked up with `list.index` below.
    lambda_reg = 100.
    lambda_regs = sorted([100.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        # File-name friendly string: decimal point replaced by 'p'
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        # Variable name, with the decimal point restored
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    print "ann_var"
    print ann_var

    # Tagger feature collection
    # tagger_features = ['NN', ann_var]
    tagger_features = ['NN', ann_var, 'MV2c10', 'XbbScoreHiggs']
    # tagger_features = ['MV2c10']

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            # For each scanned lambda, the weights are loaded into `combined`
            # and `classifier` is then evaluated.
            # NOTE(review): presumably `combined` shares its layers with
            # `classifier`, so that loading the combined weights also updates
            # the classifier -- confirm against `combined_model`.
            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # MV2c10: row-wise minimum of the two `MV2c10_discriminant_*` columns
        with Profile("MV2c10"):
            data["MV2c10"] = pd.concat(
                [data["MV2c10_discriminant_1"], data["MV2c10_discriminant_2"]],
                axis=1).min(axis=1)

        # Add MV2 and XbbScore here
        # e.g. min(MV2_sj1, MV2_sj2)
        # NOTE(review): 'XbbScoreHiggs' is listed in `tagger_features` but is
        # not added in this function; presumably it is already a column of the
        # loaded data -- confirm.

    # Remove unused variables
    # NOTE(review): assuming `data` is a pandas.DataFrame, `drop` returns a new
    # frame and leaves `data` unchanged unless `inplace=True` is given. The
    # result is discarded here, so no columns are actually removed. Before
    # making the drop effective, check that `used_variables` lists every
    # column needed by `perform_studies`.
    used_variables = set(tagger_features + ann_vars +
                         ['mass', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars)

    return 0
Beispiel #4
0
def main(args):
    """
    Train uBoost classifiers for a set of uniforming rates and save them.

    One classifier is fitted per uniforming rate, in parallel threads, on the
    training part of the dataset. Each fitted classifier is pickled to a
    gzip-compressed file under `models/uboost/`.

    Arguments:
        args: Command-line arguments, passed through `initialise`.

    Returns:
        0 on completion.
    """

    # Initialising
    # --------------------------------------------------------------------------
    args, cfg = initialise(args)

    # Loading data
    # --------------------------------------------------------------------------
    data, features, _ = load_data(args.input + 'data_1M_10M.h5')
    #data = data.sample(frac=0.5, random_state=32)  # @TEMP
    data = data[data['train'] == 1]

    # Reduce size of data
    # `drop` returns a new frame rather than modifying `data`, so the result
    # has to be assigned back for the reduction to take effect. The set of
    # columns to keep is built once instead of once per column.
    keep_features = set(features + ['m', 'signal', 'weight_adv'])
    drop_features = [feat for feat in list(data) if feat not in keep_features]
    data = data.drop(drop_features, axis=1)

    cfg['uBoost']['train_features'] = features
    cfg['uBoost']['random_state'] = SEED
    cfg['DecisionTreeClassifier']['random_state'] = SEED

    # Arrays
    X = data
    w = np.array(data['weight_adv']).flatten()
    y = np.array(data['signal']).flatten()

    # Fit uBoost classifier
    # --------------------------------------------------------------------------
    with Profile("Fitting uBoost classifier"):

        # @NOTE: There might be an issue with the sample weights, because the
        #        local efficiencies computed using kNN do not seem to take the
        #        sample weights into account.
        #
        #        See:
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/uboost.py#L247-L248
        #        and
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/metrics_utils.py#L159-L176
        #        with `divided_weights` not set.
        #
        #        `sample_weight` seems to be used only as a starting point for
        #        the boosting, and so not for the efficiency calculation.
        #
        #        If this is indeed the case, it would be possible to simply
        #        sample MC events by their weight, and use `sample_weight = 1`
        #        for all samples passed to uBoost.
        #
        # @NOTE: I have gotten less sure of the above, so probably no panic.

        def train_uBoost(X, y, w, cfg, uniforming_rate):
            """
            Fit and return a uBoost classifier with the given uniforming rate.

            The base estimator and the uBoost classifier are configured from
            `cfg['DecisionTreeClassifier']` and `cfg['uBoost']`, respectively.
            """

            # Create base classifier
            base_tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

            # Update training configuration. A copy is used, so that the
            # shared `cfg` is not modified by the concurrent jobs.
            these_cfg = dict(**cfg['uBoost'])
            these_cfg['uniforming_rate'] = uniforming_rate

            # Create uBoost classifier
            uboost = uBoostBDT(base_estimator=base_tree, **these_cfg)

            # Fit uBoost classifier
            uboost.fit(X, y, sample_weight=w)

            return uboost

        #uniforming_rates = [0.0, 0.01, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0]
        uniforming_rates = [0.0, 0.01, 0.1, 0.3, 0.5, 1.0]
        #uniforming_rates = [0.5, 1.0]
        n_jobs = min(7, len(uniforming_rates))  # ...(10, ...

        jobs = [
            delayed(train_uBoost, check_pickle=False)(X, y, w, cfg,
                                                      uniforming_rate)
            for uniforming_rate in uniforming_rates
        ]

        result = Parallel(n_jobs=n_jobs, backend="threading")(jobs)

    # Saving classifiers
    # --------------------------------------------------------------------------
    # Ensure model directory exists
    mkdir('models/uboost/')

    # The target efficiency is the same for all classifiers
    suffix_te = "te_{:d}".format(int(cfg['uBoost']['target_efficiency'] * 100))

    for uboost, uniforming_rate in zip(result, uniforming_rates):
        with Profile("Saving classifiers"):

            suffix_ur = "ur_{:s}".format(
                ("%.2f" % uniforming_rate).replace('.', 'p'))

            # Save uBoost classifier
            path = 'models/uboost/uboost_{}_{}_rel21_fixed_def_cfg_1000boost.pkl.gz'.format(
                suffix_ur, suffix_te)
            with gzip.open(path, 'w') as f:
                pickle.dump(uboost, f)

    return 0
Beispiel #5
0
def main(args):
    """
    Plot dijet-mass (m_jj) distributions and print selection efficiencies.

    The k-NN variables for the leading and sub-leading jet are added to the
    dataset, after which the m_jj distributions of the background and of the
    signal chosen by `signal_to_plot` are drawn for the inclusive selection,
    for the k-NN selection and for the selection on the raw track
    multiplicity. The corresponding efficiencies are printed, and the
    k-NN-selected histograms are written to a ROOT file under `figures/`.

    Arguments:
        args: Command-line arguments, passed through `initialise`.
    """

    # Definitions
    # `dict(...)` makes a shallow copy, so the label assignments further down
    # also update the nested style dicts of the module-level `HISTSTYLE`.
    # (`dict(**HISTSTYLE)` is avoided, since non-string keys are rejected as
    # keyword arguments by Python 3.)
    histstyle = dict(HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(
        'data/djr_LCTopo_2.h5')  # + args.input) #, test=True) #

    knnCut = 0
    ntrkCut = 50
    emfracCut = 0.65
    scale = 139 * 1000000  # (inverse nanobarn)
    signal_to_plot = 7

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 2 TeV',
        2: 'Model A, m = 1 TeV',
        3: 'Model A, m = 1.5 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    # Legend text for the selected signal, so that the labels always agree
    # with `signal_to_plot`.
    sig_label = sigDict[signal_to_plot]

    outHistFile = ROOT.TFile.Open(
        "figures/mjjHistograms_kNN{}_eff{}.root".format(knnCut, kNN_eff),
        "RECREATE")

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sig_label)

    # Add knn variables
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    knn_path = 'models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, base_var, kNN_eff,
                                                      sigModel)

    print(data.shape)

    with Profile("Add variables"):
        print("k-NN base variable: {} (cp. {})".format(base_var, kNN_var))
        print(knn_path)

        # The same model file is used for the leading and sub-leading jet
        for prefix in ['lead_', 'sub_']:
            add_knn(data, newfeat=prefix + kNN_var, path=knn_path)

    weight = 'weight'  # 'weight_test' / 'weight'
    bins_pt = np.linspace(450, 3500, 40)
    bins_mjj = np.linspace(0, 8000, 80)

    # Useful masks
    msk_bkg = data['signal'] == 0
    if signal_to_plot == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == signal_to_plot

    # -- Selections requiring both jets to pass
    msk_knn = (data['lead_knn_ungrtrk500'] >
               knnCut) & (data['sub_knn_ungrtrk500'] > knnCut)
    msk_ungr = (data['lead_jet_ungrtrk500'] >
                ntrkCut) & (data['sub_jet_ungrtrk500'] > ntrkCut)
    msk_emfrac = (data['lead_jet_EMFrac'] <
                  emfracCut) & (data['sub_jet_EMFrac'] < emfracCut)

    # -- Selections requiring only the leading jet to pass
    msk_knn_1 = (data['lead_knn_ungrtrk500'] > knnCut)
    msk_ungr_1 = (data['lead_jet_ungrtrk500'] > ntrkCut)

    msk_ntrkBkg = msk_ungr & msk_emfrac & msk_bkg
    msk_ntrkSig = msk_ungr & msk_emfrac & msk_sig

    msk_knnBkg = msk_knn & msk_bkg
    msk_knnSig = msk_knn & msk_sig

    msk_ntrkBkg1 = msk_ungr_1 & msk_bkg
    msk_ntrkSig1 = msk_ungr_1 & msk_sig
    msk_knnBkg1 = msk_knn_1 & msk_bkg
    msk_knnSig1 = msk_knn_1 & msk_sig

    msk_inclBkg = msk_bkg
    msk_inclSig = msk_sig

    def mjj_hist(canvas, msk, bins, lumi_scale=None, **kwargs):
        """
        Draw the weighted dijet-mass histogram of the events selected by
        `msk` on `canvas` and return it. If `lumi_scale` is given, the event
        weights are multiplied by it. Any further keyword arguments are
        passed on to `canvas.hist`.
        """
        weights = data.loc[msk, weight].values
        if lumi_scale is not None:
            weights = lumi_scale * weights
        return canvas.hist(data.loc[msk, 'dijetmass'].values,
                           bins=bins,
                           weights=weights,
                           **kwargs)

    # Mjj dist with cut on ntrk, ungrtrk compared to inclusive selection
    # -- Background
    c = rp.canvas(batch=True)
    hist_inclBkg = mjj_hist(c, msk_inclBkg, bins_mjj, scale,
                            label="Multijets, Inclusive",
                            normalise=True,
                            linecolor=ROOT.kGreen + 2,
                            linewidth=3)
    hist_knnBkg = mjj_hist(
        c, msk_knnBkg, bins_mjj, scale,
        label="Multijets, n_{{trk}}^{{#epsilon}}>{}".format(knnCut),
        normalise=True,
        linecolor=ROOT.kMagenta + 2,
        linestyle=2,
        linewidth=3)
    hist_ntrkBkg = mjj_hist(c, msk_ntrkBkg, bins_mjj, scale,
                            label="Multijets, n_{{trk}}>{}".format(ntrkCut),
                            normalise=True,
                            linecolor=ROOT.kOrange + 2,
                            linestyle=2,
                            linewidth=3)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    for ext in ['pdf', 'eps']:
        c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.{}'.format(
            ntrkCut, knnCut, FIT, ext))

    del c

    # -- Signal
    c = rp.canvas(batch=True)
    hist_Sig = mjj_hist(c, msk_sig, bins_mjj,
                        label="{}, inclusive".format(sig_label),
                        normalise=True,
                        linecolor=ROOT.kGreen + 2)
    hist_knnSig = mjj_hist(
        c, msk_knnSig, bins_mjj,
        label="{}, #it{{n}}_{{trk}}^{{#epsilon}}>{}".format(sig_label, knnCut),
        normalise=True,
        linecolor=ROOT.kMagenta + 2,
        linestyle=2)
    hist_ntrkSig = mjj_hist(
        c, msk_ntrkSig, bins_mjj,
        label="{}, #it{{n}}_{{trk}}>{}".format(sig_label, ntrkCut),
        normalise=True,
        linecolor=ROOT.kOrange + 2,
        linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    for ext in ['pdf', 'eps']:
        c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.{}'.format(
            ntrkCut, knnCut, FIT, ext))

    del c

    # -- Un-normalised signal and background after selection. The k-NN
    #    histograms filled here are the ones written to file below.
    c = rp.canvas(batch=True)
    hist_knnSig = mjj_hist(c, msk_knnSig, bins_mjj,
                           label="{}, knn_ntrk>{}".format(sig_label, knnCut),
                           normalise=False,
                           linecolor=ROOT.kBlue + 1,
                           linestyle=1)
    hist_knnBkg = mjj_hist(c, msk_knnBkg, bins_mjj, scale,
                           label="Multijets, knn_ntrk>{}".format(knnCut),
                           normalise=False,
                           linecolor=ROOT.kMagenta + 2,
                           linestyle=2)
    hist_ntrkBkg = mjj_hist(c, msk_ntrkBkg, bins_mjj, scale,
                            label="Multijets, ntrk>{}".format(ntrkCut),
                            normalise=False,
                            linecolor=ROOT.kOrange + 2,
                            linestyle=2)

    c.legend(width=0.4, xmin=0.3, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    for ext in ['pdf', 'eps']:
        c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.{}'.format(
            knnCut, FIT, ext))

    bins_mjj = np.linspace(0, 10000, 50)

    # Un-normalised histograms for calculating efficiencies. These are filled
    # on the canvas saved above and are not part of any saved figure.
    # All background histograms carry the same luminosity scale, so that
    # ratios of their integrals are meaningful.
    hist_inclBkg = mjj_hist(c, msk_inclBkg, bins_mjj, scale, normalise=False)
    hist_inclSig = mjj_hist(c, msk_inclSig, bins_mjj, normalise=False)
    hist_ntrkSig = mjj_hist(c, msk_ntrkSig, bins_mjj, normalise=False)
    hist_knnSig = mjj_hist(c, msk_knnSig, bins_mjj, normalise=False)
    hist_ntrkSig1 = mjj_hist(c, msk_ntrkSig1, bins_mjj, normalise=False)
    hist_ntrkBkg1 = mjj_hist(c, msk_ntrkBkg1, bins_mjj, scale, normalise=False)
    hist_knnBkg1 = mjj_hist(c, msk_knnBkg1, bins_mjj, scale, normalise=False)
    hist_knnSig1 = mjj_hist(c, msk_knnSig1, bins_mjj, normalise=False)

    def eff_entries(hist_pass, hist_incl):
        """Return the efficiency in percent, from effective entries."""
        return (100 * hist_pass.GetEffectiveEntries() /
                hist_incl.GetEffectiveEntries())

    def eff_integral(hist_pass, hist_incl):
        """Return the efficiency in percent, from histogram integrals."""
        return 100 * hist_pass.Integral() / hist_incl.Integral()

    print("Bkg inclusive integral:  {}".format(
        hist_inclBkg.GetEffectiveEntries()))
    print("Sig inclusive integral:  {}".format(
        hist_inclSig.GetEffectiveEntries()))

    print("Bkg pass kNN eff entries / integral:  {} {}".format(
        hist_knnBkg.GetEffectiveEntries(), hist_knnBkg.Integral()))
    print("Sig pass kNN eff entries / integral:  {} {}".format(
        hist_knnSig.GetEffectiveEntries(), hist_knnSig.Integral()))

    print("Bkg pass ntrk eff entries / integral:  {} {}".format(
        hist_ntrkBkg.GetEffectiveEntries(), hist_ntrkBkg.Integral()))
    print("Sig pass ntrk eff entries / integral:  {} {}".format(
        hist_ntrkSig.GetEffectiveEntries(), hist_ntrkSig.Integral()))

    print("Bkg Eff. knn_ntrk> {}, eff. entries:  {}".format(
        knnCut, eff_entries(hist_knnBkg, hist_inclBkg)))
    print("Sig Eff. knn_ntrk> {}, eff. entries:  {}".format(
        knnCut, eff_entries(hist_knnSig, hist_inclSig)))

    print("Bkg Eff. knn_ntrk> {}, integral:  {}".format(
        knnCut, eff_integral(hist_knnBkg, hist_inclBkg)))
    print("Sig Eff. knn_ntrk> {}, integral:  {}".format(
        knnCut, eff_integral(hist_knnSig, hist_inclSig)))

    print("Bkg Eff. ntrk>{}, eff. entries:  {}".format(
        ntrkCut, eff_entries(hist_ntrkBkg, hist_inclBkg)))
    print("Sig Eff. ntrk>{}, eff. entries:  {}".format(
        ntrkCut, eff_entries(hist_ntrkSig, hist_inclSig)))

    print("Bkg Eff. 1 jet knn_ntrk> {}, eff. entries:  {}".format(
        knnCut, eff_entries(hist_knnBkg1, hist_inclBkg)))
    print("Sig Eff. 1 jet knn_ntrk> {}, eff. entries:  {}".format(
        knnCut, eff_entries(hist_knnSig1, hist_inclSig)))

    # These two are labelled as integral-based and therefore use the
    # integrals, not the effective entries printed just above.
    print("Bkg Eff. 1 jet knn_ntrk> {}, integral:  {}".format(
        knnCut, eff_integral(hist_knnBkg1, hist_inclBkg)))
    print("Sig Eff. 1 jet knn_ntrk> {}, integral:  {}".format(
        knnCut, eff_integral(hist_knnSig1, hist_inclSig)))

    outHistFile.cd()
    hist_knnBkg.SetName("bkg_knn")
    hist_knnSig.SetName("sig_knn")
    hist_knnBkg.Write()
    hist_knnSig.Write()
    outHistFile.Close()

    # Mjj dist for CR compared to inclusive selection: disabled. (This used to
    # be followed by an unterminated triple-quoted block.)
def main(args):
    """
    Plot the measured and fitted k-NN profiles and the selection efficiencies.

    The measured profile is filled from the background events and the fitted
    profile from the predictions of the stored k-NN regressor on a re-binned
    grid; both are passed to `plot`. Afterwards, for signal and background
    separately, the efficiency of requiring the k-NN variable to be negative
    is computed in bins of (`VARX`, `VARY`) and saved as a 2-D map under
    `figures/knn/`.

    Arguments:
        args: Command-line arguments, passed through `initialise`.
    """

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # Fill measured profile
    profile_meas, _ = fill_profile(data[msk_bkg])

    # Add k-NN variable
    # NOTE(review): this path formats `EFF` with `{}`, whereas the classifier
    # below is loaded using `{:.0f}`. The two agree only if `EFF` is an
    # integer -- confirm.
    knnfeat = 'knn'
    add_knn(data,
            newfeat=knnfeat,
            path='models/knn/knn_{}_{}.pkl.gz'.format(VAR, EFF))

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        fine_edges, fine_centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            fine_edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            fine_centres[ax] = fine_edges[ax][:-1] + 0.5 * np.diff(
                fine_edges[ax])

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(fine_centres['x'], fine_centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        prediction = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(fine_edges['x']) - 1,
                                fine_edges['x'].flatten('C'),
                                len(fine_edges['y']) - 1,
                                fine_edges['y'].flatten('C'))
        root_numpy.array2hist(prediction, profile_fit)

    # Plotting
    with Profile("Plotting"):
        for fitted in [False, True]:

            # Select correct profile
            profile = profile_fit if fitted else profile_meas

            # Plot
            plot(profile, fitted)

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Events passing the selection. This mask is the same for all bins and
    #    both classes, and it is computed on the full dataset so that it
    #    shares its index with the class and bin masks it is combined with.
    msk_pass = data[knnfeat] < 0

    # -- Blue colour gradient, shared by the signal and background maps
    blues = [(247 / 255., 251 / 255., 255 / 255.),
             (222 / 255., 235 / 255., 247 / 255.),
             (198 / 255., 219 / 255., 239 / 255.),
             (158 / 255., 202 / 255., 225 / 255.),
             (107 / 255., 174 / 255., 214 / 255.),
             (66 / 255., 146 / 255., 198 / 255.),
             (33 / 255., 113 / 255., 181 / 255.),
             (8 / 255., 81 / 255., 156 / 255.),
             (8 / 255., 48 / 255., 107 / 255.)]

    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        if sig:
            rgbs = blues
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            # An additional red colour is put at the lower end of the range,
            # with the blue gradient starting at `EFF` percent.
            rgbs = [(255 / 255., 51 / 255., 4 / 255.)] + blues
            nb_cols = len(rgbs)
            stops = np.array([0] + list(
                np.linspace(0, 1, nb_cols - 1, endpoint=True) *
                (1. - EFF / 100.) + EFF / 100.))

        red, green, blue = map(np.array, zip(*rgbs))

        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue,
                                             NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [
            np.linspace(AXIS[var][1],
                        AXIS[var][2],
                        AXIS[var][0] + 1,
                        endpoint=True) for var in VARS
        ]

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        # The upper bin edge is inclusive, as for the 2-D bins below.
        effs = list()
        for low, high in zip(bins[1][:-1], bins[1][1:]):
            msk_sel = msk & (data[VARY] > low) & (data[VARY] <= high)
            num = data.loc[msk_sel & msk_pass, 'weight_test'].values.sum()
            den = data.loc[msk_sel, 'weight_test'].values.sum()
            effs.append(num / den)

        # Fill profile
        for i, j in itertools.product(*map(range, shape)):

            # Events of the current class within the bin edges in x and y
            msk_sel = msk
            for idx, axis_bins, var in zip([i, j], bins, VARS):
                low, high = axis_bins[idx:idx + 2]
                msk_sel = msk_sel & (data[var] > low) & (data[var] <= high)

            # Set non-zero bin content
            if msk_sel.any():
                num = data.loc[msk_sel & msk_pass, 'weight_test'].values.sum()
                den = data.loc[msk_sel, 'weight_test'].values.sum()
                profile.SetBinContent(i + 1, j + 1, num / den)

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARX, ROOT=True) +
                                    " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" %
                                    (latex(VAR, ROOT=True), EFF))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"],
               ATLAS=False)

        # -- Efficiencies, drawn at the centre of each y-axis bin. Only the
        #    label in the first bin carries the symbol prefix.
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        prefix = "#bar{#varepsilon}^{rel}_{%s} = " % ('sig' if sig else 'bkg')
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(
                xt, yt,
                "%s%.1f%%" % (prefix if ibin == 1 else '', eff * 100.))

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",
                -4.5,
                BOUNDS[0].Eval(-4.5) + 30,
                align=21,
                angle=-37,
                textsize=13,
                textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV",
                -2.5,
                BOUNDS[1].Eval(-2.5) - 30,
                align=23,
                angle=-57,
                textsize=13,
                textcolor=ROOT.kGray + 3)

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            'sig' if sig else 'bkg', VAR, EFF))

    return
def perform_studies(data, args, tagger_features, extracted_features,
                    title=None):
    """
    Run the currently enabled performance studies.

    Only the summary-plot study is active: `studies.summary` is called once
    for every (mass-cut, pT-range) combination defined below, always with the
    same (initially empty) `scan_features` dict. All other studies are kept
    as commented-out calls so that they can be switched back on.

    Arguments:
        data: Forwarded unchanged to the study.
        args: Forwarded unchanged to the study.
        tagger_features: Forwarded unchanged to the study.
        extracted_features: Only referenced by studies that are currently
            commented out; the active code does not use it.
        title: Forwarded to the study as the `title` keyword.

    Returns:
        None.
    """
    # Scan configuration
    masscuts = [False]  # alternative: [True, False]
    pt_ranges = [None, (200, 500), (500, 1000), (1000, 2000)]
    # alternatives: [(1000, 2000)] or [None]

    # -- Disabled: combined robustness study
    # with Profile("Study: Robustness"):
    #     for masscut in masscuts:
    #         studies.robustness_full(data, args, tagger_features, masscut=masscut, title=title)

    # -- Disabled: jet mass distribution comparison study
    # with Profile("Study: Jet mass comparison"):
    #     for pt_range in pt_ranges:
    #         print "pt_range =", pt_range
    #         studies.jetmasscomparison(data, args, tagger_features, pt_range, title=title)

    # -- Active: summary plot study
    with Profile("Study: Summary plot"):
        scan_features = {}

        # Mass cut is the outer (slow) index, pT range the inner (fast) one
        for masscut in masscuts:
            for pt_range in pt_ranges:
                selection = dict(masscut=masscut,
                                 pt_range=pt_range,
                                 title=title)
                studies.summary(data, args, tagger_features, scan_features,
                                **selection)

    # -- Disabled: distributions study
    # with Profile("Study: Substructure tagger distributions"):
    #     mass_ranges = np.linspace(50, 300, 5 + 1, endpoint=True)
    #     mass_ranges = [None] + zip(mass_ranges[:-1], mass_ranges[1:])
    #     # (alternatively loop over tagger_features instead of extracted_features)
    #     for feat, pt_range, mass_range in itertools.product(extracted_features, pt_ranges, mass_ranges):
    #         studies.distribution(data, args, feat, pt_range, mass_range, title=title)

    # -- Disabled: ROC study
    # with Profile("Study: ROC"):
    #     # masscuts = [(65,105)]
    #     # pt_ranges = [(None), (300,500), (1000,1500)]
    #     for masscut, pt_range in itertools.product(masscuts, pt_ranges):
    #         studies.roc(data, args, tagger_features, masscut=masscut, pt_range=pt_range, title=title)

    # -- Disabled: JSD study
    # with Profile("Study: JSD"):
    #     for pt_range in pt_ranges:
    #         studies.jsd(data, args, tagger_features, pt_range, title=title)

    # -- Disabled: efficiency study
    # with Profile("Study: Efficiency"):
    #     # (alternatively loop over tagger_features)
    #     for feat in extracted_features:
    #         studies.efficiency(data, args, feat, title=title)

    return None
Beispiel #8
0
def perform_optimisation(var, bins, data):
    """
    Optimise the CSS shape value for `var` and save per-mass-bin transforms.

    A 2D profile is filled with `fill_2d_profile(data, var, bins, "m",
    MASS_BINS)`. For every candidate in `SHAPEVAL_RANGE`, the projection of
    each mass bin is passed to `fit` together with the KDE-smoothed,
    normalised projection of the lowest mass bin, and the returned chi2
    values are summed; the candidate with the smallest sum is kept.

    With that shape value the best-fit omega of each mass bin is measured
    again, test plots are written to 'figures/css/', the omegas are fitted
    as a function of mass, and the objects returned by `get_css_fns` are
    saved to 'models/css/' for every mass bin.

    Arguments:
        var: Variable name; passed to `fill_2d_profile` and used in plot
            labels and output file names.
        bins: Passed to `fill_2d_profile` (presumably the binning of `var`).
        data: Passed to `fill_2d_profile`.

    Returns:
        0, unconditionally.
    """

    # Fill 2D substructure profile
    profile2d = fill_2d_profile(data, var, bins, "m", MASS_BINS)

    # Get 1D profile for lowest mass bin
    profile0 = profile2d.ProjectionY("%s_lowMass" % profile2d.GetName(), 1, 1)
    profile0 = kde(profile0)
    normalise(profile0, density=True)

    # Perform the optimisation
    # -- `bestShapeVal` stays at 0 if no candidate gives sumChi2 < 1e20
    bestShapeVal = 0
    bestSumChi2 = 1e20
    for shapeVal in SHAPEVAL_RANGE:
        print "Shape value: ", shapeVal
        sumChi2 = 0.

        # Each mass bin needs to be optimized over omega
        for mass in range(len(MASS_BINS) - 1):
            print "   Mass bin: ", mass

            # Get 1D profile for current mass bin
            # -- ROOT bins are 1-indexed, hence `mass + 1`
            profile = profile2d.ProjectionY(
                "%s_bin_%i" % (profile2d.GetName(), mass), mass + 1, mass + 1)

            # Fit current profile to low-mass profile
            # -- Only the chi2 is used here; the best-fit omega is re-measured
            #    below once the best shape value is known
            chi2, bestOmega, _, _ = fit(profile, shapeVal, profile0,
                                        "%.2f" % mass)

            # Accumulate chi2
            sumChi2 += chi2
            pass

        # Update chi2 for current `shapeVal`
        print "-- sumChi2: {} (cp. {})".format(sumChi2, bestSumChi2)
        if sumChi2 < bestSumChi2:
            bestSumChi2 = sumChi2
            bestShapeVal = shapeVal
            pass
        pass

    # Saving CSS transforms
    with Profile("Saving CSS transform"):

        # Ensure model directory exists
        mkdir('models/css/')
        mkdir(
            'figures/css/'
        )  # added because errors were returned when saving the PDFs

        # Get the optimal, measured `omega`s for each mass-bin
        bestOmegas = list()
        for mass in range(len(MASS_BINS) - 1):
            profile = profile2d.ProjectionY(
                "%s_bin_%i_final" % (profile2d.GetName(), mass), mass + 1,
                mass + 1)
            # NOTE: this rebinds `sumChi2` to the single-bin fit result; the
            # value is not read again after this point.
            sumChi2, bestOmega, profile_css, profile0rebin = fit(
                profile, bestShapeVal, profile0, "%.2f" % mass)

            # Test-plot distributions used for fitting!
            # -- Canvas
            c = rp.canvas(batch=True)

            # -- Plot
            # -- Smoothing/normalisation happens after the fit, i.e. it only
            #    affects what is drawn
            profile = kde(profile)
            normalise(profile, density=True)

            # -- Legend labels; '.0' is stripped so that integer bin edges
            #    are shown without decimals
            lowmassbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(
                MASS_BINS[0], MASS_BINS[1]).replace('.0', '')
            massbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(
                MASS_BINS[mass], MASS_BINS[mass + 1]).replace('.0', '')
            c.hist(profile0rebin,
                   label=latex(var, ROOT=True) + ",    {}".format(lowmassbin),
                   linecolor=rp.colours[1],
                   fillcolor=rp.colours[1],
                   alpha=0.5,
                   option='HISTL',
                   legend_option='FL')
            c.hist(profile,
                   label=latex(var, ROOT=True) + ",    {}".format(massbin),
                   linecolor=rp.colours[4],
                   linestyle=2,
                   option='HISTL')
            c.hist(profile_css,
                   label=latex(var + 'CSS', ROOT=True) +
                   ", {}".format(massbin),
                   linecolor=rp.colours[3],
                   option='HISTL')

            # -- Decorations
            c.xlabel(
                latex(var, ROOT=True) + ", " + latex(var + 'CSS', ROOT=True))
            c.ylabel("Number of jets p.d.f.")
            c.legend(xmin=0.45, ymax=0.76, width=0.25)
            c.text(["#sqrt{s} = 13 TeV,  Multijets", "KDE smoothed"],
                   qualifier=QUALIFIER,
                   ATLAS=False)
            c.pad()._xaxis().SetTitleOffset(1.3)
            c.pad()._yaxis().SetNdivisions(105)
            c.pad()._primitives[-1].Draw('SAME AXIS')
            c.padding(0.50)

            # -- Save
            c.save('figures/css/css_test_{}_mass{}.pdf'.format(var, mass))

            # Store best-fit omega in array
            print mass, bestOmega
            bestOmegas.append(bestOmega)
            pass

        # Fit best omega vs. mass
        # -- `x` (mass bin centres) is only used for its length below;
        #    requires MASS_BINS to support array arithmetic
        x = MASS_BINS[:-1] + 0.5 * np.diff(MASS_BINS)
        y = np.array(bestOmegas)

        h = ROOT.TH1F('hfit', "", len(MASS_BINS) - 1, MASS_BINS)
        root_numpy.array2hist(y, h)
        for ibin in range(1, len(x) + 1):
            h.SetBinError(
                ibin,
                0.02)  # Just some value to ensure equal errors on all points
            pass

        # -- Two-parameter function that vanishes at x = m0 (centre of the
        #    lowest mass bin); the upper fit range (300) is hard-coded
        m0 = 0.5 * (MASS_BINS[0] + MASS_BINS[1])
        f = ROOT.TF1(
            "fit",
            "[0] * (1./{m0}  - 1./x) + [1] * TMath::Log(x/{m0})".format(m0=m0),
            m0, 300)
        f.SetLineColor(rp.colours[4])
        f.SetLineStyle(2)
        h.Fit(f)

        # Write out the optimal configuration for each mass bin
        for mass in range(len(MASS_BINS) - 1):
            # NOTE(review): same histogram name as the projection made in the
            # test-plot loop above.
            profile = profile2d.ProjectionY(
                "%s_bin_%i_final" % (profile2d.GetName(), mass), mass + 1,
                mass + 1)
            profile = kde(profile)
            normalise(profile, density=True)
            # -- Use the fitted (not the measured) omega, evaluated at the bin
            #    centre and floored at 1E-04
            bestOmegaFitted_ = f.Eval(
                h.GetBinCenter(mass + 1)) + np.finfo(float).eps
            bestOmegaFitted = max(bestOmegaFitted_, 1E-04)
            #bestOmegaFitted = h.GetBinContent(mass + 1)
            print "bestOmegaFitted[{}] = {} --> {}".format(
                mass, bestOmegaFitted_, bestOmegaFitted)
            F, Ginv = get_css_fns(bestShapeVal, bestOmegaFitted, profile, "")

            # Save classifier
            saveclf(F, 'models/css/css_%s_F_%i.pkl.gz' % (var, mass))
            saveclf(Ginv, 'models/css/css_%s_Ginv_%i.pkl.gz' % (var, mass))
            pass

        # Plot best omega vs. mass
        # -- Canvas
        c = rp.canvas(batch=True)

        # -- Plots
        #c.hist(bestOmegas, bins=MASS_BINS, linecolor=rp.colours[1])
        c.hist(h, linecolor=rp.colours[1], option='HIST', label="Measured")
        f.Draw('SAME')

        # -- Decorations
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Best-fit #Omega_{D}")
        c.text([
            "#sqrt{s} = 13 TeV,  Multijets", "CSS applied to {}".format(
                latex(var, ROOT=True)),
            "Best-fit #alpha = {:.1f}".format(bestShapeVal)
        ],
               qualifier=QUALIFIER,
               ATLAS=False)
        # -- `f` is drawn directly with ROOT, so its legend entry is added
        #    by hand with matching line attributes
        c.legend(categories=[('Functional fit', {
            'linewidth': 2,
            'linestyle': 2,
            'linecolor': rp.colours[4]
        })])
        # Save
        c.save('figures/css/cssBestOmega_{}.pdf'.format(var))
        pass

    return 0
Beispiel #9
0
def main(args):
    """
    Plot the measured profile and the profile predicted by the saved k-NN.

    The measured profile comes from `fill_profile`. The fitted profile is
    obtained by evaluating the k-NN regressor loaded from 'models/knn/' at
    the bin centres of a grid that is `rebin` times finer than the binning
    in `AXIS` along both `VARX` and `VARY`. Both profiles are passed to
    `plot`.

    Arguments:
        args: Run-time arguments; passed through `initialise`.

    Returns:
        None.
    """

    # Initialise
    args, _ = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # -------------------------------------------------------------------------
    # Disabled: add the NN classifier variable to `data`
    #
    # # Initialise Keras backend
    # initialise_backend(args)
    #
    # # Neural network-specific initialisation of the configuration dict
    # initialise_config(args, cfg)
    #
    # # Keras import(s)
    # from keras.models import load_model
    #
    # # NN
    # from run.adversarial.common import add_nn
    # with Profile("NN"):
    #     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    #     add_nn(data, classifier, 'NN')
    # -------------------------------------------------------------------------

    # Measured profile
    measured, _ = fill_profile(data)

    # k-NN regressor
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges = {}
        centres = {}
        for axis_name, var in (('x', VARX), ('y', VARY)):
            nbins, lower, upper = AXIS[var]

            # Interpolate the coarse bin edges onto a `rebin` times finer
            # index grid  @TODO: Make standardised right away?
            fine_index = np.linspace(0, nbins, nbins * rebin + 1,
                                     endpoint=True)
            coarse_index = range(nbins + 1)
            coarse_edges = np.linspace(lower, upper, nbins + 1, endpoint=True)
            fine_edges = np.interp(fine_index, coarse_index, coarse_edges)

            edges[axis_name] = fine_edges
            centres[axis_name] = fine_edges[:-1] + 0.5 * np.diff(fine_edges)

        # Predictions at the (standardised) fine bin centres
        grid_x, grid_y = np.meshgrid(centres['x'], centres['y'])
        grid_x, grid_y = standardise(grid_x, grid_y)

        X = np.vstack((grid_x.flatten(), grid_y.flatten())).T
        predictions = knn.predict(X).reshape(grid_x.shape).T

        # ROOT histogram holding the predictions
        nbins_x = len(edges['x']) - 1
        nbins_y = len(edges['y']) - 1
        profile_fit = ROOT.TH2F('profile_fit', "",
                                nbins_x, edges['x'].flatten('C'),
                                nbins_y, edges['y'].flatten('C'))
        root_numpy.array2hist(predictions, profile_fit)

    # Plotting: measured first, then fitted
    with Profile("Plotting"):
        for is_fit, profile in ((False, measured), (True, profile_fit)):
            plot(profile, is_fit)

    return
Beispiel #10
0
def main(args):
    """
    Add k-NN variables for the leading and subleading jet and run the studies.

    Steps: initialise the run, the Keras backend and the configuration; load
    the dataset from 'data/' + args.input (with `test=True`); attach the
    'lead_' and 'sub_' k-NN variables with `add_knn`, both from the same
    saved model; and pass the result to `perform_studies`.

    Arguments:
        args: Run-time arguments; passed through `initialise`.

    Returns:
        0 on completion.
    """

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    # NOTE(review): none of the Keras / adversarial.models names imported
    # here are used below; kept because the imports may have side effects.
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data('data/' + args.input, test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    #kNN_var = 'D2-k#minusNN'
    #kNN_var = 'C1_02-knn'
    #base_var = 'sub_jet_ntrk'
    #kNN_var = base_var.replace('sub_jet_', '') + '-knn'

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]

    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')

    #base_var = ['jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]

    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]

    def meaningful_digits(number):
        """
        Format `number` in fixed-point notation with ceil(-log10(number))
        decimals; no decimals are used for numbers >= 1 or <= 0.

        Only referenced by the disabled scan definitions below.
        """
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # Disabled scan definitions. These used to be parked in a bare string
    # literal, which had picked up a line of unrelated text; they are kept
    # here as ordinary comments instead.
    #
    # # -- Adversarial neural network (ANN) scan
    # lambda_reg  = 10.
    # lambda_regs = sorted([1., 3., 10.])
    # ann_vars    = list()
    # lambda_strs = list()
    # for lambda_reg_ in lambda_regs:
    #     lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
    #     lambda_strs.append(lambda_str)
    #
    #     ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
    #     ann_vars.append(ann_var_)
    #     pass
    #
    # ann_var = ann_vars[lambda_regs.index(lambda_reg)]
    #
    # # -- uBoost scan
    # uboost_eff = 92
    # uboost_ur  = 0.3
    # uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    # uboost_var  =  'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    # uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    # uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['lead_jet_C1_02', kNN_var]
    jet_prefixes = ('lead_', 'sub_')
    tagger_features = [
        prefix + var for prefix in jet_prefixes for var in (base_var, kNN_var)
    ]

    #tagger_features = base_var + kNN_var

    # Add variables
    # --------------------------------------------------------------------------

    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL as sigModel, VAR as kNN_basevar, EFF as kNN_eff
        print("k-NN base variable: {} (cp. {})".format(base_var, kNN_var))

        # The same saved model is used for the leading and the subleading jet
        knn_path = 'models/knn/knn_{}_{}_{}.pkl.gz'.format(
            base_var, kNN_eff, sigModel)
        for prefix in jet_prefixes:
            add_knn(data, newfeat=prefix + kNN_var, path=knn_path)
            pass
        pass

    # Remove unused variables
    # NOTE(review): `DataFrame.drop` returns a new frame unless `inplace=True`
    # is given, and the result is discarded here, so no column is actually
    # removed. This is deliberately left unchanged: making the drop effective
    # would also remove every column not listed below (e.g. 'signal'), which
    # the downstream studies may rely on. Extend `used_variables` and confirm
    # against `perform_studies` before enabling it.
    used_variables = set(tagger_features +
                         ['lead_jet_m', 'lead_jet_pt', 'dijetmass', 'weight'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features)

    return 0
Beispiel #11
0
def main(args):
    """
    Plot k-NN and raw n_trk distributions together with sensitivity scans.

    For each k-NN variable, and afterwards for each base variable, the
    normalised signal and background distributions are drawn in the upper
    pad. The lower pad shows the sensitivity gain obtained when requiring
    BOTH jets to exceed a cut value, scanned over the histogram bin edges:

        gain = eps_S(cut) / (3/2 + sqrt(B(cut))) / normFactor,
        normFactor = 1 / (3/2 + sqrt(B(no cut)))

    For the k-NN variables, the gain is additionally drawn against the
    background efficiency of a cut on the leading jet only.

    Arguments:
        args: Run-time arguments; passed through `initialise`.

    Returns:
        None.
    """

    # Definitions
    # -- Copy the per-class style dicts as well: their labels are overwritten
    #    below, and a shallow copy would write those edits into the shared
    #    HISTSTYLE.
    histstyle = {key: dict(style) for key, style in HISTSTYLE.items()}

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5')  # , test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5') #, test=True)

    #data = np.concatenate((data1, data2))

    # Signal selection: 0 selects all signal models, any other key selects
    # the matching `sigType`
    sigNumber = 0

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 1 TeV',
        2: 'Model A, m = 1.5 TeV',
        3: 'Model A, m = 2 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars = [kNN_var]
    base_vars = ['lead_' + base_var, 'sub_' + base_var]
    kNN_vars = ['lead_' + kNN_var, 'sub_' + kNN_var]

    with Profile("Add variables"):
        from run.knn.common import add_knn, EFF as kNN_eff
        print("k-NN base variable: {} (cp. {})".format(base_var, kNN_var))

        # NOTE(review): `sigModel` is not bound inside this function; it is
        # assumed to be available at module level -- confirm.
        knn_path = 'models/knn/knn1D_{}_{}_{}.pkl.gz'.format(
            base_var, kNN_eff, sigModel)
        add_knn(data, newfeat='lead_' + kNN_var, path=knn_path)
        add_knn(data, newfeat='sub_' + kNN_var, path=knn_path)

        # Log the path that was actually loaded
        print(knn_path)
        pass

    # Check variable distributions
    weight = 'weight'  # 'weight_test' / 'weight'
    scale = 139 * 1000000  # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == sigNumber
        pass

    knnBins = np.linspace(-100, 200, 75, endpoint=True)

    for var in kNN_vars:
        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)  # scratch canvas, never saved
        c2 = rp.canvas(batch=True)

        ### Plot ###
        c.pads()[0].hist(data.loc[msk_sig, var].values,
                         bins=knnBins,
                         weights=data.loc[msk_sig, weight].values,
                         normalise=True,
                         **histstyle[False])
        c.pads()[0].hist(data.loc[msk_bkg, var].values,
                         bins=knnBins,
                         weights=scale * data.loc[msk_bkg, weight].values,
                         normalise=True,
                         **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values,
                             bins=knnBins,
                             weights=scale * data.loc[msk_bkg, weight].values,
                             normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values,
                             bins=knnBins,
                             weights=data.loc[msk_sig, weight].values,
                             normalise=False)

        n_bkg_incl = h1_incl.GetEffectiveEntries()
        n_sig_incl = h2_incl.GetEffectiveEntries()
        print("bkg. incl integral:  {}".format(n_bkg_incl))
        print("sig. incl integral:  {}".format(n_sig_incl))

        normFactor = 1.0 / (3. / 2 + np.sqrt(n_bkg_incl))
        print("Sensitivity with no cut:  {}".format(normFactor))

        ### sensitivity ###
        sensitivity = []
        bkg_eff_1jet = []
        for cut in knnBins:

            # Both jets above the cut / leading jet only above the cut
            msk_pass = (data[kNN_vars[0]] > cut) & (data[kNN_vars[1]] > cut)
            msk_pass1 = data[kNN_vars[0]] > cut
            #msk_pass = (data[var]>cut)

            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass
            msk_bkg_pass1 = msk_bkg & msk_pass1

            h1_pass = c_tmp.hist(
                data.loc[msk_bkg_pass, var].values,
                bins=knnBins,
                weights=scale * data.loc[msk_bkg_pass, weight].values,
                normalise=False)
            h2_pass = c_tmp.hist(
                data.loc[msk_sig_pass, var].values,
                bins=knnBins,
                weights=data.loc[msk_sig_pass, weight].values,
                normalise=False)

            # Values and weights must come from the same selection
            h1_pass1 = c_tmp.hist(
                data.loc[msk_bkg_pass1, var].values,
                bins=knnBins,
                weights=scale * data.loc[msk_bkg_pass1, weight].values,
                normalise=False)

            if n_sig_incl > 0:
                sig_eff = h2_pass.GetEffectiveEntries() / n_sig_incl
                bkg_term = 3. / 2 + np.sqrt(h1_pass.GetEffectiveEntries())
                sensitivity.append((sig_eff / bkg_term) / normFactor)
            else:
                sensitivity.append(0)
                pass

            if n_bkg_incl > 0:
                bkg_eff_1jet.append(
                    h1_pass1.GetEffectiveEntries() / n_bkg_incl)
            else:
                bkg_eff_1jet.append(0)
                pass
            pass

        #c.pads()[0].ylim(0,0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100, 200)
        c.pads()[1].ylim(0, 30)
        c.pads()[1].xlim(-100, 200)
        c.pads()[1].graph(sensitivity, bins=knnBins)  # , oob=False )

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff)) #latex(var, ROOT=True))
        c.xlabel("n_{trk}^{#epsilon}")  # latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")  # "#epsilon_{S}/(#frac{3}{2} + #sqrt{B})/")
        c.pads()[1].text(
            ["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})"],
            xmin=0.2,
            ymax=0.80,
            ATLAS=False)

        # Sensitivity gain vs. single-jet background efficiency. The x-values
        # are passed through `bins`, as for the other graphs in this function
        # (NOTE(review): confirm against the rootplotting `graph` signature).
        c2.graph(sensitivity, bins=bkg_eff_1jet)
        c2.xlabel("Single jet #varepsilon_B")
        c2.ylabel("Sensitivity gain")
        c2.text(["#epsilon=0.5 %"], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(
            var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(
            var, sigNumber, kNN_eff))

        # One file per variable, in the same directory as the other figures
        c2.save(
            'figures/distributions/sensitivity_1jEfficiency_{}_sig{}_eff{}.pdf'
            .format(var, sigNumber, kNN_eff))
        print('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(
            var, sigNumber, kNN_eff))
        pass

    # Plot also the normal ntrk distribution for cross check with Roland's
    # result. The signal/background masks defined above are reused.
    #msk_weight = data['weight']<0.0002
    #msk_bkg = msk_bkg & msk_pt & msk_m & msk_eta
    #msk_sig = msk_sig & msk_pt & msk_m & msk_eta

    baseBins = np.linspace(0, 200, 75, endpoint=True)  # axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)

    for var in base_vars:
        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c.pads()[0].logy()

        c_tmp = rp.canvas(batch=True)  # scratch canvas, never saved

        ### Plot ###
        c.pads()[0].hist(data.loc[msk_sig, var].values,
                         bins=baseBins,
                         weights=data.loc[msk_sig, weight].values,
                         normalise=True,
                         **histstyle[False])
        c.pads()[0].hist(data.loc[msk_bkg, var].values,
                         bins=baseBins,
                         weights=scale * data.loc[msk_bkg, weight].values,
                         normalise=True,
                         **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values,
                             bins=baseBins,
                             weights=scale * data.loc[msk_bkg, weight].values,
                             normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values,
                             bins=baseBins,
                             weights=data.loc[msk_sig, weight].values,
                             normalise=False)

        print("bkg. incl integral:  {}".format(h1_incl.GetEffectiveEntries()))
        print("sig. incl integral:  {}".format(h2_incl.GetEffectiveEntries()))

        # Unlike the k-NN scan above, this scan is based on `Integral()`
        n_sig_incl = h2_incl.Integral()
        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.Integral()))

        #print "Sensitivity with no cut: ", normFactor

        ### sensitivity ###
        sensitivity = []
        for cut in baseBins:

            # Both jets above the cut
            msk_pass = (data[base_vars[0]] > cut) & (data[base_vars[1]] > cut)
            #msk_pass = data[var]>cut

            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass

            h1_pass = c_tmp.hist(
                data.loc[msk_bkg_pass, var].values,
                bins=baseBins,
                weights=scale * data.loc[msk_bkg_pass, weight].values,
                normalise=False)
            h2_pass = c_tmp.hist(
                data.loc[msk_sig_pass, var].values,
                bins=baseBins,
                weights=data.loc[msk_sig_pass, weight].values,
                normalise=False)

            if n_sig_incl > 0:
                sig_eff = h2_pass.Integral() / n_sig_incl
                bkg_term = 3. / 2. + np.sqrt(h1_pass.Integral())
                sensitivity.append(sig_eff / bkg_term / normFactor)
            else:
                sensitivity.append(0)
                pass
            pass

        c.pads()[1].ylim(0, 80)
        c.pads()[1].xlim(0, 200)
        c.pads()[1].graph(sensitivity, bins=baseBins)  # , oob=False )

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.xlabel("n_{trk}")  # latex(var, ROOT=True))
        c.pads()[1].ylabel("sensitivity gain")  # "#epsilon_{S}/(#frac{3}{2} + #sqrt{B})")
        c.pads()[1].text(
            ["sensitivity = #epsilon_{S}/(#frac{3}{2} + #sqrt{B})"],
            xmin=0.2,
            ymax=0.80,
            ATLAS=False)

        ### Save ###
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(
            var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(
            var, sigNumber, kNN_eff))
        pass
Beispiel #12
0
def main(args):
    """
    Add the DDT, k-NN and CSS variants of 'DeepWvsQCD' and run the studies.

    Steps: initialise the run, load 'data.h5' from `args.input` (with
    `test_full_signal=True`), attach the decorrelated variables with
    `add_ddt`, `add_knn` and `add_css`, and call `perform_studies` with the
    tagger feature list and the title "Deep".

    Arguments:
        args: Run-time arguments; passed through `initialise`.

    Returns:
        0 on completion.
    """

    # Initialise
    args, cfg = initialise(args)

    # Disabled: Keras backend / NN-specific configuration / model imports
    #initialise_backend(args)
    #initialise_config(args, cfg)
    #import keras.backend as K
    #from keras.models import load_model
    #from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    #data, features, _ = load_data(args.input + 'data.h5', test=True)
    input_path = args.input + 'data.h5'
    data, features, _ = load_data(input_path, test_full_signal=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour display names
    kNN_var_N2 = 'N_{2}-k#minusNN'
    kNN_var_tau21 = 'tau_{21}-k#minusNN'

    def meaningful_digits(number):
        """
        Format `number` in fixed-point notation with ceil(-log10(number))
        decimals; no decimals are used for numbers >= 1 or <= 0.
        """
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
        else:
            digits = 0
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Disabled: adversarial neural network (ANN) scan
    #lambda_reg  = 10.
    #lambda_regs = sorted([1., 3., 10.])
    #ann_vars    = list()
    #lambda_strs = list()
    #for lambda_reg_ in lambda_regs:
    #    lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
    #    lambda_strs.append(lambda_str)
    #
    #    ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
    #    ann_vars.append(ann_var_)
    #
    #ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- Disabled: uBoost scan
    #uboost_eff = 92
    #uboost_ur  = 0.3
    #uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    #uboost_var  =  'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    #uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    #uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection: (base, decorrelated) pairs for DDT, k-NN, CSS
    # -- Alternative configurations:
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title = "decDeep"
    tagger = 'DeepWvsQCD'
    tagger_features = []
    for suffix in ('DDT', 'kNN', 'CSS'):
        tagger_features += [tagger, tagger + suffix]
    title = "Deep"

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # -- DDT (alternatives: feat='tau21' / 'N2_B1' / 'decDeepWvsQCD' with
        #    the matching 'models/ddt/ddt_<feat>.pkl.gz')
        from run.ddt.common import add_ddt
        add_ddt(data, feat=tagger, path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        # -- k-NN (alternatives: VAR_TAU21/TAU21_EFF, VAR_N2/N2_EFF,
        #    VAR_DECDEEP/DECDEEP_EFF imported as kNN_basevar/kNN_eff)
        from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        print("k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2))
        knn_path = 'models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff)
        add_knn(data, feat=kNN_basevar, path=knn_path)

        # -- CSS (alternatives: "tau21" / "N2_B1" / "decDeepWvsQCD")
        from run.css.common import add_css
        add_css(tagger, data)

    # Remove unused variables
    # NOTE(review): the frame returned by `drop` is not kept, so `data` still
    # holds every column afterwards; behaviour is left exactly as it was.
    #used_variables = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    auxiliary = ['m', 'pt', 'weight_test', 'npv']  # 'npv' is needed for the robustness study
    used_variables = set(tagger_features + auxiliary)
    unused_variables = [
        column for column in list(data) if column not in used_variables
    ]
    data.drop(columns=unused_variables)
    gc.collect()

    # Perform performance studies
    #perform_studies (data, args, tagger_features, ann_vars, uboost_vars)
    perform_studies(data, args, tagger_features, title=title)

    return 0
Beispiel #13
0
def perform_studies(data, args, tagger_features, title=None):
    """
    Run the currently enabled performance studies.

    Only the summary-plot and ROC studies are active. Each is run once for
    every combination of mass-cut flag and pT range defined below.

    Arguments:
        data: Dataset, handed unchanged to every `studies.*` call.
        args: Run-time arguments, handed unchanged to every `studies.*` call.
        tagger_features: Collection of tagger features to study.
        title: Optional label forwarded to every study as `title`.
    """
    # Selection grid. Only the configuration without mass cut is enabled.
    masscuts = [False]
    pt_ranges = [None, (200, 500), (500, 1000), (1000, 2000)]
    selections = list(itertools.product(masscuts, pt_ranges))

    # NOTE: The robustness, jet mass comparison and substructure tagger
    # distribution studies are disabled in this configuration.

    # Summary plots
    with Profile("Study: Summary plot"):
        # No parameter-scan features are provided; the same empty dict is
        # passed to every call.
        scan_features = {}
        for masscut, pt_range in selections:
            studies.summary(data, args, tagger_features, scan_features,
                            masscut=masscut, pt_range=pt_range, title=title)

    # ROC curves
    with Profile("Study: ROC"):
        for masscut, pt_range in selections:
            studies.roc(data, args, tagger_features,
                        masscut=masscut, pt_range=pt_range, title=title)
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  test_full_signal=True)

    #data, features, _ = load_data(args.input + 'data.h5', train_full_signal=True)  #for faster checking, don't use for actual comparison

    # Common definitions
    # --------------------------------------------------------------------------

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title="decDeep"

    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN','CSS']}; title='ATLAS2'
    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN',], 'decDeepWvsQCD':['','kNN'], 'DeepWvsQCD':['','kNN']}; title='Deep_vs_Analytic'
    #tagger_features = {'tau21':[''], 'N2_B1':[''], 'decDeepWvsQCD':[''], 'DeepWvsQCD':['']}; title='Deep_Check2'
    tagger_features = {
        'tau21': ['', 'DDT', 'kNN', 'CSS'],
        'N2_B1': ['', 'DDT', 'kNN', 'CSS']
    }
    title = 'Corrected_Full_Analytic'
    #tagger_features = {'tau21':['', 'DDT', 'kNN', 'CSS'], 'N2_B1':['', 'DDT', 'kNN','CSS']}; title='Full_Analytic_vs_Atlas'

    extracted_features = []
    for basevar in tagger_features.keys():
        for suffix in tagger_features[basevar]:
            extracted_features.append(basevar + suffix)

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # the selections of which variables to add could also be automated from the tagger_features list...

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        # N2DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        ## DeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        # Tau21-kNN
        from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar,
                                                       'tau_{21}-k#minusNN')
        add_knn(data,
                feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        # N2-kNN
        from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar,
                                                       'N_{2}-kNN')
        add_knn(data,
                feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'decDeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## DeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'DeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # Tau21-CSS
        from run.css.common import add_css
        add_css("tau21", data)

        # N2-CSS
        from run.css.common import add_css
        add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        ## DeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("DeepWvsQCD", data)

        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])
    used_variables = set(extracted_features +
                         ['m', 'pt', 'weight_test', 'npv'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables)
    gc.collect()

    # Perform performance studies
    perform_studies(data,
                    args,
                    tagger_features,
                    extracted_features,
                    title=title)
    return 0
Beispiel #15
0
def perform_studies(data, args, tagger_features, ann_vars, uboost_vars):
    """
    Run the full set of performance studies.

    Delegates, in order, to the robustness, jet mass comparison, summary,
    distribution, ROC, JSD and efficiency studies in `studies`.

    Arguments:
        data: Dataset, handed unchanged to every `studies.*` call.
        args: Run-time arguments, handed unchanged to every `studies.*` call.
        tagger_features: Iterable of tagger feature names to study.
        ann_vars: ANN scan feature names; each must contain a substring of
            the form `#lambda=<number>`.
        uboost_vars: uBoost scan feature names; each must contain a substring
            of the form `#alpha=<number>`.

    Raises:
        AttributeError: If a scan feature name does not contain the expected
            `#lambda=...` / `#alpha=...` substring.
    """
    masscuts = [True, False]
    pt_ranges = [None, (200, 500), (500, 1000)]

    # Perform combined robustness study
    with Profile("Study: Robustness"):
        for masscut in masscuts:
            studies.robustness_full(data,
                                    args,
                                    tagger_features,
                                    masscut=masscut)

    # Perform jet mass distribution comparison study
    with Profile("Study: Jet mass comparison"):
        studies.jetmasscomparison(data, args, tagger_features)

    # Perform summary plot study
    with Profile("Study: Summary plot"):
        # Raw strings: the backslash escapes are meant for the regex engine,
        # not for the Python string parser.
        regex_nn = re.compile(r'\#lambda=[\d\.]+')
        regex_ub = re.compile(r'\#alpha=[\d\.]+')

        # Pair each scan feature with its regularisation label. Real lists are
        # built so that the entries can be iterated more than once.
        scan_features = {
            'NN': [(feat, regex_nn.search(feat).group(0))
                   for feat in ann_vars],
            'Adaboost': [(feat, regex_ub.search(feat).group(0))
                         for feat in uboost_vars],
        }

        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.summary(data,
                            args,
                            tagger_features,
                            scan_features,
                            masscut=masscut,
                            pt_range=pt_range)

    # Perform distributions study
    with Profile("Study: Substructure tagger distributions"):
        # Five equal-width mass windows between 50 and 300, plus `None` as
        # first entry.
        mass_edges = np.linspace(50, 300, 5 + 1, endpoint=True)
        mass_ranges = [None] + list(zip(mass_edges[:-1], mass_edges[1:]))
        for feat, pt_range, mass_range in itertools.product(
                tagger_features, pt_ranges, mass_ranges):
            studies.distribution(data, args, feat, pt_range, mass_range)

    # Perform ROC study
    with Profile("Study: ROC"):
        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.roc(data,
                        args,
                        tagger_features,
                        masscut=masscut,
                        pt_range=pt_range)

    # Perform JSD study
    with Profile("Study: JSD"):
        studies.jsd(data, args, tagger_features)

    # Perform efficiency study
    with Profile("Study: Efficiency"):
        for feat in tagger_features:
            studies.efficiency(data, args, feat)

    return
Beispiel #16
0
def main (args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input) #, test=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, (x,percs, err) = fill_profile_1D(data[msk_bkg])
    weights = 1/err

    # Add k-NN variable
    knnfeat = 'knn'
    orgfeat = VAR
    add_knn(data, newfeat=knnfeat, path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL)) 

    # Loading KNN classifier
    knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))
    #knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    X = x.reshape(-1,1)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8

        # Short-hands
        vbins, vmin, vmax = AXIS[VARX]

        # Re-binned bin edges  @TODO: Make standardised right away?
        # edges = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True), 
        #                  range(vbins + 1),
        #                  np.linspace(vmin, vmax,  vbins + 1,         endpoint=True))

        fineBins = np.linspace(vmin, vmax,  vbins*rebin + 1,         endpoint=True)
        orgBins = np.linspace(vmin, vmax,  vbins + 1,         endpoint=True)

        # Re-binned bin centres
        fineCentres = fineBins[:-1] + 0.5 * np.diff(fineBins)
        orgCentres = orgBins[:-1] + 0.5 * np.diff(orgBins)
        
        pass

        # Get predictions evaluated at re-binned bin centres
        if 'erf' in FIT:
            fit = func(fineCentres, knn[0], knn[1], knn[2])
            print "Check: ", func([1500, 2000], knn[0], knn[1], knn[2]) 
        else:
            fit = knn.predict(fineCentres.reshape(-1,1)) #centres.reshape(-1,1))

        # Fill ROOT "profile"
        profile_fit = ROOT.TH1F('profile_fit', "", len(fineBins) - 1, fineBins.flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        
        knn1 = PolynomialFeatures(degree=2)                                           
        X_poly = knn1.fit_transform(X)
        reg = LinearRegression(fit_intercept=False) #fit_intercept=False)
        reg.fit(X_poly, percs, weights)
        score = round(reg.score(X_poly, percs), 4)
        coef = reg.coef_
        intercept = reg.intercept_
        print "COEFFICIENTS: ", coef, intercept
        
        TCoef = ROOT.TVector3(coef[0], coef[1], coef[2]) 
        outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_{}.root".format(FIT, EFF, MIN_STAT, MODEL),"RECREATE")
        outFile.cd()
        TCoef.Write()
        profile_fit.SetName("kNNfit")
        profile_fit.Write()
        outFile.Close()

        # profile_meas2 = ROOT.TH1F('profile_meas', "", len(x) - 1, x.flatten('C'))
        # root_numpy.array2hist(percs, profile_meas2)
        profile_meas2 = ROOT.TGraph(len(x), x, percs) 
        pass


    # Plotting
    with Profile("Plotting"):
        # Plot
        plot(profile_meas2, profile_fit)
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency

    # MC weights are scaled with lumi. This is just for better comparison
    #if INPUT =="mc": 
    #    data.loc[:,'TotalEventWeight'] /=  139000000. 

    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Define arrays
        shape   = AXIS[VARX][0]
        bins    = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0]+ 1, endpoint=True)
        #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True)
        #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000])

        print "HERE: ", bins 
        
        #x, y = (np.zeros(shape) for _ in range(2))

        # Create `profile` histogram
        profile_knn = ROOT.TH1F('profile', "", len(bins) - 1, bins ) #.flatten('C') )
        profile_org = ROOT.TH1F('profile', "", len(bins) - 1, bins ) #.flatten('C') )

        # Compute inclusive efficiency in bins of `VARX`
        effs = list()
        
        for i in range(shape):
            msk_bin  = (data[VARX] > bins[i]) & (data[VARX] <= bins[i+1])
            msk_pass =  data[knnfeat] > 0 # <?
            msk_pass_org =  data[orgfeat] > 70 # <?
            num = data.loc[msk & msk_bin & msk_pass, 'TotalEventWeight'].values.sum()
            num_org = data.loc[msk & msk_bin & msk_pass_org, 'TotalEventWeight'].values.sum()
            den = data.loc[msk & msk_bin,'TotalEventWeight'].values.sum()
            if den > 0:
                eff = num/den *100.
                eff_org = num_org/den *100.
                profile_knn.SetBinContent(i + 1, eff)
                profile_org.SetBinContent(i + 1, eff_org)
                effs.append(eff)
            #else:
            #print i, "Density = 0"
            pass

        c = rp.canvas(batch=True)
        leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)
        leg.AddEntry(profile_knn, "#it{n}_{trk}^{#varepsilon=%s%%} > 0" % ( EFF), "l")
        leg.AddEntry(profile_org, "#it{n}_{trk} > 70", "l")
        leg.Draw()

        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.10)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile_knn.SetLineColor(rp.colours[1])
        profile_org.SetLineColor(rp.colours[2])
        profile_knn.SetMarkerStyle(24)
        profile_knn.GetXaxis().SetTitle( "#it{m}_{jj} [GeV]" ) #latex(VARX, ROOT=True) + "[GeV]") #+ " = log(m^{2}/p_{T}^{2})")
        #profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True))# + " = log(m^{2}/p_{T}^{2})")
        profile_org.GetYaxis().SetTitle("Selection efficiency (%)") # for #it{n}_{trk}^{#varepsilon=%s%%}>0" % ( EFF))

        profile_knn.GetYaxis().SetNdivisions(505)
        #profile_knn.GetXaxis().SetNdivisions(505)
        profile_knn.GetXaxis().SetTitleOffset(1.4)
        profile_knn.GetYaxis().SetTitleOffset(1.8)
        profile_knn.GetXaxis().SetRangeUser(*XRANGE)
        profile_org.GetXaxis().SetRangeUser(*XRANGE)

        yrange = (0., EFF*3) #2.0 percent
        if yrange:
            profile_knn.GetYaxis().SetRangeUser(*yrange)
            profile_org.GetYaxis().SetRangeUser(*yrange)
            pass

        # Draw
        profile_org.Draw()
        profile_knn.Draw("same")

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.pdf'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL+INPUT, MIN_STAT))
        #c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.png'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL, MIN_STAT))
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.eps'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL+INPUT, MIN_STAT))
        del c
        
        pass

    return
Beispiel #17
0
def main(args):
    """
    Add all tagger variables to the test dataset and run the studies.

    Adds the DDT, k-NN, CSS, NN, ANN (one per lambda value) and
    Adaboost/uBoost (one per uniforming rate) variables to the loaded data and
    calls `perform_studies`.

    Arguments:
        args: Argument object passed through `initialise`; `args.input` is
            used as the prefix of the path to `data.h5`.

    Returns:
        0, after the studies have run.
    """

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    # NOTE: This is done before the Keras imports below.
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    # Name of the column under which the k-NN variable is stored
    kNN_var = 'D2-k#minusNN'

    def meaningful_digits(number):
        """
        Format `number` as fixed-point with ceil(max(-log10(number), 0))
        decimals; non-positive numbers are formatted with zero decimals.
        """
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    # `lambda_reg` selects which entry of the scan enters `tagger_features`;
    # it has to be one of `lambda_regs` for the `index` lookup below to work.
    lambda_reg = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        # File-name form of lambda, with '.' replaced by 'p'
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        # Column-name form of lambda, with the '.' restored
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = [
        'uBoost(#alpha={:s})'.format(meaningful_digits(ur))
        for ur in uboost_urs
    ]
    # The doubled braces survive this first `format` call as `{:4.2f}`, which
    # is filled with the uniforming rate when the model paths are built below.
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(
        uboost_eff)

    # Tagger feature collection
    tagger_features = [
        'Tau21', 'Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var,
        'Adaboost', uboost_var
    ]

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, path='models/ddt/ddt.pkl.gz')

        # D2-kNN
        from run.knn.common import add_knn, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        add_knn(data,
                newfeat=kNN_var,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        # D2-CSS
        from run.css.common import add_css
        add_css("D2", data)

        # NN
        # The stand-alone classifier is evaluated here, before any combined
        # weights are loaded in the ANN step below.
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            # For each lambda, weights are loaded into `combined` and then
            # `classifier` is evaluated again. Presumably `combined` shares
            # its layers with `classifier`, so that loading updates the
            # classifier in place -- TODO confirm against `combined_model`.
            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # Adaboost/uBoost
        with Profile("Adaboost/uBoost"):
            from run.uboost.common import add_bdt
            for var, ur in zip(uboost_vars, uboost_urs):
                # A uniforming rate of zero is stored under the name 'Adaboost'
                var = ('Adaboost' if ur == 0 else var)
                path = 'models/uboost/' + uboost_pattern.format(ur).replace(
                    '.', 'p') + '.pkl.gz'
                print "== Loading model for {}".format(var)
                add_bdt(data, var, path)
                pass

            # Remove `Adaboost` from scan list
            # (first entry, since `uboost_urs` is sorted and starts at zero)
            uboost_vars.pop(0)
            pass

        pass

    # Remove unused variables
    # NOTE(review): `DataFrame.drop` returns a new frame unless `inplace=True`
    # is given; the result is discarded here, so `data` keeps all its columns.
    # Confirm which columns the studies need before making this effective.
    used_variables = set(tagger_features + ann_vars + uboost_vars +
                         ['m', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars, uboost_vars)

    return 0
def test(data, variable, bg_eff, signal_above=False):
    """
    Plot measured/fitted k-NN profiles and the resulting 2D efficiencies.

    Fills the measured profile on background, adds the k-NN variable to
    `data`, evaluates the stored k-NN model on a finer (VARX, VARY) grid,
    plots both profiles, and finally plots, for signal and background
    separately, the weighted selection efficiency in bins of (VARX, VARY).

    Relies on the module-level names VARX, VARY, VARS, AXIS, NB_CONTOUR,
    QUALIFIER and BOUNDS.

    NOTE(review): `args` is read below but is not a parameter of this
    function; presumably a module-level global -- confirm.

    Arguments:
        data: DataFrame with (at least) the columns 'signal', 'weight_test',
            VARX and VARY; the column 'knn' is added by this function.
        variable: Name of the feature the k-NN model was fitted for; used as
            `feat` in `add_knn` and in file names.
        bg_eff: Background efficiency in percent; used in file names, labels
            and for the colour stops of the background plot.
        signal_above: If true, events with k-NN variable > 0 count as
            passing; otherwise events with k-NN variable < 0.
    """
    # Shout out to Cynthia Brewer and Mark Harrower
    # [http://colorbrewer2.org]. Palette is colorblind-safe.
    rgbs = [(247 / 255., 251 / 255., 255 / 255.),
            (222 / 255., 235 / 255., 247 / 255.),
            (198 / 255., 219 / 255., 239 / 255.),
            (158 / 255., 202 / 255., 225 / 255.),
            (107 / 255., 174 / 255., 214 / 255.),
            (66 / 255., 146 / 255., 198 / 255.),
            (33 / 255., 113 / 255., 181 / 255.),
            (8 / 255., 81 / 255., 156 / 255.),
            (8 / 255., 48 / 255., 107 / 255.)]

    # Split into per-channel arrays and install the gradient colour table
    red, green, blue = map(np.array, zip(*rgbs))
    nb_cols = len(rgbs)
    stops = np.linspace(0, 1, nb_cols, endpoint=True)
    ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue,
                                         NB_CONTOUR)

    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # Fill measured profile
    with Profile("filling profile"):
        profile_meas, _ = fill_profile(data[msk_bkg],
                                       variable,
                                       bg_eff,
                                       signal_above=signal_above)

    # Add k-NN variable
    with Profile("adding variable"):
        knnfeat = 'knn'
        #add_knn(data, feat=variable, newfeat=knnfeat, path='knn_fitter/models/knn_{}_{}.pkl.gz'.format(variable, bg_eff))
        add_knn(data,
                feat=variable,
                newfeat=knnfeat,
                path=args.output +
                '/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))

    # Loading KNN classifier
    with Profile("loading model"):
        #knn = loadclf('knn_fitter/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))
        knn = loadclf(
            args.output +
            '/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        # Each original bin is split into `rebin` equal sub-bins
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges
            edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        # One (x, y) row per grid point; the prediction is reshaped back to
        # the grid and transposed before filling the histogram.
        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    # NOTE: `fit` is re-used as the loop flag here; the prediction array of
    # the same name is no longer needed at this point.
    for fit in [False, True]:

        # Select correct profile
        profile = profile_fit if fit else profile_meas

        # Plot
        plot(profile, fit, variable, bg_eff)
        pass
    pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):
        if sig:
            print "working on signal"
        else:
            print "working on bg"

        if sig:
            rgbs = [(247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            # Background palette: an extra first colour at stop 0, followed by
            # the blue scale squeezed into the range [bg_eff / 100, 1].
            rgbs = [(255 / 255., 51 / 255., 4 / 255.),
                    (247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(
                np.linspace(0, 1, nb_cols - 1, endpoint=True) *
                (1. - bg_eff / 100.) + bg_eff / 100.))
            pass

            # NOTE(review): This call sits inside the `else` branch, so no
            # colour table is created here for signal. The values computed in
            # the `if sig` branch above are then unused; the signal plot
            # presumably relies on the identical table created at the top of
            # the function, unless `plot` changes it -- confirm that this
            # indentation is intended.
            ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green,
                                                 blue, NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [
            np.linspace(AXIS[var][1],
                        AXIS[var][2],
                        AXIS[var][0] + 1,
                        endpoint=True) for var in VARS
        ]
        # NOTE(review): `x`, `y` and `z` are not used below.
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        # NOTE(review): The upper bin edge is exclusive here (`<`) but
        # inclusive (`<=`) in the 2D loop below -- confirm which is intended.
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            if signal_above:
                msk_pass = data[knnfeat] > 0  # ensure correct cut direction
            else:
                msk_pass = data[knnfeat] < 0
            # Product of boolean masks, used as their conjunction
            num_msk = msk * msk_bin * msk_pass
            num = data.loc[num_msk, 'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            # NOTE(review): No guard against `den == 0` for empty bins.
            effs.append(num / den)
            pass

        # Fill profile
        with Profile("Fill profile"):
            for i, j in itertools.product(*map(range, shape)):
                #print "Fill profile - (i, j) = ({}, {})".format(i,j)
                # Bin edges in x and y
                edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

                # Masks
                msks = [
                    (data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                    for dim, var in enumerate(VARS)
                ]
                msk_bin = reduce(lambda x, y: x & y, msks)

                # Set non-zero bin content
                # (only for bins containing at least one selected event)
                if np.sum(msk & msk_bin):
                    if signal_above:
                        msk_pass = data[
                            knnfeat] > 0  # ensure correct cut direction
                    else:
                        msk_pass = data[knnfeat] < 0
                    num_msk = msk * msk_bin * msk_pass
                    num = data.loc[num_msk, 'weight_test'].values.sum()
                    den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                    eff = num / den
                    profile.SetBinContent(i + 1, j + 1, eff)
                    pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARX, ROOT=True) +
                                    " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" %
                                    (latex(variable, ROOT=True), bg_eff))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15, ATLAS=False)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"],
               ATLAS=False)

        # -- Efficiencies
        # One label per `VARY` bin, placed at the low edge of the last x-axis
        # bin; only the first label carries the "epsilon^rel = " prefix.
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(
                xt, yt, "%s%.1f%%" %
                ("#bar{#varepsilon}^{rel}_{%s} = " %
                 ('sig' if sig else 'bkg') if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",
                -4.5,
                BOUNDS[0].Eval(-4.5) + 30,
                align=21,
                angle=-37,
                textsize=13,
                textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV",
                -2.5,
                BOUNDS[1].Eval(-2.5) - 30,
                align=23,
                angle=-57,
                textsize=13,
                textcolor=ROOT.kGray + 3)

        # Save
        # The figure is written twice: to the fixed 'knn_fitter/figures/'
        # directory and below `args.output`.
        mkdir('knn_fitter/figures/')
        c.save('knn_fitter/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            'sig' if sig else 'bkg', variable, bg_eff))
        mkdir(args.output + '/figures/')
        c.save(args.output + '/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            'sig' if sig else 'bkg', variable, bg_eff))
        pass

    return
Beispiel #19
0
def main():
    """
    Read, subsample, re-weight, split, shuffle and store the input datasets.

    The random generator is seeded once at the top, so the order of the
    `np.random` calls below is part of what makes the output reproducible.

    Steps, in order:
      1. Load every `./extractedHbbTopDatasets/*.h5` file through
         `run_batched(FileLoader, ...)` and stack the parts, sorted by their
         leading index.
      2. For background (`signal == 0` with `dsid > 360000`) and then signal
         (`signal == 1`), draw at most `(args.train + args.test) * 1E+06`
         entries without replacement; entries outside the current selection
         are always kept.
      3. Add `weight_train` and `weight_adv` columns, filled per class from
         `BinsReweighter` fits against uniformly distributed pT targets, and
         rescale every `weight_*` column to unit mean within each class.
      4. Add a `train` column and set it to 1 for `args.train * 1E+06`
         randomly chosen entries of each class.
      5. Shuffle all entries and write them to
         `./reweightDatasets/extractedData.h5`.

    Command-line options come from the module-level `parser`. `args.dir` is
    only normalised and echoed; the input glob pattern is hard-coded.
    """

    # For reproducibility
    np.random.seed(21)

    # Parse command-line arguments
    args = parser.parse_args()

    # Directory names are expected to end with a slash
    if not args.dir.endswith('/'):
        args.dir += '/'

    print("Reading and reweighting, splitting files in:\n  {}".format(args.dir))

    # NOTE: the input location is hard-coded; `args.dir` is only echoed above
    paths = sorted(glob.glob("./extractedHbbTopDatasets/*.h5"))
    print("Found {} files.".format(len(paths)))

    # Reading input HDF5 file(s)
    with Profile("Reading input HDF5 file(s)"):

        # Run batched conversion in parallel
        result_queue = multiprocessing.Queue()
        indexed_paths = list(enumerate(paths))
        parts = run_batched(FileLoader, indexed_paths, queue=result_queue,
                            max_processes=args.max_processes)

        # Stack the parts in sorted order of their leading index, for
        # reproducibility; the second entry of each part is the array
        ordered = sorted(parts, key=lambda part: part[0])
        arrays = zip(*ordered)[1]
        data = np.lib.recfunctions.stack_arrays(arrays, autoconvert=True,
                                                usemask=False)

    print("Found {} samples.".format(data.shape[0]))

    # Subsample
    with Profile("Subsample"):

        # Upper limit on the number of entries drawn per category
        num_sample = int((args.train + args.test) * 1E+06)

        for sig in [0, 1]:

            # Entries belonging to the current category; background is
            # additionally restricted to dsid > 360000
            selected = data['signal'] == sig
            if sig == 0:
                selected = selected & (data["dsid"] > 360000)

            # Everything outside the current category is kept untouched
            rest = ~selected

            available = selected.sum()
            if num_sample > available:
                print("[WARNING] Requested {:.1e} samples, but only {:.1e} are availabe in current mask. Using all available samples.".format(num_sample, available))
                chosen = np.ones(selected.shape, dtype=bool)
            else:
                picked = np.random.choice(np.where(selected)[0], num_sample,
                                          replace=False)
                chosen = np.zeros(selected.shape, dtype=bool)
                chosen[picked] = True

            data = data[chosen | rest]

    # Re-weighting
    with Profile("Re-weighting"):

        # New weight columns, initialised to one
        for column in ('weight_train', 'weight_adv'):
            data = append_fields(data, column,
                                 np.ones_like(data['weight_test']))

        # All weight columns present from here on (including the new ones)
        weight_variables = [name for name in data.dtype.names
                            if name.startswith('weight_')]

        # Signal and background are re-weighted separately
        for sig in [0, 1]:
            in_class = data['signal'] == sig

            # Flat pT: uniform target over the pT range of this class
            pt_class = data['pt'][in_class]
            lo, hi = pt_class.min(), pt_class.max()
            flat_target = np.random.rand(pt_class.size) * (hi - lo) + lo

            flat_reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            flat_reweighter.fit(pt_class, target=flat_target)
            data['weight_train'][in_class] = \
                flat_reweighter.predict_weights(pt_class)

            # (Flat-pT, physical-m): start from `weight_test`; the uniform
            # target spans the pT range of the full dataset
            pt_class = data['pt'][in_class]
            base_weight = data['weight_test'][in_class]
            ptmin, ptmax = data['pt'].min(), data['pt'].max()
            adv_target = \
                np.random.rand(in_class.sum()) * (ptmax - ptmin) + ptmin

            adv_reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            adv_reweighter.fit(pt_class, original_weight=base_weight,
                               target=adv_target)
            data['weight_adv'][in_class] = adv_reweighter.predict_weights(
                pt_class, original_weight=base_weight)

            # Standardise weight variables within the current class
            for var in weight_variables:
                print("  Ensuring unit mean for {}".format(var))
                data[var][in_class] /= data[var][in_class].mean()

    # Train/test split
    with Profile("Performing train/test split"):
        is_signal = data['signal'] == 1
        is_background = ~is_signal
        num_sig = is_signal.sum()
        num_bkg = is_background.sum()
        num_train = int(args.train * 1E+06)

        print("Found {:.1e} signal and {:.1e} background samples.".format(num_sig, num_bkg))
        print("Using {:.1e} samples for training for each class, leaving {:.1e} signal and {:.1e} background samples for testing.".format(num_train, num_sig - num_train, num_bkg - num_train))

        # Draw the training indices, signal first and background second
        train_indices = [
            np.random.choice(np.where(class_mask)[0], num_train,
                             replace=False)
            for class_mask in (is_signal, is_background)
        ]

        # Flag column: 1 for training entries, 0 for everything else
        data = append_fields(data, 'train',
                             np.zeros_like(data['signal']).astype(int))
        for indices in train_indices:
            data['train'][indices] = 1

    # Shuffle
    with Profile("Shuffling samples"):
        order = np.arange(data.shape[0])
        np.random.shuffle(order)
        data = data[order]

    # Writing output HDF5 file
    with Profile("Writing output HDF5 file"):
        save_hdf5(data, './reweightDatasets/extractedData.h5')
Beispiel #20
0
def main (args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5') #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5') #, test=True)

    #data = np.concatenate((data1, data2))

    sigNumber = 0

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 1 TeV',
        2: 'Model A, m = 1.5 TeV',
        3: 'Model A, m = 2 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
        }

    outFile = ROOT.TFile.Open("figures/sensitivity_targetEff{}.root".format(kNN_eff),"RECREATE")

    histstyle[True] ['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars = [kNN_var]
    base_vars = ['lead_'+base_var, 'sub_'+base_var]
    kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    
    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_'+kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_'+kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)

    # Check variable distributions
        
    weight = 'weight'  # 'weight_test' / 'weight'
    scale = 139*1000000 # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber==0:
        msk_sig = data['signal'] == 1 
    else:
        msk_sig = data['sigType'] == sigNumber 


    knnBins = np.linspace(-100, 200, 75, endpoint=True)
    effBins = np.linspace(0,1,100, endpoint=True)

    for var in kNN_vars:
        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=False)

        #h1_CR = c_tmp.hist(data.loc[msk_CR_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_CR_bkg, weight].values, normalise=False)
        #h2_CR = c_tmp.hist(data.loc[msk_CR_sig, var].values, bins=knnBins, weights=data.loc[msk_CR_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()
        #print "bkg. CR efficiency: ", h1_CR.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()
        #print "sig. CR efficiency: ", h2_CR.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3./2 + np.sqrt(h1_incl.GetEffectiveEntries()) )
        print "Sensitivity with no cut: ", normFactor

        ### sensitivity ###
        sensitivity, bkg_eff_1jet = array( 'd' ), array( 'd' )
        #sensitivity = []
        #bkg_eff_1jet = []
        i = 0
        for cut in knnBins:

            msk_pass = (data[kNN_vars[0]]>cut) & (data[kNN_vars[1]]>cut)
            msk_pass1 = data[var]>cut
            #msk_pass = (data[var]>cut)
            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass

            msk_bkg_pass1 = msk_bkg & msk_pass1
            msk_sig_pass1 = msk_sig & msk_pass1

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=knnBins, weights=data.loc[msk_sig_pass, weight].values, normalise=False)

            h1_pass1 = c_tmp.hist(data.loc[msk_bkg_pass1, var].values, bins=knnBins, weights=data.loc[msk_bkg_pass1, weight].values, normalise=False)

            if ( h2_incl.GetEffectiveEntries()>0 ) : #and h1_pass.GetEffectiveEntries()>0) :
                sensitivity.append( ((h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()) / (3./2 + np.sqrt(h1_pass.GetEffectiveEntries()) )) / normFactor )

                #print "bkg. eff. @ " , cut, ": ", h1_pass.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()  
                #print "signal eff. @ ", cut, ": ", h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()
                #print "Sensitivity gain@ ", cut, ": ", ((h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()) / (3./2 + np.sqrt(h1_pass.GetEffectiveEntries())) ) / normFactor

            else: 
                sensitivity.append(0)

            if (h1_incl.GetEffectiveEntries()>0 ) :
                bkg_eff_1jet.append(h1_pass1.GetEffectiveEntries()/h1_incl.GetEffectiveEntries())
            else:
                bkg_eff_1jet.append(0)
                
            i = i+1

        #c.pads()[0].ylim(0,0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100,200)
        c.pads()[1].ylim(0,30)
        c.pads()[1].xlim(-100,200)
        c.pads()[1].graph( sensitivity, bins=knnBins) #, oob=False )

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff)) #latex(var, ROOT=True))
        c.xlabel("n_{trk}^{#epsilon}") #latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")#"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})/")
        c.pads()[1].text(["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})", 
                ], xmin=0.2, ymax=0.80, ATLAS=False)

        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))

        del c

        gr_sen = ROOT.TGraph(len(sensitivity), knnBins, sensitivity)
        gr_eff = ROOT.TGraph(len(bkg_eff_1jet), knnBins, bkg_eff_1jet)

        gr_more = ROOT.TGraph(len(sensitivity), bkg_eff_1jet, sensitivity)

        gr_sen.GetXaxis().SetTitle("#it{n}_{trk}^{#epsilon}-cut")
        gr_sen.GetYaxis().SetTitle("Sensitivity gain")
        gr_eff.GetYaxis().SetTitle("Single jet #varepsilon_{B}")
        gr_sen.GetYaxis().SetAxisColor(ROOT.kOrange+2)
        gr_eff.GetYaxis().SetAxisColor(ROOT.kGreen+2)
        gr_sen.SetMarkerColor(ROOT.kOrange+2)
        gr_eff.SetMarkerColor(ROOT.kGreen+2)
        gr_eff.SetDrawOption("Y+")

        c2 = rp.canvas(batch=True)
        c2.pads()[0].logx()
        c2.pads()[0].cd()
        #c2.pads()[0].graph(sensitivity, bkg_eff_1jet)
        gr_more.GetXaxis().SetTitle("Single jet #varepsilon_{B}")
        gr_more.GetYaxis().SetTitle("Sensitivity gain")
        #gr_more.GetXaxis().SetRangeUser(0, 0.02)
        gr_more.Draw("AP")


        #c2 = ROOT.TCanvas("can2", "", 200,10,700,500) #(batch=True)
        #pad1 = ROOT.TPad("pad1", "", 0,0,1,1) #c2.pads()[0]._bare()
        #pad1.Draw()
        #pad1.cd()
        #gr_sen.Draw("AP")
        

        #c2.cd()
        #pad2 = ROOT.TPad("pad2", "", 0,0,1,1) #c2.pads()[0]._bare()
        #pad2.SetFillStyle(4000)
        #pad2.Draw()
        #pad2.cd()
        #gr_eff.Draw("PY+")

        #gr_eff.Draw("APY+")
        #gr_sen.Draw("SAME")

        #gr_sen = c2.graph(sensitivity, bins=knnBins, markercolor=ROOT.kOrange+2)
        #gr_eff = c2.graph(bkg_eff_1jet, bins=knnBins, markercolor=ROOT.kGreen+2, option='Y+' )
        #gr_eff.GetYaxis.SetRange(0,1)
        #gr_eff.Draw("SAME Y+")
        #c2.xlabel("Single jet #varepsilon_{B}")
        #c2.ylabel("Sensitivity gain")
        #c2.text(["#epsilon=0.5 %",], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')

        c2.save('figures/distributions/sensitivity_{}_eff{}_1jet.pdf'.format(var,kNN_eff) )
        del c2

        outFile.cd()
        gr_more.SetName("sensitivity_eff{}".format(kNN_eff))
        gr_more.Write()
        outFile.Close()

        #print 'figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff)
        pass
    

    # Plot also the normal ntrk distribution for cross check with Roland's result
    """