Example 1
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  train=True,
                                  background=True)

    #variable = "tau21"
    #bins = TAU21BINS
    variable = "N2_B1"
    bins = N2BINS
    #variable = "decDeepWvsQCD"
    #bins = DECDEEPBINS
    #variable = "DeepWvsQCD"
    #bins = DEEPBINS

    # Add CSS variable
    add_css(variable, data)

    # Plot CSS distributions for each mass bin
    plot_distributions(data, variable, bins)

    return 0
Example 2
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  train=True,
                                  background=True)

    # Fill Tau21 profile
    profile = fill_profile(data, VAR_TAU21)

    # Fit profile
    fit = ROOT.TF1('fit', 'pol1', *FIT_RANGE)
    profile.Fit('fit', 'RQ0')
    intercept_val, coef_val = fit.GetParameter(0), fit.GetParameter(1)
    intercept_err, coef_err = fit.GetParError(0), fit.GetParError(1)

    # Create scikit-learn transform
    ddt = LinearRegression()
    ddt.coef_ = np.array([coef_val])
    ddt.intercept_ = np.array([-coef_val * FIT_RANGE[0]])
    ddt.offset_ = np.array([coef_val * FIT_RANGE[0] + intercept_val])
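    # With these parameters, ddt.predict(x) = coef * (x - FIT_RANGE[0]): the
    # fitted linear trend measured relative to the start of the fit range.
    # `offset_` stores the fitted profile value at FIT_RANGE[0] itself.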

    print "Fitted function:"
    print "  intercept: {:7.4f} ± {:7.4f}".format(intercept_val, intercept_err)
    print "  coef:      {:7.4f} ± {:7.4f}".format(coef_val, coef_err)

    # Save DDT transform
    saveclf(ddt, 'models/ddt/ddt.pkl.gz')

    return 0
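A minimal sketch of how the saved transform might be applied downstream; `loadclf` is the counterpart of `saveclf` seen in the later snippets, while the `rho` and `tau21` column names are assumptions for illustration:

def add_ddt(data, path='models/ddt/ddt.pkl.gz', shift=0.):
    # Hypothetical helper: subtract the fitted linear rho-dependence so the
    # resulting tau21DDT profile is flat versus rho.
    ddt = loadclf(path)
    rho = data['rho'].values.reshape(-1, 1)
    data['tau21DDT'] = data['tau21'].values - ddt.predict(rho) + shift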
Example 3
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', sample=0.01)  # @TEMP

    # Define classifier configuration(s)
    pattern = 'uboost_ur_{:4.2f}_te_92_rel21_fixed'
    urs = sorted([0.0, 0.01, 0.1, 0.3])
    classifiers = [
        ('AdaBoost' if ur == 0 else 'uBoost (#alpha={:4.2f})'.format(ur),
         pattern.format(ur).replace('.', 'p')) for ur in urs
    ]

    # Compute classifiers variables in parallel
    njobs = min(7, len(classifiers))
    with Profile("Run tests in parallel"):
        ret = Parallel(n_jobs=njobs)(delayed(compute)(data, name)
                                     for _, name in classifiers)
        pass

    # Add classifier variables to data
    for name, staged_series in ret:
        for stage, series in enumerate(staged_series):
            data['{:s}__{:d}'.format(name, stage)] = series
            pass
        pass

    # Plot learning curves
    plot(data, urs, classifiers)

    return 0
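For reference, a minimal sketch of what the parallelised `compute` step might look like, assuming the models were saved as in the uBoost training snippet (Example 18), pandas is imported as `pd`, and the classifier exposes a sklearn-style `staged_decision_function` iterator:

def compute(data, name):
    # Hypothetical implementation: evaluate one (u)BDT stage by stage so that
    # learning curves can be drawn from the staged outputs.
    clf = loadclf('models/uboost/{}.pkl.gz'.format(name))
    staged_series = [pd.Series(scores, index=data.index)
                     for scores in clf.staged_decision_function(data[features])]
    return name, staged_series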
Example 4
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(
        'data/' + args.input)  # train=True removed since we use the data file

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 10%
    eff_sig = 0.10
    fpr, tpr, thresholds = roc_curve(data['signal'],
                                     data[VAR],
                                     sample_weight=data['weight'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(
        eff_sig * 100., (fpr[idx]) * 100., VAR,
        thresholds[idx])  #changed from 1-fpr[idx]
    #print "Signal efficiency @ {:.2f}% bkg. acc.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., (fpr[idx]) * 100., VAR, thresholds[idx]) #changed from 1-fpr[idx]
    print "Chosen target efficiency: {:.2f}%".format(EFF)

    # Filling profile
    data = data[data['signal'] == 0]
    profile_meas, (x, y, z) = fill_profile(data)

    # Format arrays
    X = np.vstack((x.flatten(), y.flatten()))
    X = X.T
    Y = z.flatten()

    # Fit KNN regressor
    print "debug: X.shape = ", X.shape, ", Y.ndim = ", Y.ndim

    knn = KNeighborsRegressor(weights='distance')
    knn.fit(X, Y)

    # Save KNN classifier
    saveclf(knn, 'models/knn/knn_{:s}_{}_{}.pkl.gz'.format(VAR, EFF, MODEL))

    return 0
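A minimal sketch of how the saved regressor is consumed (cf. `add_knn` imported from `run.knn.common` in the later snippets); the axis variables `VARX` and `VARY` are assumed to be the same ones used to fill the profile, and any standardisation step is omitted:

def add_knn(data, newfeat, path):
    # Hypothetical sketch: kNN-corrected discriminant = raw variable minus
    # the fitted cut profile evaluated at each jet's (VARX, VARY).
    knn = loadclf(path)
    X_eval = np.vstack((data[VARX].values, data[VARY].values)).T
    data[newfeat] = data[VAR].values - knn.predict(X_eval)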
Example 5
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    #data, _, _ = load_data(args.input + 'data.h5', train=True)
    data, _, _ = load_data(args.input + 'data.h5', train_full_signal=True)

    variable = VAR_TAU21
    signal_above = False
    bg_eff = TAU21_EFF
    #variable = VAR_N2; signal_above=False
    #bg_eff = N2_EFF
    #variable = VAR_DECDEEP; signal_above=True
    #bg_eff = DECDEEP_EFF
    #variable = VAR_DEEP; signal_above=True
    #bg_eff = DEEP_EFF

    ## training on a list of working points:
    #for bg_eff in WORKING_POINTS:
    #    train(data, variable, bg_eff, signal_above=signal_above)
    #print "reached end of main()"
    #return 0

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 50%
    eff_sig = 0.5
    fpr, tpr, thresholds = roc_curve(data['signal'],
                                     data[variable],
                                     sample_weight=data['weight_test'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} < {:.2f})".format(
        eff_sig * 100., (1 - fpr[idx]) * 100., variable, thresholds[idx])
    print "Chosen target efficiency: {:.2f}%".format(bg_eff)
    ## NB: if the signal lies above the background, the background efficiency should be quoted as (100 - background efficiency)

    train(data, variable, bg_eff, signal_above=signal_above)
    print "reached end of main()"
    return 0
Example 6
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    # Fill substructure profile
    perform_optimisation("D2", D2BINS, data)
    return
Example 7
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  train=True,
                                  background=True)

    #variable = VAR_TAU21
    variable = VAR_N2
    #variable = VAR_DECDEEP
    #variable = VAR_DEEP

    # Fill variable profile
    profile = fill_profile(data, variable)

    # Fit profile
    if variable == VAR_N2:
        fit_range = FIT_RANGE_N2
    elif variable == VAR_TAU21:
        fit_range = FIT_RANGE_TAU21
    elif variable == VAR_DECDEEP:
        fit_range = FIT_RANGE_DECDEEP
    elif variable == VAR_DEEP:
        fit_range = FIT_RANGE_DEEP
    else:
        print "variable invalid"
        return 0
    fit = ROOT.TF1('fit', 'pol1', *fit_range)
    profile.Fit('fit', 'RQ0')
    intercept_val, coef_val = fit.GetParameter(0), fit.GetParameter(1)
    intercept_err, coef_err = fit.GetParError(0), fit.GetParError(1)

    # Create scikit-learn transform
    ddt = LinearRegression()
    ddt.coef_ = np.array([coef_val])
    ddt.intercept_ = np.array([-coef_val * fit_range[0]])
    ddt.offset_ = np.array([coef_val * fit_range[0] + intercept_val])

    print "Fitted function:"
    print "  intercept: {:7.4f} ± {:7.4f}".format(intercept_val, intercept_err)
    print "  coef:      {:7.4f} ± {:7.4f}".format(coef_val, coef_err)

    # Save DDT transform
    saveclf(ddt, 'models/ddt/ddt_{}.pkl.gz'.format(variable))
    print "got to the end of main()"
    return 0
Example 8
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  background=True,
                                  train=True)

    # Fill substructure profile
    perform_optimisation("tau21", TAU21BINS, data)
    perform_optimisation("N2_B1", N2BINS, data)
    #perform_optimisation("decDeepWvsQCD", DECDEEPBINS, data)
    #perform_optimisation("DeepWvsQCD", DEEPBINS, data)
    return
Example 9
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  train=True,
                                  background=True)

    # Add CSS variable
    var = "D2"
    add_css(var, data)

    # Plot D2(CSS) distributions for each mass bin
    plot_distributions(data, var)

    return 0
Example 10
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5')

    msk = data['train'] == 1

    features = filter(lambda s: s.startswith('fjet_'), list(data))

    X = data[features]
    y = data['signal']
    w = data['mcEventWeight']

    dtrain = xgb.DMatrix(X[msk], label=y[msk], weight=w[msk])
    dtest = xgb.DMatrix(X[~msk], label=y[~msk], weight=w[~msk])

    param = {
        'max_depth': 4,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 100
    bst = xgb.train(param, dtrain, num_round)

    # make prediction
    preds = bst.predict(dtest)
    importance = bst.get_fscore()

    for name, score in sorted(list(importance.iteritems()),
                              key=lambda t: t[1],
                              reverse=True):
        print "  {:15s}: {:4.1f}".format(name, score)
        pass

    return 0
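Note that `preds` is computed but never used above; a natural follow-up, assuming scikit-learn is available, is a weighted AUC on the held-out split:

from sklearn.metrics import roc_auc_score

# Weighted test AUC; `preds` are P(signal) from the 'binary:logistic' objective.
auc = roc_auc_score(y[~msk], preds, sample_weight=w[~msk])
print "  test AUC: {:.4f}".format(auc)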
Example 11
File: loss.py Project: nethemis/ANN
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Common definitions
    num_folds = 3

    # Perform classifier loss study
    plot_classifier_training_loss(num_folds)

    # Compute entropy of decorrelation variable posterior
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)
    decorrelation = get_decorrelation_variables(data)
    H_prior = entropy(decorrelation, weights=data['weight_adv'])
    print "Entropy of prior: {}".format(H_prior)

    # Perform adversarial loss study
    for lambda_reg in [10, 100]:
        plot_adversarial_training_loss(lambda_reg, num_folds, 10, H_prior)
        pass

    return 0
Example 12
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)
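        # e.g. meaningful_digits(100.) -> '100', meaningful_digits(0.3) -> '0.3',
        #      meaningful_digits(0.01) -> '0.01'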

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 100.
    lambda_regs = sorted([100.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    print "ann_var"
    print ann_var

    # Tagger feature collection
    # tagger_features = ['NN', ann_var]
    tagger_features = ['NN', ann_var, 'MV2c10', 'XbbScoreHiggs']
    # tagger_features = ['MV2c10']

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
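                # NB: `classifier` shares its layers with `combined`, so
                # loading the combined weights presumably refreshes the
                # classifier evaluated by add_nn below.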
                add_nn(data, classifier, ann_var_)
                pass
            pass

        with Profile("MV2c10"):
            data["MV2c10"] = pd.concat(
                [data["MV2c10_discriminant_1"], data["MV2c10_discriminant_2"]],
                axis=1).min(axis=1)

        # Add MV2 and XbbScore here
        # e.g. min(MV2_sj1, MV2_sj2)

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars +
                         ['mass', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars)

    return 0
Example 13
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data)

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))
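            # (np.interp maps the up-sampled bin indices onto the original
            #  [vmin, vmax] axis, yielding `rebin` times finer edges.)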

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for is_fit in [False, True]:

            # Select correct profile
            profile = profile_fit if is_fit else profile_meas

            # Plot
            plot(profile, is_fit)
            pass
        pass

    return
Example 14
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(
        'data/' +
        args.input)  # train=True removed since we use the data file

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 10%
    eff_sig = 0.10
    fpr, tpr, thresholds = roc_curve(data['signal'],
                                     data[VAR],
                                     sample_weight=data['TotalEventWeight'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(
        eff_sig * 100., (fpr[idx]) * 100., VAR,
        thresholds[idx])  #changed from 1-fpr[idx]
    #print "Signal efficiency @ {:.2f}% bkg. acc.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., (fpr[idx]) * 100., VAR, thresholds[idx]) #changed from 1-fpr[idx]
    print "Chosen target efficiency: {:.2f}%".format(EFF)

    # Filling profile
    data = data[data['signal'] == 0]
    profile_meas, (x, y, err) = fill_profile_1D(data)

    # Format arrays
    X = x.reshape(-1, 1)
    weights = 1 / err

    print X
    # Fit KNN regressor
    if 'knn1D' == FIT:
        knn = KNeighborsRegressor(5, weights='distance')
        knn.fit(X, y)  #.predict(X)

    elif 'knn1D_v2' in FIT:
        knn = KNeighborsRegressor(5, weights='uniform')
        knn.fit(X, y)  #.predict(X)

    elif 'knn1D_v3' in FIT:
        knn = KNeighborsRegressor(2, weights='uniform')
        knn.fit(X, y)  #.predict(X)

    elif 'knn1D_v4' in FIT:
        knn = KNeighborsRegressor(3, weights='distance')
        knn.fit(X, y)  #.predict(X)

    elif 'poly2' in FIT:
        knn = make_pipeline(PolynomialFeatures(degree=2), Ridge())
        knn.fit(X, y)  #.predict(X)
        #knn1 = PolynomialFeatures(degree=2)
        #knn1.fit(X, y)
        #X_poly = knn1.fit_transform(X)
        #knn = LinearRegression() #fit_intercept=False)
        #knn.fit(X_poly, y, weights)
        #score = round(reg.score(X_poly, y), 4)
        #coef = reg.coef_
        #intercept = reg.intercept_

        #print score, coef, intercept
        #knn.fit(X, y)#.predict(X)
        #print "Fit parameters: ", knn.transform(X).shape #get_feature_names() #get_params() #knn.coef_

    elif 'poly3' in FIT:
        knn = make_pipeline(PolynomialFeatures(degree=3), Ridge())
        knn.fit(X, y)  #.predict(X)

    # Create scikit-learn transform
    elif 'lin' in FIT:
        knn = LinearRegression()
        knn.fit(X, y, sample_weight=weights)
        print "Fitted function:"
        print "  coef:      {}".format(knn.coef_)
        print "  intercept: {}".format(knn.intercept_)

    elif 'erf' in FIT:
        knn, pcov = curve_fit(func, x, y, p0=[73, 0.0004, 2000])
        print "ERF: ", knn

    else:
        # `knn` would be undefined here, so bail out before saveclf() below
        print "Unknown FIT type chosen: {}".format(FIT)
        return 1
        #coef_val = np.polyfit(x, y, deg=1, w=weights)

        #knn.coef_      = np.array([coef_val[0]])
        #knn.intercept_ = np.array([coef_val[1]]) #[-coef_val[0] * FIT_RANGE[0]])
        #knn.offset_    = np.array([coef_val[0] * FIT_RANGE[0] + coef_val[1]])

    # Save DDT transform
    saveclf(knn,
            'models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    # Save fit parameters to a ROOT file

    #TCoef = ROOT.TVector3(coef[0], coef[1], coef[2])
    #outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_data.root".format(FIT, EFF, MIN_STAT),"RECREATE")
    #outFile.cd()
    #TCoef.SetName("coefficients")
    #TCoef.Write()
    #outFile.Close()

    return 0
Example 15
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data('data/' + args.input, test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    #kNN_var = 'D2-k#minusNN'
    #kNN_var = 'C1_02-knn'
    #base_var = 'sub_jet_ntrk'
    #kNN_var = base_var.replace('sub_jet_', '') + '-knn'

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]

    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')

    #base_var = ['jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]

    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    """
    # -- Adversarial neural network (ANN) scan
    lambda_reg  = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars    = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur  = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var  =  'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)
    """
    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['lead_jet_C1_02', kNN_var]
    tagger_features = [
        'lead_' + base_var, 'lead_' + kNN_var, 'sub_' + base_var,
        'sub_' + kNN_var
    ]

    #tagger_features = base_var + kNN_var

    # Add variables
    # --------------------------------------------------------------------------

    with Profile("Add variables"):
        #for i in range(len(base_var)):
        from run.knn.common import add_knn, MODEL as sigModel, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data,
                newfeat='lead_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(
                    base_var, kNN_eff, sigModel))
        add_knn(data,
                newfeat='sub_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(
                    base_var, kNN_eff, sigModel))
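        # (The path format matches the kNN model saved by the training step:
        #  'models/knn/knn_{VAR}_{EFF}_{MODEL}.pkl.gz'.)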

    # Remove unused variables
    used_variables = set(tagger_features +
                         ['lead_jet_m', 'lead_jet_pt', 'dijetmass', 'weight'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features)

    return 0
Example 16
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    mc, features, _ = load_data('data/djr_LCTopo_2.h5')  #, test=True) #
    data, features, _ = load_data('data/djr_LCTopo_data.h5')  #, test=True) #

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, Model A, m = 2 TeV'

    # Add knn variables

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = ['lead_'+base_var, 'sub_'+base_var]
    #kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]
    """
    with Profile("Add variables"):
        #for i in range(len(base_var)):                                               
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(mc, newfeat='lead_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(mc, newfeat='sub_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
    """
    #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

    bins_pt = np.linspace(450, 5000, 50)

    # Useful masks
    msk_bkg_data = data['signal'] == 0
    msk_bkg_mc = (mc['signal'] == 0)  #& (mc['weight']<0.0002)
    msk_sig_mc = (mc['signal'] == 1)  #& (mc['weight']<0.0002)

    msk_CR = (mc['lead_jet_ungrtrk500'] < 20) | (mc['sub_jet_ungrtrk500'] < 20)

    scale = 139 * 1000000  # (inverse nanobarn)

    # pT dist
    c = rp.canvas(batch=True)
    hist_incl_data = c.hist(data.loc[msk_bkg_data, 'jet_pt'].values,
                            bins=bins_pt,
                            weights=data.loc[msk_bkg_data, 'weight'].values,
                            label="Data, control region",
                            normalise=False,
                            linecolor=ROOT.kGreen + 2)

    hist_incl_mc = c.hist(mc.loc[msk_bkg_mc, 'sub_jet_pt'].values,
                          bins=bins_pt,
                          weights=scale * mc.loc[msk_bkg_mc, 'weight'].values,
                          label="MC, scaled with lumi",
                          normalise=False,
                          linecolor=ROOT.kViolet + 2)

    hist_incl_sig = c.hist(mc.loc[msk_sig_mc, 'sub_jet_pt'].values,
                           bins=bins_pt,
                           weights=mc.loc[msk_sig_mc, 'weight'].values,
                           label="Combined Signal",
                           normalise=False,
                           linecolor=ROOT.kOrange + 2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("Sub-leading jet pT [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    #c.save('figures/distributions/mjj_Bkg_CR20.pdf'.format(knnCut))
    #c.save('figures/distributions/mjj_Bkg_CR20.eps'.format(knnCut))
    c.save('figures/distributions/sub_pt_bkg_data_mc.pdf')
    c.save('figures/distributions/sub_pt_bkg_data_mc.eps')

    print "Data bkg effective entries: ", hist_incl_data.GetEffectiveEntries()
    print "MC bkg effective entries: ", hist_incl_mc.GetEffectiveEntries()

    print "Data bkg integral: ", hist_incl_data.Integral()
    print "MC bkg integral: ", hist_incl_mc.Integral()

    del c

    c = rp.canvas(batch=True)
    hist_bkg_CR = c.hist(mc.loc[(msk_bkg_mc & msk_CR), 'lead_jet_pt'].values,
                         bins=bins_pt,
                         weights=scale *
                         mc.loc[(msk_bkg_mc & msk_CR), 'weight'].values,
                         label="MC, control region",
                         normalise=False,
                         linecolor=ROOT.kGreen + 2)

    hist_sig_CR = c.hist(mc.loc[(msk_sig_mc & msk_CR), 'lead_jet_pt'].values,
                         bins=bins_pt,
                         weights=mc.loc[(msk_sig_mc & msk_CR),
                                        'weight'].values,
                         label="MC, control region",
                         normalise=False,
                         linecolor=ROOT.kGreen + 2)

    print "CR sig contamination (eff. entries): ", hist_sig_CR.GetEffectiveEntries(
    ) / (hist_bkg_CR.GetEffectiveEntries() + hist_sig_CR.GetEffectiveEntries())
    print "CR sig contamination (integral): ", hist_sig_CR.Integral() / (
        hist_bkg_CR.Integral() + hist_sig_CR.Integral())

    print "CR sig efficiency (eff. entries): ", hist_sig_CR.GetEffectiveEntries(
    ) / hist_incl_sig.GetEffectiveEntries()
    print "CR sig efficiency (integral): ", hist_sig_CR.Integral(
    ) / hist_incl_sig.Integral()
Example 17
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    pt_bins = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = zip(pt_bins[:-1], pt_bins[1:])
    bins = np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True)

    for pt_bin in pt_bins:

        histstyle[True] ['label'] = 'Inclusive'
        histstyle[False]['label'] = 'p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(*pt_bin)

        # Canvas
        c = rp.canvas(batch=True)

        # Plots
        msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        c.hist(data['m'].values,      bins=bins, weights=data['weight_adv'] .values,      normalise=True, **histstyle[True])
        c.hist(data['m'].values[msk], bins=bins, weights=data['weight_adv'] .values[msk], normalise=True, **histstyle[False])
        c.hist(data['m'].values[msk], bins=bins, weights=data['weight_test'].values[msk], normalise=True, label="Testing weight", linewidth=2, linecolor=ROOT.kGreen)

        # Decorations
        c.legend()
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Fraction of jets")

        # Save
        c.save('figures/temp_mass_pT{:.0f}_{:.0f}.pdf'.format(*pt_bin))
        pass

    return


    # Perform selection  @NOTE: For Rel. 20.7 only
    #data = data[(data['m']  >  50) & (data['m']  <  300)]
    #data = data[(data['pt'] > 200) & (data['pt'] < 2000)]

    # Add variables  @NOTE: For Rel. 20.7 only
    #data['rho']    = pd.Series(np.log(np.square(data['m']) / np.square(data['pt'])), index=data.index)
    #data['rhoDDT'] = pd.Series(np.log(np.square(data['m']) / data['pt'] / 1.), index=data.index)

    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Check variable distributions
    axes = {
        'pt':   (45, 200, 2000),
        'm':    (50,  50,  300),
        'rho':  (50,  -8,    0),
        'logm': (50,  np.log(50),  np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)
    msk_pt = (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        bins = np.linspace(axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)
        for adv in [0,1]:
            msk  = data['signal'] == 0   # @TEMP signal
            msk &= msk_pt
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            if adv:
                h1 = c.hist(data.loc[msk, var].values, bins=bins, weights=data.loc[msk, weight].values, **opts)
            else:
                h2 = c.hist(data.loc[msk, var].values, bins=bins, weights=data.loc[msk, 'weight_test'].values, **opts)
                pass
            pass

        # Ratio
        c.pads()[1].ylim(0,2)
        c.ratio_plot((h1,h2), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        #c.logy()
        c.text(TEXT + ['p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(pt_range[0], pt_range[1])], qualifier=QUALIFIER)

        # Save
        mkdir('figures/distributions')
        c.save('figures/distributions/incl_{}.pdf'.format(var))
        pass


    # 2D histograms
    msk = data['signal'] == 0
    axisvars = sorted(list(axes))
    for i,varx in enumerate(axisvars):
        for vary in axisvars[i+1:]:
            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk, [varx, vary]].values, 100. * data.loc[msk, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))
            pass
        pass

    return
Example 18
def main(args):

    # Initialising
    # --------------------------------------------------------------------------
    args, cfg = initialise(args)

    # Loading data
    # --------------------------------------------------------------------------
    data, features, _ = load_data(args.input + 'data_1M_10M.h5')
    #data = data.sample(frac=0.5, random_state=32)  # @TEMP
    data = data[data['train'] == 1]

    # Reduce size of data
    drop_features = [
        feat for feat in list(data)
        if feat not in features + ['m', 'signal', 'weight_adv']
    ]
    data.drop(drop_features, axis=1, inplace=True)

    cfg['uBoost']['train_features'] = features
    cfg['uBoost']['random_state'] = SEED
    cfg['DecisionTreeClassifier']['random_state'] = SEED

    # Arrays
    X = data

    #print(X.head())

    w = np.array(data['weight_adv']).flatten()
    y = np.array(data['signal']).flatten()

    # Fit uBoost classifier
    # --------------------------------------------------------------------------
    with Profile("Fitting uBoost classifier"):

        # @NOTE: There might be an issue with the sample weights, because the
        #        local efficiencies computed using kNN does not seem to take the
        #        sample weights into account.
        #
        #        See:
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/uboost.py#L247-L248
        #        and
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/metrics_utils.py#L159-L176
        #        with `divided_weights` not set.
        #
        #        `sample_weight` seems to be used only as a starting point for
        #        the boosting, and so is not used for the efficiency calculation.
        #
        #        If this is indeed the case, it would be possible to simply
        #        sample MC events by their weight, and use `sample_weight = 1`
        #        for all samples passed to uBoost.
        #
        # @NOTE: I have gotten less sure of the above, so probably no panic.
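        #
        # A minimal sketch (an assumption, not the project's code) of that
        # weight-resampling idea: draw events with probability proportional
        # to their MC weight, then pass unit weights to uBoost:
        #
        #     idx = np.random.choice(len(y), size=len(y), replace=True,
        #                            p=w / w.sum())
        #     uboost.fit(X.iloc[idx], y[idx],
        #                sample_weight=np.ones(len(idx)))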

        def train_uBoost(X, y, w, cfg, uniforming_rate):
            """
            ...
            """

            # Create base classifier
            base_tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

            # Update training configuration
            these_cfg = dict(**cfg['uBoost'])
            these_cfg['uniforming_rate'] = uniforming_rate

            # Create uBoost classifier
            uboost = uBoostBDT(base_estimator=base_tree, **these_cfg)

            # Fit uBoost classifier
            uboost.fit(X, y, sample_weight=w)

            return uboost

        #uniforming_rates = [0.0, 0.01, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0]
        uniforming_rates = [0.0, 0.01, 0.1, 0.3, 0.5, 1.0]
        #uniforming_rates = [0.5, 1.0]
        n_jobs = min(7, len(uniforming_rates))  # ...(10, ...

        jobs = [
            delayed(train_uBoost, check_pickle=False)(X, y, w, cfg,
                                                      uniforming_rate)
            for uniforming_rate in uniforming_rates
        ]

        result = Parallel(n_jobs=n_jobs, backend="threading")(jobs)
        pass

    # Saving classifiers
    # --------------------------------------------------------------------------
    for uboost, uniforming_rate in zip(result, uniforming_rates):
        with Profile("Saving classifiers"):

            # Ensure model directory exists
            mkdir('models/uboost/')

            suffix_ur = "ur_{:s}".format(
                ("%.2f" % uniforming_rate).replace('.', 'p'))
            suffix_te = "te_{:d}".format(
                int(cfg['uBoost']['target_efficiency'] * 100))

            # Save uBoost classifier
            with gzip.open(
                    'models/uboost/uboost_{}_{}_rel21_fixed_def_cfg_1000boost.pkl.gz'
                    .format(suffix_ur, suffix_te), 'w') as f:
                pickle.dump(uboost, f)
                pass
            pass
        pass

    return 0
Example 19
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    #data = np.zeros(1, 95213009, 10)
    data, features, _ = load_data(
        'data/djr_LCTopo_2.h5')  # + args.input) #, test=True) #
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5') # + args.input) #, test=True) #
    #data = np.concatenate((data1, data2))

    #f1 = h5py.File('data/djr_LCTopo_1.h5', 'r')
    #f2 = h5py.File('data/djr_LCTopo_2.h5', 'r')

    knnCut = 0
    ntrkCut = 50
    emfracCut = 0.65
    scale = 139 * 1000000  # (inverse nanobarn)
    signal_to_plot = 7

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 2 TeV',
        2: 'Model A, m = 1 TeV',
        3: 'Model A, m = 1.5 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    outHistFile = ROOT.TFile.Open(
        "figures/mjjHistograms_kNN{}_eff{}.root".format(knnCut, kNN_eff),
        "RECREATE")

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[signal_to_plot])

    # Add knn variables

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = ['lead_'+base_var, 'sub_'+base_var]
    #kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    print data.shape

    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data,
                newfeat='lead_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(
                    FIT, base_var, kNN_eff, sigModel))
        add_knn(data,
                newfeat='sub_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(
                    FIT, base_var, kNN_eff, sigModel))

        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff,
                                                      sigModel)
        """
        base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
        kNN_var = [var.replace('jet', 'knn') for var in base_var]
        
        with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        for i in range(len(base_var)):
        add_knn(data, newfeat=kNN_var[i], path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL)
        """

    weight = 'weight'  # 'weight_test' / 'weight'
    bins_pt = np.linspace(450, 3500, 40)
    bins_mjj = np.linspace(0, 8000, 80)

    # Useful masks
    msk_bkg = data['signal'] == 0
    if signal_to_plot == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == signal_to_plot

    #msk_weight = data['weight']<0.2

    msk_knn = (data['lead_knn_ungrtrk500'] > knnCut) & (data['sub_knn_ungrtrk500'] > knnCut)
    msk_ungr = (data['lead_jet_ungrtrk500'] > ntrkCut) & (data['sub_jet_ungrtrk500'] > ntrkCut)
    msk_emfrac = (data['lead_jet_EMFrac'] < emfracCut) & (data['sub_jet_EMFrac'] < emfracCut)

    msk_knn_1 = (data['lead_knn_ungrtrk500'] > knnCut)
    msk_ungr_1 = (data['lead_jet_ungrtrk500'] > ntrkCut)

    #msk_knn = (data['knn_ungrtrk500']>knnCut)
    #msk_ungr = (data['jet_ungrtrk500']>90.0)

    msk_ntrkBkg = msk_ungr & msk_emfrac & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_ntrkSig = msk_ungr & msk_emfrac & msk_sig  #& msk_pt & msk_m & msk_eta

    msk_knnBkg = msk_knn & msk_bkg
    msk_knnSig = msk_knn & msk_sig

    msk_ntrkBkg1 = msk_ungr_1 & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_ntrkSig1 = msk_ungr_1 & msk_sig  #& msk_pt & msk_m & msk_eta
    msk_knnBkg1 = msk_knn_1 & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_knnSig1 = msk_knn_1 & msk_sig  #& msk_pt & msk_m & msk_eta

    msk_inclBkg = msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_inclSig = msk_sig  #& msk_pt & msk_m & msk_eta

    # Mjj dist with cut on ntrk, ungrtrk compared to inclusive selection
    c = rp.canvas(batch=True)
    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values,
                          label="Multijets, Inclusive",
                          normalise=True,
                          linecolor=ROOT.kGreen + 2,
                          linewidth=3)
    hist_knnBkg = c.hist(
        data.loc[msk_knnBkg, 'dijetmass'].values,
        bins=bins_mjj,
        weights=scale * data.loc[msk_knnBkg, weight].values,
        label="Multijets, n_{{trk}}^{{#epsilon}}>{}".format(knnCut),
        normalise=True,
        linecolor=ROOT.kMagenta + 2,
        linestyle=2,
        linewidth=3)

    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, n_{{trk}}>{}".format(ntrkCut),
                          normalise=True,
                          linecolor=ROOT.kOrange + 2,
                          linestyle=2,
                          linewidth=3)
    #hist_CRBkg = c.hist(data.loc[msk_CR_bkg, 'dijetmass'].values, bins=bins_mjj, weights=scale*data.loc[msk_CR_bkg, weight].values, label="CR Bkg, C<20", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    #c.save('figures/distributions/mjj_Bkg_CR20.pdf'.format(knnCut))
    #c.save('figures/distributions/mjj_Bkg_CR20.eps'.format(knnCut))
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.pdf'.format(
        ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.eps'.format(
        ntrkCut, knnCut, FIT))

    del c

    c = rp.canvas(batch=True)
    hist_Sig = c.hist(data.loc[msk_sig, 'dijetmass'].values,
                      bins=bins_mjj,
                      weights=data.loc[msk_sig, weight].values,
                      label="Model A, m = 2 TeV, inclusive",
                      normalise=True,
                      linecolor=ROOT.kGreen + 2)

    hist_knnSig = c.hist(
        data.loc[msk_knnSig, 'dijetmass'].values,
        bins=bins_mjj,
        weights=data.loc[msk_knnSig, weight].values,
        label="Model A, m = 2 TeV, #it{{n}}_{{trk}}^{{#epsilon}}>{}".format(
            knnCut),
        normalise=True,
        linecolor=ROOT.kMagenta + 2,
        linestyle=2)

    hist_ntrkSig = c.hist(
        data.loc[msk_ntrkSig, 'dijetmass'].values,
        bins=bins_mjj,
        weights=data.loc[msk_ntrkSig, weight].values,
        label="Model A, m = 2 TeV, #it{{n}}_{{trk}}>{}".format(ntrkCut),
        normalise=True,
        linecolor=ROOT.kOrange + 2,
        linestyle=2)

    #hist_CRSig = c.hist(data.loc[msk_CR_sig, 'dijetmass'].values, bins=bins_mjj, weights=data.loc[msk_CR_sig, weight].values, label="Sig, CR", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.pdf'.format(
        ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.eps'.format(
        ntrkCut, knnCut, FIT))

    del c

    c = rp.canvas(batch=True)

    hist_knnSig = c.hist(
        data.loc[msk_knnSig, 'dijetmass'].values,
        bins=bins_mjj,
        weights=data.loc[msk_knnSig, weight].values,
        label="Model A, m = 2 TeV, knn_ntrk>{}".format(knnCut),
        normalise=False,
        linecolor=ROOT.kBlue + 1,
        linestyle=1)

    hist_knnBkg = c.hist(data.loc[msk_knnBkg, 'dijetmass'].values,
                         bins=bins_mjj,
                         weights=scale * data.loc[msk_knnBkg, weight].values,
                         label="Multijets, knn_ntrk>{}".format(knnCut),
                         normalise=False,
                         linecolor=ROOT.kMagenta + 2,
                         linestyle=2)

    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, ntrk>{}".format(ntrkCut),
                          normalise=False,
                          linecolor=ROOT.kOrange + 2,
                          linestyle=2)

    c.legend(width=0.4, xmin=0.3, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.pdf'.format(
        knnCut, FIT))
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.eps'.format(
        knnCut, FIT))

    bins_mjj = np.linspace(0, 10000, 50)

    # Unscaled histograms for calculating efficiencies

    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values,
                          normalise=False)

    hist_inclSig = c.hist(data.loc[msk_inclSig, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=data.loc[msk_inclSig, weight].values,
                          normalise=False)

    hist_ntrkSig = c.hist(data.loc[msk_ntrkSig, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=data.loc[msk_ntrkSig, weight].values,
                          normalise=False)

    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values,
                         bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values,
                         normalise=False)

    hist_ntrkSig1 = c.hist(data.loc[msk_ntrkSig1, 'dijetmass'].values,
                           bins=bins_mjj,
                           weights=data.loc[msk_ntrkSig1, weight].values,
                           normalise=False)

    hist_ntrkBkg1 = c.hist(data.loc[msk_ntrkBkg1, 'dijetmass'].values,
                           bins=bins_mjj,
                           weights=data.loc[msk_ntrkBkg1, weight].values,
                           normalise=False)

    hist_knnBkg1 = c.hist(data.loc[msk_knnBkg1, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=data.loc[msk_knnBkg1, weight].values,
                          normalise=False)

    hist_knnSig1 = c.hist(data.loc[msk_knnSig1, 'dijetmass'].values,
                          bins=bins_mjj,
                          weights=data.loc[msk_knnSig1, weight].values,
                          normalise=False)

    print "Bkg inclusive integral: ", hist_inclBkg.GetEffectiveEntries()
    print "Sig inclusive integral: ", hist_inclSig.GetEffectiveEntries()

    print "Bkg pass kNN eff entries / integral: ", hist_knnBkg.GetEffectiveEntries(
    ), hist_knnBkg.Integral()
    print "Sig pass kNN eff entries / integral: ", hist_knnSig.GetEffectiveEntries(
    ), hist_knnSig.Integral()

    print "Bkg pass ntrk eff entries / integral: ", hist_ntrkBkg.GetEffectiveEntries(
    ), hist_ntrkBkg.Integral()
    print "Sig pass ntrk eff entries / integral: ", hist_ntrkSig.GetEffectiveEntries(
    ), hist_ntrkSig.Integral()

    print "Bkg Eff. knn_ntrk> {}, eff. entries: ".format(
        knnCut), 100 * hist_knnBkg.GetEffectiveEntries(
        ) / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. knn_ntrk> {}, eff. entries: ".format(
        knnCut), 100 * hist_knnSig.GetEffectiveEntries(
        ) / hist_inclSig.GetEffectiveEntries()

    print "Bkg Eff. knn_ntrk> {}, integral: ".format(
        knnCut), 100 * hist_knnBkg.Integral() / hist_inclBkg.Integral()
    print "Sig Eff. knn_ntrk> {}, integral: ".format(
        knnCut), 100 * hist_knnSig.Integral() / hist_inclSig.Integral()

    print "Bkg Eff. ntrk>{}, eff. entries: ".format(
        ntrkCut), 100 * hist_ntrkBkg.GetEffectiveEntries(
        ) / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. ntrk>{}, eff. entries: ".format(
        ntrkCut), 100 * hist_ntrkSig.GetEffectiveEntries(
        ) / hist_inclSig.GetEffectiveEntries(
        )  #, hist_ntrkSig.GetEffectiveEntries()

    print "Bkg Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(
        knnCut), 100 * hist_knnBkg1.GetEffectiveEntries(
        ) / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(
        knnCut), 100 * hist_knnSig1.GetEffectiveEntries(
        ) / hist_inclSig.GetEffectiveEntries()

    print "Bkg Eff. 1 jet knn_ntrk> {}, integral: ".format(
        knnCut), 100 * hist_knnBkg1.GetEffectiveEntries(
        ) / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, integral: ".format(
        knnCut), 100 * hist_knnSig1.GetEffectiveEntries(
        ) / hist_inclSig.GetEffectiveEntries()

    outHistFile.cd()
    hist_knnBkg.SetName("bkg_knn")
    hist_knnSig.SetName("sig_knn")
    hist_knnBkg.Write()
    hist_knnSig.Write()
    outHistFile.Close()
    # Mjj dist for CR compared to inclusive selection
    """
Example 20
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  background=True,
                                  train=True)

    pt_bins = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = [None] + zip(pt_bins[:-1], pt_bins[1:])

    vars = ['m', 'pt']
    for var, pt_bin, log in itertools.product(vars, pt_bins, [True, False]):

        if var == 'm':
            bins = np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True)
        else:
            bins = np.linspace(200,
                               2000, (2000 - 200) // 50 + 1,
                               endpoint=True)
            pass

        histstyle[True]['label'] = 'Training weight'
        histstyle[False]['label'] = 'Testing weight'

        # Canvas
        c = rp.canvas(batch=True)

        # Plots
        if pt_bin is not None:
            msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        else:
            msk = np.ones(data.shape[0], dtype=bool)
            pass

        # Both the pT-binned and inclusive cases draw the same two histograms
        c.hist(data[var].values[msk],
               bins=bins,
               weights=data['weight_test'].values[msk],
               normalise=True,
               **histstyle[False])
        c.hist(data[var].values[msk],
               bins=bins,
               weights=data['weight_adv'].values[msk],
               normalise=True,
               **histstyle[True])
        #c.hist(data[var].values,      bins=bins, weights=data['weight_adv'] .values,      normalise=True, **histstyle[True])
        #c.hist(data[var].values[msk], bins=bins, weights=data['weight_adv'] .values[msk], normalise=True, **histstyle[False])
        #c.hist(data[var].values[msk], bins=bins, weights=data['weight_test'].values[msk], normalise=True, label="Testing weight", linewidth=2, linecolor=ROOT.kGreen)

        # Decorations
        c.text(TEXT + ["Multijets", "Training dataset"] +
               (['p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(
                   *pt_bin)] if pt_bin is not None else []),
               qualifier='Simulation Internal')
        c.legend()
        c.xlabel("Large-#it{{R}} jet {:s} [GeV]".format('mass' if var ==
                                                        'm' else 'p_{T}'))
        c.ylabel("Fraction of jets")
        if log:
            c.logy()
            pass

        # Save
        c.save('figures/weighting_{}{:s}{}.pdf'.format(
            'mass' if var == 'm' else var,
            '_pT{:.0f}_{:.0f}'.format(*pt_bin) if pt_bin is not None else '',
            '_log' if log else ''))
        pass

    return

    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Check variable distributions
    axes = {
        'pt': (45, 200, 2000),
        'm': (50, 50, 300),
        'rho': (50, -8, 0),
        'logm': (50, np.log(50), np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)
    msk_pt = (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        bins = np.linspace(axes[var][1],
                           axes[var][2],
                           axes[var][0] + 1,
                           endpoint=True)
        for adv in [0, 1]:
            msk = data['signal'] == 0  # @TEMP signal
            msk &= msk_pt
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            if adv:
                h1 = c.hist(data.loc[msk, var].values,
                            bins=bins,
                            weights=data.loc[msk, weight].values,
                            **opts)
            else:
                h2 = c.hist(data.loc[msk, var].values,
                            bins=bins,
                            weights=data.loc[msk, 'weight_test'].values,
                            **opts)
                pass
            pass

        # Ratio
        c.pads()[1].ylim(0, 2)
        c.ratio_plot((h1, h2), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        #c.logy()
        c.text(TEXT + [
            'p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(pt_range[0],
                                                       pt_range[1])
        ],
               qualifier=QUALIFIER)

        # Save
        mkdir('figures/distributions')
        c.save('figures/distributions/incl_{}.pdf'.format(var))
        pass

    # 2D histograms
    msk = data['signal'] == 0
    axisvars = sorted(list(axes))
    for i, varx in enumerate(axisvars):
        for vary in axisvars[i + 1:]:
            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "",
                           *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk, [varx, vary]].values,
                                 100. * data.loc[msk, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))
            pass
        pass

    return
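
Aside: a minimal numpy-only sketch of the weighted, normalised comparison made in the ratio pads above (training-style vs. testing-style weights). The toy arrays below are assumptions standing in for the dataframe columns, not values from the original samples.

import numpy as np

rng = np.random.RandomState(0)
m = 50. + rng.exponential(scale=80., size=100000)  # toy jet mass [GeV]
w_test = rng.uniform(0.5, 1.5, size=m.size)        # stand-in 'weight_test'
w_adv = np.ones_like(m)                            # stand-in 'weight_adv'

bins = np.linspace(50., 300., 51)
h_test, _ = np.histogram(m, bins=bins, weights=w_test, density=True)
h_adv, _ = np.histogram(m, bins=bins, weights=w_adv, density=True)

# Ratio-pad contents ('adv/test'), guarding against empty bins
ratio = np.divide(h_adv, h_test, out=np.zeros_like(h_adv), where=h_test > 0)
print "adv/test in first five bins: ", ratio[:5]
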
Ejemplo n.º 21
0
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input)  #, test=True)

    EFF = 0.5
    VAR = 'jet_ungrtrk500'
    VARX = 'dijetmass'
    FIT_RANGE = (0, 6000)  # Necessary?

    # Output file for the fitted k-NN cut profile (opened after EFF is defined)
    outFile = ROOT.TFile.Open(
        "figures/knn_jet_ungrtrk500_eff{}_data.root".format(EFF), "RECREATE")

    #eff_sig = 0.50
    #fpr, tpr, thresholds = roc_curve(data['signal'], data[kNN_basevar], sample_weight=data['weight'])
    #idx = np.argmin(np.abs(tpr - eff_sig))
    #print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., (fpr[idx]) * 100., kNN_basevar, thresholds[idx]) #changed from 1-fpr[idx]
    #print "Chosen target efficiency: {:.2f}%".format(kNN_eff)


    weight = 'weight'  # 'weight_test' / 'weight'
    bins_mjj = np.linspace(100, 8000, 20)
    fineBins = np.linspace(100, 8000, 7900)
    fineBinsRe = fineBins.reshape(-1,1)

    percs = []
    for i in range(1, len(bins_mjj)):
        
        msk = (data[VARX] > bins_mjj[i-1]) & (data[VARX] <= bins_mjj[i]) & (data['signal']==0) 

        if np.sum(msk) > 20:  # Ensure sufficient statistics for meaningful percentile. Was 20
            percs.append( wpercentile(data=data.loc[msk, VAR].values, percents=100-EFF, weights=data.loc[msk, weight].values) )#wpercentile
            
        else:
            percs.append(0)

    print "Length of percs: ", len(percs), percs

    percs = percs[0:-1]
    bins_mjj = bins_mjj[0:-1]
    
    X = bins_mjj.reshape(-1,1)
    X = X[1:len(bins_mjj)]


    print len(X), len(percs)

    # Fit parameters
    knn_neighbors = 5
    knn_weights = 'distance'
    fit_deg = 1

    knn = KNeighborsRegressor(n_neighbors=knn_neighbors, weights=knn_weights)
    y_knn = knn.fit(X, percs).predict(fineBinsRe)

    c = rp.canvas(batch=True)
    knnFit = c.plot(y_knn, bins=fineBins, linecolor=ROOT.kRed+2, linewidth=2,
                    linestyle=1, label="knn fit, {}".format(knn_weights), option='L')

    c.save('figures/distributions/percentile_test.pdf')

    outFile.cd()
    knnFit.SetName("kNNfit")
    knnFit.Write()
    outFile.Close()
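
The smoothing step above reduces to fitting a k-NN regressor to binned percentile points and evaluating it on a fine grid; a self-contained sketch with toy percentile values (the real ones come from wpercentile on the background sample):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

bins_mjj = np.linspace(100, 8000, 20)
centres = 0.5 * (bins_mjj[:-1] + bins_mjj[1:])
percs = 70. + 10. * np.exp(-centres / 3000.)  # toy percentile values

knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
knn.fit(centres.reshape(-1, 1), percs)

# Evaluate the smoothed cut profile on a fine dijet-mass grid
fineBins = np.linspace(100, 8000, 7900)
y_knn = knn.predict(fineBins.reshape(-1, 1))
print "smoothed cut at m_jj = 2 TeV: ", knn.predict([[2000.]])[0]
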

    """
Ejemplo n.º 22
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    #initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    #initialise_config(args, cfg)

    # Keras import(s)
    #import keras.backend as K
    #from keras.models import load_model

    # Project import(s)
    #from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    #data, features, _ = load_data(args.input + 'data.h5', test=True)
    data, features, _ = load_data(args.input + 'data.h5',
                                  test_full_signal=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    kNN_var_N2 = 'N_{2}-k#minusNN'
    kNN_var_tau21 = 'tau_{21}-k#minusNN'

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    #lambda_reg  = 10.
    #lambda_regs = sorted([1., 3., 10.])
    #ann_vars    = list()
    #lambda_strs = list()
    #for lambda_reg_ in lambda_regs:
    #    lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
    #    lambda_strs.append(lambda_str)

    #    ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
    #    ann_vars.append(ann_var_)
    #    pass

    #ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    #uboost_eff = 92
    #uboost_ur  = 0.3
    #uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    #uboost_var  =  'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    #uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    #uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title="decDeep"
    tagger_features = [
        'DeepWvsQCD', 'DeepWvsQCDDDT', 'DeepWvsQCD', 'DeepWvsQCDkNN',
        'DeepWvsQCD', 'DeepWvsQCDCSS'
    ]
    title = "Deep"

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        ## Tau21DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        ## N2DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        # DeepQvsQCDDDT
        from run.ddt.common import add_ddt
        add_ddt(data,
                feat='DeepWvsQCD',
                path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        ## Tau21-kNN
        #from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_tau21)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## N2-kNN
        #from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # DeepWvsQCD-kNN
        from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        add_knn(data,
                feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        ## Tau21-CSS
        #from run.css.common import add_css
        #add_css("tau21", data)

        ## N2-CSS
        #from run.css.common import add_css
        #add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        # DeepWvsQCD-CSS
        from run.css.common import add_css
        add_css("DeepWvsQCD", data)

        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    used_variables = set(tagger_features +
                         ['m', 'pt', 'weight_test', 'npv'])  # 'npv' needed for the robustness study
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop() without inplace returns a copy
    gc.collect()

    # Perform performance studies
    #perform_studies (data, args, tagger_features, ann_vars, uboost_vars)
    perform_studies(data, args, tagger_features, title=title)

    return 0
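
For orientation, the DDT correction added through add_ddt amounts to subtracting a fitted linear trend in rhoDDT from the tagger variable. A minimal sketch of that idea; the slope and anchor point below are assumed illustrative numbers, not parameters from the saved models:

import numpy as np

def apply_ddt_like(feat, rhoddt, coef, x0):
    # Subtract the fitted linear profile so the corrected variable is flat
    # in rhoDDT; the correction is defined to vanish at the anchor point x0.
    return feat - coef * (rhoddt - x0)

rng = np.random.RandomState(1)
rho = rng.uniform(1.5, 5.0, size=5)   # toy rhoDDT values
deep = rng.uniform(0.0, 1.0, size=5)  # toy tagger scores
print apply_ddt_like(deep, rho, coef=-0.05, x0=1.5)
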
Ejemplo n.º 23
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data[msk_bkg])

    # Add k-NN variable
    knnfeat = 'knn'
    add_knn(data,
            newfeat=knnfeat,
            path='models/knn/knn_{}_{}.pkl.gz'.format(VAR, EFF))

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        if sig:
            rgbs = [(247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            rgbs = [(255 / 255., 51 / 255., 4 / 255.),
                    (247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(
                np.linspace(0, 1, nb_cols - 1, endpoint=True) *
                (1. - EFF / 100.) + EFF / 100.))
            pass

        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue,
                                             NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [
            np.linspace(AXIS[var][1],
                        AXIS[var][2],
                        AXIS[var][0] + 1,
                        endpoint=True) for var in VARS
        ]
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            msk_pass = data[knnfeat] < 0
            num = data.loc[msk & msk_bin & msk_pass,
                           'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            effs.append(num / den)
            pass

        # Fill profile
        for i, j in itertools.product(*map(range, shape)):

            # Bin edges in x and y
            edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

            # Masks
            msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                    for dim, var in enumerate(VARS)]
            msk_bin = reduce(lambda x, y: x & y, msks)
            data_ = data[msk & msk_bin]

            # Set non-zero bin content
            if np.sum(msk & msk_bin):
                msk_pass = data_[knnfeat] < 0
                num = data.loc[msk & msk_bin & msk_pass,
                               'weight_test'].values.sum()
                den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                eff = num / den
                profile.SetBinContent(i + 1, j + 1, eff)
                pass
            pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARX, ROOT=True) +
                                    " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" %
                                    (latex(VAR, ROOT=True), EFF))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"],
               ATLAS=False)

        # -- Efficiencies
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(
                xt, yt, "%s%.1f%%" %
                ("#bar{#varepsilon}^{rel}_{%s} = " %
                 ('sig' if sig else 'bkg') if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",
                -4.5,
                BOUNDS[0].Eval(-4.5) + 30,
                align=21,
                angle=-37,
                textsize=13,
                textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV",
                -2.5,
                BOUNDS[1].Eval(-2.5) - 30,
                align=23,
                angle=-57,
                textsize=13,
                textcolor=ROOT.kGray + 3)

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            'sig' if sig else 'bkg', VAR, EFF))
        pass

    return
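
The per-bin efficiencies annotated on the map above follow a simple weighted pass/total recipe; a numpy-only sketch with toy jets (all arrays below are assumptions):

import numpy as np

rng = np.random.RandomState(23)
pt = rng.uniform(200., 2000., size=50000)
knn_feat = rng.normal(0., 1., size=pt.size)  # toy decorrelated tagger value
w = rng.uniform(0.5, 1.5, size=pt.size)      # toy 'weight_test'

bins = np.linspace(200., 2000., 10)
for lo, hi in zip(bins[:-1], bins[1:]):
    msk_bin = (pt > lo) & (pt <= hi)
    msk_pass = knn_feat < 0
    eff = w[msk_bin & msk_pass].sum() / w[msk_bin].sum()
    print "pT in [%4.0f, %4.0f] GeV: eff = %.3f" % (lo, hi, eff)
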
Ejemplo n.º 24
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    kNN_var = 'D2-k#minusNN'

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = [
        'uBoost(#alpha={:s})'.format(meaningful_digits(ur))
        for ur in uboost_urs
    ]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(
        uboost_eff)

    # Tagger feature collection
    tagger_features = [
        'Tau21', 'Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var,
        'Adaboost', uboost_var
    ]

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, path='models/ddt/ddt.pkl.gz')

        # D2-kNN
        from run.knn.common import add_knn, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        add_knn(data,
                newfeat=kNN_var,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        # D2-CSS
        from run.css.common import add_css
        add_css("D2", data)

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # Adaboost/uBoost
        with Profile("Adaboost/uBoost"):
            from run.uboost.common import add_bdt
            for var, ur in zip(uboost_vars, uboost_urs):
                var = ('Adaboost' if ur == 0 else var)
                path = 'models/uboost/' + uboost_pattern.format(ur).replace(
                    '.', 'p') + '.pkl.gz'
                print "== Loading model for {}".format(var)
                add_bdt(data, var, path)
                pass

            # Remove `Adaboost` from scan list
            uboost_vars.pop(0)
            pass

        pass

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars + uboost_vars +
                         ['m', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop() without inplace returns a copy
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars, uboost_vars)

    return 0
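
The lambda tags in the combined-model filenames follow from meaningful_digits plus the '.' to 'p' substitution; a small round-trip check:

import numpy as np

def meaningful_digits(number):
    digits = 0
    if number > 0:
        digits = int(np.ceil(max(-np.log10(number), 0)))
    return '{l:.{d:d}f}'.format(d=digits, l=number)

for lam in sorted([1., 3., 10.]):
    tag = meaningful_digits(lam).replace('.', 'p')
    print lam, '->', tag, '-> combined_lambda{}.h5'.format(tag)
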
Ejemplo n.º 25
0
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5') #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5') #, test=True)

    #data = np.concatenate((data1, data2))

    sigNumber = 0

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 1 TeV',
        2: 'Model A, m = 1.5 TeV',
        3: 'Model A, m = 2 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
        }

    histstyle[True] ['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars = [kNN_var]
    base_vars = ['lead_'+base_var, 'sub_'+base_var]
    kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    
    with Profile("Add variables"):
        from run.knn.common import add_knn, EFF as kNN_eff, MODEL as sigModel  # sigModel assumed to be the MODEL tag from run.knn.common
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_'+kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_'+kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)

    # Check variable distributions
        
    weight = 'weight'  # 'weight_test' / 'weight'
    scale = 139*1000000 # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber==0:
        msk_sig = data['signal'] == 1 
    else:
        msk_sig = data['sigType'] == sigNumber 


    knnBins = np.linspace(-100, 200, 75, endpoint=True)

    for var in kNN_vars:
        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)
        c2 = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=False)

        #h1_CR = c_tmp.hist(data.loc[msk_CR_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_CR_bkg, weight].values, normalise=False)
        #h2_CR = c_tmp.hist(data.loc[msk_CR_sig, var].values, bins=knnBins, weights=data.loc[msk_CR_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()
        #print "bkg. CR efficiency: ", h1_CR.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()
        #print "sig. CR efficiency: ", h2_CR.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3./2 + np.sqrt(h1_incl.GetEffectiveEntries()) )
        print "Sensitivity with no cut: ", normFactor

        ### sensitivity ###
        sensitivity = []
        bkg_eff_1jet = []
        i = 0
        for cut in knnBins:

            msk_pass = (data[kNN_vars[0]]>cut) & (data[kNN_vars[1]]>cut)
            msk_pass_1jet = data[kNN_vars[0]]>cut
            #msk_pass = (data[var]>cut)
            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass

            msk_bkg_pass1 = msk_bkg & msk_pass_1jet
            msk_sig_pass1 = msk_sig & msk_pass_1jet

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=knnBins, weights=data.loc[msk_sig_pass, weight].values, normalise=False)

            h1_pass1 = c_tmp.hist(data.loc[msk_bkg_pass1, var].values, bins=knnBins, weights=scale*data.loc[msk_bkg_pass1, weight].values, normalise=False)

            if ( h2_incl.GetEffectiveEntries()>0 ) : #and h1_pass.GetEffectiveEntries()>0) :
                sensitivity.append( ((h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()) / (3./2 + np.sqrt(h1_pass.GetEffectiveEntries()) )) / normFactor )
                #print "bkg. eff. @ " , cut, ": ", h1_pass.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()  
                #print "signal eff. @ ", cut, ": ", h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()
                #print "Sensitivity gain@ ", cut, ": ", ((h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()) / (3./2 + np.sqrt(h1_pass.GetEffectiveEntries())) ) / normFactor

            else: 
                sensitivity.append(0)

            if (h1_incl.GetEffectiveEntries()>0 ) :
                bkg_eff_1jet.append(h1_pass1.GetEffectiveEntries()/h1_incl.GetEffectiveEntries())
            else:
                bkg_eff_1jet.append(0)
                

            i = i+1

        #c.pads()[0].ylim(0,0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100,200)
        c.pads()[1].ylim(0,30)
        c.pads()[1].xlim(-100,200)
        c.pads()[1].graph( sensitivity, bins=knnBins) #, oob=False )

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff)) #latex(var, ROOT=True))
        c.xlabel("n_{trk}^{#epsilon}") #latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")#"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})/")
        c.pads()[1].text(["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})", 
                ], xmin=0.2, ymax=0.80, ATLAS=False)


        c2.graph(sensitivity, bkg_eff_1jet)
        c2.xlabel("Single jet #varepsilon_B")
        c2.ylabel("Sensitivity gain")
        c2.text(["#epsilon=0.5 %",], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))

        c2.save('figures/distributions/sensitivity_1jEfficiency_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        print 'figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff)
        pass
    

    # Plot also the normal ntrk distribution for cross check with Roland's result

    msk_bkg = data['signal'] == 0
    if sigNumber==0:
        msk_sig = data['signal'] == 1 # data['sigType'] == sigNumber #                             
    else:
        msk_sig = data['sigType'] == sigNumber # data['sigType'] == sigNumber #                    
    #msk_weight = data['weight']<0.0002
    #msk_bkg = msk_bkg & msk_pt & msk_m & msk_eta 
    #msk_sig = msk_sig & msk_pt & msk_m & msk_eta 


    baseBins = np.linspace(0, 200, 75, endpoint=True) #axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)

    for var in base_vars:
        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c.pads()[0].logy()

        c_tmp = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=baseBins, weights=data.loc[msk_sig, weight].values, normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=baseBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=baseBins, weights=scale*data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=baseBins, weights=data.loc[msk_sig, weight].values, normalise=False)


        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3./2 + np.sqrt(h1_incl.Integral()) )

        #print "Sensitivity with no cut: ", normFactor


        ### sensitivity ###
        sensitivity = []
        i = 0
        for cut in baseBins:
            #print cut

            msk_pass = (data[base_vars[0]]>cut) & (data[base_vars[1]]>cut) #
            #msk_pass = data[var]>cut

            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass
            
            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=baseBins, weights=scale*data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=baseBins, weights=data.loc[msk_sig_pass, weight].values, normalise=False)


            if ( h2_incl.Integral()>0 ): #and h1_pass.Integral()>0 ):
                sensitivity.append( (h2_pass.Integral()/h2_incl.Integral()) /  (3./2. + np.sqrt(h1_pass.Integral())) / normFactor )

                #print "signal eff.  at ", cut, ": ", (h2_pass.Integral()/h2_incl.Integral()) 
                #print "bkg eff.  at ", cut, ": ", (h1_pass.Integral()/h1_incl.Integral()) 
                #print "sensitivity gain at ", cut, ": ", (h2_pass.Integral()/h2_incl.Integral()) /  (3./2. + np.sqrt(h1_pass.Integral())) / normFactor

            else:
                sensitivity.append(0)

            i = i+1

        c.pads()[1].ylim(0,80)
        c.pads()[1].xlim(0,200)
        c.pads()[1].graph( sensitivity, bins=baseBins) #, oob=False )

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.xlabel("n_{trk}") #latex(var, ROOT=True))                                             
        c.pads()[1].ylabel("sensitivity gain") #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})")
        c.pads()[1].text(["sensitivity = #epsilon_{S}/(#frac{3}{2} + #sqrt{B})",
                ], xmin=0.2, ymax=0.80, ATLAS=False)

        ### Save ###
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        pass
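
The figure of merit scanned above is eps_S / (3/2 + sqrt(B)), normalised to its no-cut value; a numpy-only sketch with toy track multiplicities (the Poisson means below are assumptions):

import numpy as np

rng = np.random.RandomState(25)
ntrk_sig = rng.poisson(60, size=20000).astype(float)
ntrk_bkg = rng.poisson(30, size=200000).astype(float)

def sensitivity(cut):
    eps_s = (ntrk_sig > cut).mean()    # signal efficiency
    B = float((ntrk_bkg > cut).sum())  # background yield
    return eps_s / (1.5 + np.sqrt(B))

norm = sensitivity(-np.inf)  # no-cut reference
for cut in [20, 40, 60, 80]:
    print "cut > %2d: sensitivity gain = %.2f" % (cut, sensitivity(cut) / norm)
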
Ejemplo n.º 26
0
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input)

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, Model A, m = 2 TeV'

    # Add knn variables

    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]

    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]
    """
    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        for i in range(len(base_var)):
            add_knn(data, newfeat=kNN_var[i], path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL))
            print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL)
    """

    # Check variable distributions
    axes = {
        'jet_ungrtrk500': (50, 0, 100),
        #'lead_knn_ungrtrk500': (50, -100, 50),
        'jet_pt': (50, 0, 3000),
        'dijetmass': (50, 0, 7000),
    }

    scale = 139 * 1000000

    weight = 'weight'  # 'weight_test' / 'weight'
    msk_bkg = data['signal'] == 0  # @TEMP signal
    msk_sig = data['sigType'] == 1  # @TEMP signal
    #msk_weight = data['weight']<0.002
    #msk_bkg = msk_bkg & msk_weight

    #msk_CR = (data['lead_jet_ungrtrk500']<20) | (data['sub_jet_ungrtrk500']<20)

    ###### 3D histograms #######

    vary = 'jet_pt'
    varx = 'dijetmass'
    varz = 'jet_ungrtrk500'

    #for i,varx in enumerate(axisvars):
    #   for vary in axisvars[i+1:]:
    # Canvas
    can4 = rp.canvas(batch=True)
    pad = can4.pads()[0]._bare()
    pad.cd()
    pad.SetRightMargin(0.20)

    #can4 = ROOT.TCanvas("canvas", "", 800, 600)
    #can4.SetRightMargin(0.20)
    # Create, fill histogram
    h2_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "",
                       *(axes[varx] + axes[vary]))

    root_numpy.fill_hist(
        h2_bkg, data.loc[msk_bkg, [varx, vary]].values
    )  #, scale*data.loc[msk_bkg, weight].values)#*data.loc[msk_bkg, varz].values)

    #h2_bkg.Scale(1./h2_bkg.Integral())

    print "bkg. 2D integral: ", h2_bkg.Integral()

    # Draw
    h2_bkg.Draw("COLZ")

    # Decorations
    h2_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    #h2_bkg.GetZaxis().SetTitle(latex(varz, ROOT=True))
    #pad.SetLogz()
    #can4.zlim(0.0, 0.04)
    h2_bkg.GetZaxis().SetRangeUser(0.0, 300000)

    # Save
    can4.save('figures/distributions/3d_{}_{}_{}_bkg.pdf'.format(
        varx, vary, varz))
    can4.save('figures/distributions/3d_{}_{}_{}_bkg.eps'.format(
        varx, vary, varz))

    # ntrk distribution
    """ 
    can1 = rp.canvas(batch=True)
    bins1 = np.linspace(0, 150, 75)

    h_ungrB = can1.hist(data.loc[msk_bkg, 'lead_jet_ungrtrk500'].values, bins=bins1, weights=data.loc[msk_bkg, weight].values, label='ungrtrk, bkg', normalise=True, linecolor=ROOT.kGreen+2)

    h_ungeS = can1.hist(data.loc[msk_sig, 'lead_jet_ungrtrk500'].values, bins=bins1, weights=data.loc[msk_sig, weight].values, label='ungrtrk, sig', normalise=True, linecolor=ROOT.kGreen+2, linestyle=2)
    
    can1.legend(width=0.3, xmin=0.6, ymax=0.9)
    can1.save('figures/distributions/ungrtrk_dist.pdf')
    can1.save('figures/distributions/ungrtrk_dist.eps')


    # 2D histograms

    axisvars = sorted(list(axes))
    
    varx = 'lead_jet_ungrtrk500'
    vary = 'sub_jet_ungrtrk500'


    #for i,varx in enumerate(axisvars):
    #   for vary in axisvars[i+1:]:
    # Canvas
    can3 = ROOT.TCanvas()
    can3.SetRightMargin(0.20)
    
    # Create, fill histogram
    h2_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    h2_sig = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))

    root_numpy.fill_hist(h2_bkg, data.loc[msk_bkg, [varx, vary]].values, data.loc[msk_bkg, weight].values)
    root_numpy.fill_hist(h2_sig, data.loc[msk_sig, [varx, vary]].values, data.loc[msk_sig, weight].values)
    
    # Draw
    h2_bkg.Draw("COLZ")

    # Decorations
    h2_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can3.SetLogz()
    
    # Save
    can3.SaveAs('figures/distributions/2d_{}_{}_bkg.pdf'.format(varx, vary))
    can3.SaveAs('figures/distributions/2d_{}_{}_bkg.eps'.format(varx, vary))

    can6 = ROOT.TCanvas()
    can6.SetRightMargin(0.20)

    h2_sig.Draw("COLZ")

    # Decorations
    h2_sig.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_sig.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can6.SetLogz()
    
    # Save
    can6.SaveAs('figures/distributions/2d_{}_{}_sig.pdf'.format(varx, vary))
    can6.SaveAs('figures/distributions/2d_{}_{}_sig.eps'.format(varx, vary))

    ### Subleading vs. leading knn_ntrk

    varx = 'lead_knn_ungrtrk500'
    vary = 'sub_knn_ungrtrk500'


    # Canvas
    can4 = ROOT.TCanvas()
    can4.SetRightMargin(0.20)

    h2_C1_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_C1_bkg, data.loc[msk_bkg, [varx, vary]].values, 100. * data.loc[msk_bkg, weight].values)
    h2_C1_sig = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_C1_sig, data.loc[msk_sig, [varx, vary]].values, 100. * data.loc[msk_sig, weight].values)

    # Draw
    h2_C1_bkg.Draw("COLZ")

    # Decorations
    h2_C1_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_C1_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can4.SetLogz()

    can4.SaveAs('figures/distributions/2d_{}_{}_bkg.pdf'.format(varx, vary))
    can4.SaveAs('figures/distributions/2d_{}_{}_bkg.eps'.format(varx, vary))


    # Canvas
    can5 = ROOT.TCanvas()
    can5.SetRightMargin(0.20)

    # Draw
    h2_C1_sig.Draw("COLZ")

    # Decorations
    h2_C1_sig.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_C1_sig.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can5.SetLogz()

    can5.SaveAs('figures/distributions/2d_{}_{}_sig.pdf'.format(varx, vary))
    can5.SaveAs('figures/distributions/2d_{}_{}_sig.eps'.format(varx, vary))

    """

    return
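
The same occupancy map can be filled without ROOT; a numpy-only sketch of the weighted 2D histogram, with toy kinematics standing in for the dataframe columns:

import numpy as np

rng = np.random.RandomState(26)
mjj = rng.uniform(0., 7000., size=100000)    # toy 'dijetmass'
pt = rng.uniform(0., 3000., size=mjj.size)   # toy 'jet_pt'
w = rng.uniform(0.5, 1.5, size=mjj.size)     # toy per-event weight

h, xedges, yedges = np.histogram2d(mjj, pt, bins=(50, 50),
                                   range=((0., 7000.), (0., 3000.)),
                                   weights=w)
print "weighted integral: ", h.sum()
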
Ejemplo n.º 27
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input) #, test=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, (x,percs, err) = fill_profile_1D(data[msk_bkg])
    weights = 1/err

    # Add k-NN variable
    knnfeat = 'knn'
    orgfeat = VAR
    add_knn(data, newfeat=knnfeat, path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL)) 

    # Loading KNN classifier
    knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))
    #knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    X = x.reshape(-1,1)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8

        # Short-hands
        vbins, vmin, vmax = AXIS[VARX]

        # Re-binned bin edges  @TODO: Make standardised right away?
        # edges = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True), 
        #                  range(vbins + 1),
        #                  np.linspace(vmin, vmax,  vbins + 1,         endpoint=True))

        fineBins = np.linspace(vmin, vmax,  vbins*rebin + 1,         endpoint=True)
        orgBins = np.linspace(vmin, vmax,  vbins + 1,         endpoint=True)

        # Re-binned bin centres
        fineCentres = fineBins[:-1] + 0.5 * np.diff(fineBins)
        orgCentres = orgBins[:-1] + 0.5 * np.diff(orgBins)
        
        pass

        # Get predictions evaluated at re-binned bin centres
        if 'erf' in FIT:
            fit = func(fineCentres, knn[0], knn[1], knn[2])
            print "Check: ", func([1500, 2000], knn[0], knn[1], knn[2]) 
        else:
            fit = knn.predict(fineCentres.reshape(-1,1)) #centres.reshape(-1,1))

        # Fill ROOT "profile"
        profile_fit = ROOT.TH1F('profile_fit', "", len(fineBins) - 1, fineBins.flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        
        knn1 = PolynomialFeatures(degree=2)                                           
        X_poly = knn1.fit_transform(X)
        reg = LinearRegression(fit_intercept=False) #fit_intercept=False)
        reg.fit(X_poly, percs, weights)
        score = round(reg.score(X_poly, percs), 4)
        coef = reg.coef_
        intercept = reg.intercept_
        print "COEFFICIENTS: ", coef, intercept
        
        TCoef = ROOT.TVector3(coef[0], coef[1], coef[2]) 
        outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_{}.root".format(FIT, EFF, MIN_STAT, MODEL),"RECREATE")
        outFile.cd()
        TCoef.Write()
        profile_fit.SetName("kNNfit")
        profile_fit.Write()
        outFile.Close()

        # profile_meas2 = ROOT.TH1F('profile_meas', "", len(x) - 1, x.flatten('C'))
        # root_numpy.array2hist(percs, profile_meas2)
        profile_meas2 = ROOT.TGraph(len(x), x, percs) 
        pass


    # Plotting
    with Profile("Plotting"):
        # Plot
        plot(profile_meas2, profile_fit)
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency

    # MC weights are scaled with lumi. This is just for better comparison
    #if INPUT =="mc": 
    #    data.loc[:,'TotalEventWeight'] /=  139000000. 

    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Define arrays
        shape   = AXIS[VARX][0]
        bins    = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0]+ 1, endpoint=True)
        #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True)
        #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000])

        print "HERE: ", bins 
        
        #x, y = (np.zeros(shape) for _ in range(2))

        # Create `profile` histogram
        profile_knn = ROOT.TH1F('profile_knn', "", len(bins) - 1, bins)  # distinct ROOT names avoid clashes
        profile_org = ROOT.TH1F('profile_org', "", len(bins) - 1, bins)

        # Compute inclusive efficiency in bins of `VARX`
        effs = list()
        
        for i in range(shape):
            msk_bin  = (data[VARX] > bins[i]) & (data[VARX] <= bins[i+1])
            msk_pass =  data[knnfeat] > 0 # <?
            msk_pass_org =  data[orgfeat] > 70 # <?
            num = data.loc[msk & msk_bin & msk_pass, 'TotalEventWeight'].values.sum()
            num_org = data.loc[msk & msk_bin & msk_pass_org, 'TotalEventWeight'].values.sum()
            den = data.loc[msk & msk_bin,'TotalEventWeight'].values.sum()
            if den > 0:
                eff = num/den *100.
                eff_org = num_org/den *100.
                profile_knn.SetBinContent(i + 1, eff)
                profile_org.SetBinContent(i + 1, eff_org)
                effs.append(eff)
            #else:
            #print i, "Density = 0"
            pass

        c = rp.canvas(batch=True)
        leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)
        leg.AddEntry(profile_knn, "#it{n}_{trk}^{#varepsilon=%s%%} > 0" % ( EFF), "l")
        leg.AddEntry(profile_org, "#it{n}_{trk} > 70", "l")
        leg.Draw()

        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.10)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile_knn.SetLineColor(rp.colours[1])
        profile_org.SetLineColor(rp.colours[2])
        profile_knn.SetMarkerStyle(24)
        profile_knn.GetXaxis().SetTitle( "#it{m}_{jj} [GeV]" ) #latex(VARX, ROOT=True) + "[GeV]") #+ " = log(m^{2}/p_{T}^{2})")
        #profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True))# + " = log(m^{2}/p_{T}^{2})")
        profile_org.GetYaxis().SetTitle("Selection efficiency (%)") # for #it{n}_{trk}^{#varepsilon=%s%%}>0" % ( EFF))

        profile_knn.GetYaxis().SetNdivisions(505)
        #profile_knn.GetXaxis().SetNdivisions(505)
        profile_knn.GetXaxis().SetTitleOffset(1.4)
        profile_knn.GetYaxis().SetTitleOffset(1.8)
        profile_knn.GetXaxis().SetRangeUser(*XRANGE)
        profile_org.GetXaxis().SetRangeUser(*XRANGE)

        yrange = (0., EFF*3) #2.0 percent
        if yrange:
            profile_knn.GetYaxis().SetRangeUser(*yrange)
            profile_org.GetYaxis().SetRangeUser(*yrange)
            pass

        # Draw
        profile_org.Draw()
        profile_knn.Draw("same")

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.pdf'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL+INPUT, MIN_STAT))
        #c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.png'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL, MIN_STAT))
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.eps'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL+INPUT, MIN_STAT))
        del c
        
        pass

    return
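
The fit above is an error-weighted quadratic through the percentile points; a self-contained sklearn sketch with toy inputs (coefficients and noise level are assumptions):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

x = np.linspace(1000., 7000., 13)
rng = np.random.RandomState(27)
percs = 40. + 0.01 * x - 5e-7 * x**2 + rng.normal(0., 0.5, x.size)
err = np.full(x.size, 0.5)

X_poly = PolynomialFeatures(degree=2).fit_transform(x.reshape(-1, 1))
reg = LinearRegression(fit_intercept=False)  # the constant column is already in X_poly
reg.fit(X_poly, percs, sample_weight=1. / err)
print "coefficients (c0, c1, c2): ", reg.coef_
print "R^2: ", round(reg.score(X_poly, percs), 4)
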
Ejemplo n.º 28
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Common definitions
    experiment = 'classifier'
    paths = sorted(glob.glob(
        'optimisation/{}/output/*.out'.format(experiment)))

    num_steps = 100

    # Loop all run outputs
    means, stds, results = list(), list(), list()
    for path in paths[:num_steps]:

        # Run-log
        with open(path, 'r') as f:
            lines = [l for l in f]

            # Number of training epochs, to identify the last one
            num_epochs = max(
                map(
                    int,
                    map(
                        lambda l: l.split('/')[-1],
                        filter(lambda l: re.search('^Epoch [\d]+/[\d]+ *$', l),
                               lines))))

            # Indices of line holding the results for the last training epoch in each CV fold
            try:
                indices = np.array(
                    zip(*filter(
                        lambda t: 'Epoch {e}/{e}'.format(e=num_epochs) in t[1],
                        enumerate(lines)))[0]) + 1
            except IndexError:
                continue

            # Validation losses for each CV fold
            val_losses = list()
            for idx in indices:
                fields = lines[idx].split()
                jdx = fields.index('val_loss:') + 1
                val_loss = float(fields[jdx])
                val_losses.append(val_loss)
                pass

            # Append results for current evaluation
            means.append(np.mean(val_losses))
            stds.append(np.std(val_losses))
            pass
        pass

    # Check losses
    print "Optimisation metrics, sorted by mean + 1 sigma, for robustness:"
    for i, m, s in sorted(zip(range(len(means)), means, stds),
                          key=lambda t: t[1] + t[2]):
        print "  [{:3d}] {:7.4f} ± {:6.4f}".format(i + 1, m, s)
        pass

    # Compute running, best mean
    means = np.array(means)
    stds = np.array(stds)
    bins = np.arange(len(means), dtype=np.float) + 1
    best_mean = np.array([np.min(means[:i + 1]) for i in range(len(means))])
    idx_improvements = [
        0
    ] + list(np.where(np.abs(np.diff(best_mean)) > 0)[0] + 1)

    # Create graph
    graph = TGraphErrors(len(bins), bins, means, bins * 0, stds)

    # Plot
    plot(experiment, means, graph, idx_improvements, best_mean, bins)

    return 0
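
The running-best bookkeeping used for the improvement markers is compact enough to check on toy numbers:

import numpy as np

means = np.array([0.52, 0.49, 0.50, 0.47, 0.48, 0.47, 0.45])
best_mean = np.array([np.min(means[:i + 1]) for i in range(len(means))])
idx_improvements = [0] + list(np.where(np.abs(np.diff(best_mean)) > 0)[0] + 1)
print "running best: ", best_mean
print "improved at evaluations: ", idx_improvements
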
Ejemplo n.º 29
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', test_full_signal=True)

    #variable = VAR_TAU21
    variable = VAR_N2
    #variable = VAR_DECDEEP
    #variable = VAR_DEEP

    if variable == VAR_N2:
        fit_range = FIT_RANGE_N2
    elif variable == VAR_TAU21:
        fit_range = FIT_RANGE_TAU21
    elif variable == VAR_DECDEEP:
        fit_range = FIT_RANGE_DECDEEP
    elif variable == VAR_DEEP:
        fit_range = FIT_RANGE_DEEP
    else:
        print "invalid variable"
        return 0

    # Add DDT variable
    add_ddt(data, feat=variable, path='models/ddt/ddt_{}.pkl.gz'.format(variable))

    # Load transform
    ddt = loadclf('models/ddt/ddt_{}.pkl.gz'.format(variable))

    # --------------------------------------------------------------------------
    # 1D plot

    # Define variable(s)
    msk = data['signal'] == 0

    # Fill profiles
    profiles = dict()
    for var in [variable, variable + 'DDT']:
        profiles[var] = fill_profile(data[msk], var)
        pass

    # Convert to graphs
    graphs = dict()
    for key, profile in profiles.iteritems():
        # Create arrays from profile
        arr_x, arr_y, arr_ex, arr_ey = array('d'), array('d'), array('d'), array('d')
        for ibin in range(1, profile.GetXaxis().GetNbins() + 1):
            if profile.GetBinContent(ibin) != 0. or profile.GetBinError(ibin) != 0.:
                arr_x .append(profile.GetBinCenter (ibin))
                arr_y .append(profile.GetBinContent(ibin))
                arr_ex.append(profile.GetBinWidth  (ibin) / 2.)
                arr_ey.append(profile.GetBinError  (ibin))
                pass
            pass

        # Create graph
        graphs[key] = ROOT.TGraphErrors(len(arr_x), arr_x, arr_y, arr_ex, arr_ey)
        pass

    # Plot 1D transform
    plot1D(graphs, ddt, arr_x, variable, fit_range)


    # --------------------------------------------------------------------------
    # 2D plot

    # Create contours
    binsx = np.linspace(1.5, 5.0, 40 + 1, endpoint=True)
    if variable == VAR_N2:
        binsy = np.linspace(0.0, 0.8, 40 + 1, endpoint=True)
    else:
        binsy = np.linspace(0.0, 1.4, 40 + 1, endpoint=True)

    contours = dict()
    for sig in [0,1]:

        # Get signal/background mask
        msk = data['signal'] == sig

        # Normalise jet weights
        w  = data.loc[msk, VAR_WEIGHT].values
        w /= math.fsum(w)

        # Prepare inputs
        X = data.loc[msk, [VAR_RHODDT, variable]].values

        # Fill, store contour
        contour = ROOT.TH2F('2d_{}'.format(sig), "", len(binsx) - 1, binsx, len(binsy) - 1, binsy)
        root_numpy.fill_hist(contour, X, weights=w)
        contours[sig] = contour
        pass

    # Linear discriminant analysis (LDA)
    lda = LinearDiscriminantAnalysis()
    X = data[[VAR_RHODDT, variable]].values
    y = data['signal'].values
    w = data[VAR_WEIGHT].values
    p = w / math.fsum(w)
    indices = np.random.choice(y.shape[0], size=int(1E+06), p=p, replace=True)
    lda.fit(X[indices], y[indices])  # Fit weighted sample

    # -- Linear fit to decision boundary
    xx, yy = np.meshgrid(binsx, binsy)
    Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    yboundary = binsy[np.argmin(np.abs(Z - 0.5), axis=0)]
    xboundary = binsx
    boundary = LinearRegression()  # straight-line fit to the decision boundary
    boundary.fit(xboundary.reshape(-1,1), yboundary)

    # Plot 2D scatter
    plot2D(data, ddt, boundary, contours, binsx, binsy, variable)
    return
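
A self-contained sketch of the boundary-extraction trick above: fit an LDA, locate the P = 0.5 contour on the (rhoDDT, tagger) grid, and fit a straight line to it. The two Gaussian classes are toy stand-ins for signal and background:

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(29)
X0 = rng.normal([3.0, 0.5], 0.3, size=(5000, 2))  # toy background (rhoDDT, tagger)
X1 = rng.normal([2.5, 0.3], 0.3, size=(5000, 2))  # toy signal
X = np.vstack([X0, X1])
y = np.r_[np.zeros(len(X0)), np.ones(len(X1))]

lda = LinearDiscriminantAnalysis().fit(X, y)

binsx = np.linspace(1.5, 5.0, 41)
binsy = np.linspace(0.0, 0.8, 41)
xx, yy = np.meshgrid(binsx, binsy)
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
yboundary = binsy[np.argmin(np.abs(Z - 0.5), axis=0)]  # P = 0.5 contour per x-bin

line = LinearRegression().fit(binsx.reshape(-1, 1), yboundary)
print "boundary: y = %.3f x + %.3f" % (line.coef_[0], line.intercept_)
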
Ejemplo n.º 30
0
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  test_full_signal=True)

    #data, features, _ = load_data(args.input + 'data.h5', train_full_signal=True)  #for faster checking, don't use for actual comparison

    # Common definitions
    # --------------------------------------------------------------------------

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title="decDeep"

    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN','CSS']}; title='ATLAS2'
    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN',], 'decDeepWvsQCD':['','kNN'], 'DeepWvsQCD':['','kNN']}; title='Deep_vs_Analytic'
    #tagger_features = {'tau21':[''], 'N2_B1':[''], 'decDeepWvsQCD':[''], 'DeepWvsQCD':['']}; title='Deep_Check2'
    tagger_features = {
        'tau21': ['', 'DDT', 'kNN', 'CSS'],
        'N2_B1': ['', 'DDT', 'kNN', 'CSS']
    }
    title = 'Corrected_Full_Analytic'
    #tagger_features = {'tau21':['', 'DDT', 'kNN', 'CSS'], 'N2_B1':['', 'DDT', 'kNN','CSS']}; title='Full_Analytic_vs_Atlas'

    extracted_features = []
    for basevar in tagger_features.keys():
        for suffix in tagger_features[basevar]:
            extracted_features.append(basevar + suffix)

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # the selections of which variables to add could also be automated from the tagger_features list...

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        # N2DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        ## DeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        # Tau21-kNN
        from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar,
                                                       'tau_{21}-k#minusNN')
        add_knn(data,
                feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        # N2-kNN
        from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar,
                                                       'N_{2}-kNN')
        add_knn(data,
                feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'decDeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## DeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'DeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # Tau21-CSS
        from run.css.common import add_css
        add_css("tau21", data)

        # N2-CSS
        from run.css.common import add_css
        add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        ## DeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("DeepWvsQCD", data)

        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])
    used_variables = set(extracted_features +
                         ['m', 'pt', 'weight_test', 'npv'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop() without inplace returns a copy
    gc.collect()

    # Perform performance studies
    perform_studies(data,
                    args,
                    tagger_features,
                    extracted_features,
                    title=title)
    return 0
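
The expansion of the {base: suffixes} collection into flat column names can be written as a single comprehension; equivalent to the loop above:

tagger_features = {
    'tau21': ['', 'DDT', 'kNN', 'CSS'],
    'N2_B1': ['', 'DDT', 'kNN', 'CSS'],
}
extracted_features = [base + suffix
                      for base, suffixes in tagger_features.items()
                      for suffix in suffixes]
print extracted_features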