def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', train=True, background=True)

    #variable = "tau21";         bins = TAU21BINS
    variable = "N2_B1"
    bins = N2BINS
    #variable = "decDeepWvsQCD"; bins = DECDEEPBINS
    #variable = "DeepWvsQCD";    bins = DEEPBINS

    # Add CSS variable
    add_css(variable, data)

    # Plot CSS distributions for each mass bin
    plot_distributions(data, variable, bins)

    return 0
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # Fill tau21 profile
    profile = fill_profile(data, VAR_TAU21)

    # Fit profile
    fit = ROOT.TF1('fit', 'pol1', *FIT_RANGE)
    profile.Fit('fit', 'RQ0')
    intercept_val, coef_val = fit.GetParameter(0), fit.GetParameter(1)
    intercept_err, coef_err = fit.GetParError(0), fit.GetParError(1)

    # Create scikit-learn transform
    ddt = LinearRegression()
    ddt.coef_ = np.array([coef_val])
    ddt.intercept_ = np.array([-coef_val * FIT_RANGE[0]])
    ddt.offset_ = np.array([coef_val * FIT_RANGE[0] + intercept_val])

    print "Fitted function:"
    print "  intercept: {:7.4f} ± {:7.4f}".format(intercept_val, intercept_err)
    print "  coef:      {:7.4f} ± {:7.4f}".format(coef_val, coef_err)

    # Save DDT transform
    saveclf(ddt, 'models/ddt/ddt.pkl.gz')

    return 0
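# A minimal sketch (not part of the original script) of how the saved DDT
# transform could be applied downstream. The `loadclf` helper and the column
# names 'm', 'pt', 'tau21' are assumed from the surrounding repository code;
# the canonical implementation lives in run.ddt.common.add_ddt.
import numpy as np

def add_ddt_sketch(data, path='models/ddt/ddt.pkl.gz', feat='tau21'):
    ddt = loadclf(path)
    # rho = log(m^2 / pT^2), the variable the linear fit was performed in
    rho = np.log(np.square(data['m']) / np.square(data['pt']))
    # Subtract the fitted slope contribution; `intercept_` was set above such
    # that predict(rho) = coef * (rho - FIT_RANGE[0])
    data[feat + 'DDT'] = data[feat].values - ddt.predict(rho.values.reshape(-1, 1))
    return data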
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', sample=0.01)  # @TEMP

    # Define classifier configuration(s)
    pattern = 'uboost_ur_{:4.2f}_te_92_rel21_fixed'
    urs = sorted([0.0, 0.01, 0.1, 0.3])
    classifiers = [
        ('AdaBoost' if ur == 0 else 'uBoost (#alpha={:4.2f})'.format(ur),
         pattern.format(ur).replace('.', 'p'))
        for ur in urs
    ]

    # Compute classifier variables in parallel
    njobs = min(7, len(classifiers))
    with Profile("Run tests in parallel"):
        ret = Parallel(n_jobs=njobs)(
            delayed(compute)(data, name) for _, name in classifiers)
        pass

    # Add classifier variables to data
    for name, staged_series in ret:
        for stage, series in enumerate(staged_series):
            data['{:s}__{:d}'.format(name, stage)] = series
            pass
        pass

    # Plot learning curves
    plot(data, urs, classifiers)

    return 0
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input)  # train=True removed, since we use the data file

    # -------------------------------------------------------------------------
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 10%
    eff_sig = 0.10
    fpr, tpr, thresholds = roc_curve(data['signal'], data[VAR],
                                     sample_weight=data['weight'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(
        eff_sig * 100., fpr[idx] * 100., VAR, thresholds[idx])  # changed from 1 - fpr[idx]
    #print "Signal efficiency @ {:.2f}% bkg. acc.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., fpr[idx] * 100., VAR, thresholds[idx])
    print "Chosen target efficiency: {:.2f}%".format(EFF)

    # Fill profile, background only
    data = data[data['signal'] == 0]
    profile_meas, (x, y, z) = fill_profile(data)

    # Format arrays
    X = np.vstack((x.flatten(), y.flatten())).T
    Y = z.flatten()

    # Fit kNN regressor
    print "Debug: X.shape = {}, Y.ndim = {}".format(X.shape, Y.ndim)
    knn = KNeighborsRegressor(weights='distance')
    knn.fit(X, Y)

    # Save kNN regressor
    saveclf(knn, 'models/knn/knn_{:s}_{}_{}.pkl.gz'.format(VAR, EFF, MODEL))

    return 0
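# Sketch (assumed, mirroring how run.knn.common.add_knn is used elsewhere in
# this repository): the saved regressor predicts the EFF-percentile cut value
# of VAR in the (x, y) plane, so a locally decorrelated feature is the observed
# value minus the predicted threshold. Column names here are hypothetical.
def add_knn_sketch(data, knn, varx, vary, var, newfeat='knn'):
    X = np.vstack((data[varx].values, data[vary].values)).T
    # Positive values then mean "passes the local percentile cut"
    data[newfeat] = data[var].values - knn.predict(X)
    return data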
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    #data, _, _ = load_data(args.input + 'data.h5', train=True)
    data, _, _ = load_data(args.input + 'data.h5', train_full_signal=True)

    variable = VAR_TAU21
    signal_above = False
    bg_eff = TAU21_EFF
    #variable = VAR_N2;      signal_above = False; bg_eff = N2_EFF
    #variable = VAR_DECDEEP; signal_above = True;  bg_eff = DECDEEP_EFF
    #variable = VAR_DEEP;    signal_above = True;  bg_eff = DEEP_EFF

    ## Training on a list of working points:
    #for bg_eff in WORKING_POINTS:
    #    train(data, variable, bg_eff, signal_above=signal_above)
    #print "reached end of main()"
    #return 0

    # -------------------------------------------------------------------------
    # (Disabled) Optional NN-based initialisation: initialise the Keras backend
    # and config, load 'models/adversarial/classifier/full/classifier.h5', and
    # add an 'NN' variable via run.adversarial.common.add_nn.
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 50%
    eff_sig = 0.5
    fpr, tpr, thresholds = roc_curve(data['signal'], data[variable],
                                     sample_weight=data['weight_test'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} < {:.2f})".format(
        eff_sig * 100., (1 - fpr[idx]) * 100., variable, thresholds[idx])
    print "Chosen target efficiency: {:.2f}%".format(bg_eff)

    # @NOTE: If the signal sits above the background in `variable`, the
    #        background efficiency should arguably be taken as (100 - bg_eff).
    train(data, variable, bg_eff, signal_above=signal_above)

    print "reached end of main()"
    return 0
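# Following the @NOTE above: a sketch (assumption, not repository code) of how
# the `signal_above` convention could be made explicit before calling train():
#
#   effective_bg_eff = (100 - bg_eff) if signal_above else bg_eff
#   train(data, variable, effective_bg_eff, signal_above=signal_above)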
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    # Perform optimisation of the substructure profile
    perform_optimisation("D2", D2BINS, data)

    return
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', train=True, background=True)

    #variable = VAR_TAU21
    variable = VAR_N2
    #variable = VAR_DECDEEP
    #variable = VAR_DEEP

    # Fill variable profile
    profile = fill_profile(data, variable)

    # Fit profile
    if variable == VAR_N2:
        fit_range = FIT_RANGE_N2
    elif variable == VAR_TAU21:
        fit_range = FIT_RANGE_TAU21
    elif variable == VAR_DECDEEP:
        fit_range = FIT_RANGE_DECDEEP
    elif variable == VAR_DEEP:
        fit_range = FIT_RANGE_DEEP
    else:
        print "Invalid variable: {}".format(variable)
        return 0

    fit = ROOT.TF1('fit', 'pol1', *fit_range)
    profile.Fit('fit', 'RQ0')
    intercept_val, coef_val = fit.GetParameter(0), fit.GetParameter(1)
    intercept_err, coef_err = fit.GetParError(0), fit.GetParError(1)

    # Create scikit-learn transform
    ddt = LinearRegression()
    ddt.coef_ = np.array([coef_val])
    ddt.intercept_ = np.array([-coef_val * fit_range[0]])
    ddt.offset_ = np.array([coef_val * fit_range[0] + intercept_val])

    print "Fitted function:"
    print "  intercept: {:7.4f} ± {:7.4f}".format(intercept_val, intercept_err)
    print "  coef:      {:7.4f} ± {:7.4f}".format(coef_val, coef_err)

    # Save DDT transform
    saveclf(ddt, 'models/ddt/ddt_{}.pkl.gz'.format(variable))

    print "got to the end of main()"
    return 0
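# The if/elif chain above could equally be a table lookup; a small sketch
# using the constant names from the script itself:
FIT_RANGES = {
    VAR_N2:      FIT_RANGE_N2,
    VAR_TAU21:   FIT_RANGE_TAU21,
    VAR_DECDEEP: FIT_RANGE_DECDEEP,
    VAR_DEEP:    FIT_RANGE_DEEP,
}
# fit_range = FIT_RANGES[variable]  # KeyError replaces the "invalid variable" branch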
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    # Perform optimisation of the substructure profiles
    perform_optimisation("tau21", TAU21BINS, data)
    perform_optimisation("N2_B1", N2BINS, data)
    #perform_optimisation("decDeepWvsQCD", DECDEEPBINS, data)
    #perform_optimisation("DeepWvsQCD", DEEPBINS, data)

    return
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # Add CSS variable
    var = "D2"
    add_css(var, data)

    # Plot D2(CSS) distributions for each mass bin
    plot_distributions(data, var)

    return 0
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5')
    msk = data['train'] == 1

    features = filter(lambda s: s.startswith('fjet_'), list(data))

    X = data[features]
    y = data['signal']
    w = data['mcEventWeight']

    dtrain = xgb.DMatrix(X[msk],  label=y[msk],  weight=w[msk])
    dtest  = xgb.DMatrix(X[~msk], label=y[~msk], weight=w[~msk])

    param = {
        'max_depth': 4,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic',
    }
    num_round = 100
    bst = xgb.train(param, dtrain, num_round)

    # Make prediction
    preds = bst.predict(dtest)

    # Print feature importances, highest first
    importance = bst.get_fscore()
    for name, score in sorted(list(importance.iteritems()), key=lambda t: t[1], reverse=True):
        print "  {:15s}: {:4.1f}".format(name, score)
        pass

    return 0
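# Follow-on sketch (not in the original script): persist the booster and score
# the held-out split. `Booster.save_model` and `roc_auc_score` are standard
# xgboost/scikit-learn calls; the output path is hypothetical.
def evaluate_and_save_sketch(bst, dtest, y_test, w_test):
    """Sketch: save the trained booster and report the weighted test AUC."""
    from sklearn.metrics import roc_auc_score
    bst.save_model('models/xgb/fjet.model')  # hypothetical output path
    preds = bst.predict(dtest)
    return roc_auc_score(y_test, preds, sample_weight=w_test)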
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Common definitions
    num_folds = 3

    # Perform classifier loss study
    plot_classifier_training_loss(num_folds)

    # Compute entropy of decorrelation variable posterior
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)
    decorrelation = get_decorrelation_variables(data)
    H_prior = entropy(decorrelation, weights=data['weight_adv'])
    print "Entropy of prior: {}".format(H_prior)

    # Perform adversarial loss study
    for lambda_reg in [10, 100]:
        plot_adversarial_training_loss(lambda_reg, num_folds, 10, H_prior)
        pass

    return 0
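# The repository's `entropy` helper is not shown in this excerpt; for a
# one-dimensional decorrelation variable, the printed quantity could be
# estimated as below (differential entropy from a weighted, normalised
# histogram; a sketch, not the actual implementation):
import numpy as np

def weighted_entropy_sketch(x, weights=None, bins=100):
    # H = -sum_i p_i * log(p_i) * dx, with p_i the estimated bin densities
    p, edges = np.histogram(x, bins=bins, weights=weights, density=True)
    dx = np.diff(edges)
    msk = p > 0  # avoid log(0)
    return -np.sum(p[msk] * np.log(p[msk]) * dx[msk])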
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 100.
    lambda_regs = sorted([100.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass
    ann_var = ann_vars[lambda_regs.index(lambda_reg)]
    print "ann_var: {}".format(ann_var)

    # Tagger feature collection
    #tagger_features = ['NN', ann_var]
    tagger_features = ['NN', ann_var, 'MV2c10', 'XbbScoreHiggs']
    #tagger_features = ['MV2c10']

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model('models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(gmm_dimensions=len(DECORRELATION_VARIABLES),
                                        **cfg['adversary']['model'])
            combined = combined_model(classifier, adversary, **cfg['combined']['model'])
            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights('models/adversarial/combined/full/combined_lambda{}.h5'.format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # MV2c10: take the smaller of the two sub-jet discriminants
        with Profile("MV2c10"):
            data["MV2c10"] = pd.concat([data["MV2c10_discriminant_1"],
                                        data["MV2c10_discriminant_2"]], axis=1).min(axis=1)
            # @TODO: Add MV2 and XbbScore here, e.g. min(MV2_sj1, MV2_sj2)
            pass
        pass

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars + ['mass', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop in place to actually free memory
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars)

    return 0
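# A hypothetical helper for the min(MV2_sj1, MV2_sj2) TODO above, so the same
# pattern can cover other per-sub-jet discriminants (column names assumed):
def min_subjet_discriminant(data, base, n_subjets=2):
    cols = ['{}_{:d}'.format(base, i + 1) for i in range(n_subjets)]
    return data[cols].min(axis=1)

# Usage: data['MV2c10'] = min_subjet_discriminant(data, 'MV2c10_discriminant')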
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # -------------------------------------------------------------------------
    # (Disabled) Optional NN-based initialisation: initialise the Keras backend
    # and config, load 'models/adversarial/classifier/full/classifier.h5', and
    # add an 'NN' variable via run.adversarial.common.add_nn.
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data)

    # Load kNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Fill fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                                  range(vbins + 1),
                                  np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    return
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input)  # train=True removed, since we use the data file

    # -------------------------------------------------------------------------
    # (Disabled) Optional NN-based initialisation: initialise the Keras backend
    # and config, load 'models/adversarial/classifier/full/classifier.h5', and
    # add an 'NN' variable via run.adversarial.common.add_nn.
    # -------------------------------------------------------------------------

    # Compute background efficiency at sig. eff. = 10%
    eff_sig = 0.10
    fpr, tpr, thresholds = roc_curve(data['signal'], data[VAR],
                                     sample_weight=data['TotalEventWeight'])
    idx = np.argmin(np.abs(tpr - eff_sig))
    print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(
        eff_sig * 100., fpr[idx] * 100., VAR, thresholds[idx])  # changed from 1 - fpr[idx]
    #print "Signal efficiency @ {:.2f}% bkg. acc.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., fpr[idx] * 100., VAR, thresholds[idx])
    print "Chosen target efficiency: {:.2f}%".format(EFF)

    # Fill profile, background only
    data = data[data['signal'] == 0]
    profile_meas, (x, y, err) = fill_profile_1D(data)

    # Format arrays
    X = x.reshape(-1, 1)
    weights = 1. / err
    print X

    # Fit regressor according to the chosen FIT type
    if 'knn1D' == FIT:
        knn = KNeighborsRegressor(5, weights='distance')
        knn.fit(X, y)
    elif 'knn1D_v2' in FIT:
        knn = KNeighborsRegressor(5, weights='uniform')
        knn.fit(X, y)
    elif 'knn1D_v3' in FIT:
        knn = KNeighborsRegressor(2, weights='uniform')
        knn.fit(X, y)
    elif 'knn1D_v4' in FIT:
        knn = KNeighborsRegressor(3, weights='distance')
        knn.fit(X, y)
    elif 'poly2' in FIT:
        knn = make_pipeline(PolynomialFeatures(degree=2), Ridge())
        knn.fit(X, y)
    elif 'poly3' in FIT:
        knn = make_pipeline(PolynomialFeatures(degree=3), Ridge())
        knn.fit(X, y)
    elif 'lin' in FIT:
        # Create scikit-learn transform
        knn = LinearRegression()
        knn.fit(X, y, weights)
    elif 'erf' in FIT:
        # Fit an error-function turn-on; here `knn` holds the fitted parameters
        knn, pcov = curve_fit(func, x, y, p0=[73, 0.0004, 2000])
        print "ERF: ", knn
    else:
        print "Unknown FIT type chosen"

    #coef_val = np.polyfit(x, y, deg=1, w=weights)
    #knn.coef_ = np.array([coef_val[0]])
    #knn.intercept_ = np.array([coef_val[1]])  #[-coef_val[0] * FIT_RANGE[0]])
    #knn.offset_ = np.array([coef_val[0] * FIT_RANGE[0] + coef_val[1]])

    # @NOTE: The summary below assumes a linear model; it will fail for the
    #        kNN, polynomial-pipeline, and erf options, which do not expose
    #        `coef_`/`intercept_` directly.
    print "Fitted function:"
    print "  coef:      {}".format(knn.coef_)
    print "  intercept: {}".format(knn.intercept_)

    # Save fitted transform
    saveclf(knn, 'models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    # Save fit parameters to a ROOT file
    #TCoef = ROOT.TVector3(coef[0], coef[1], coef[2])
    #outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_data.root".format(FIT, EFF, MIN_STAT), "RECREATE")
    #outFile.cd()
    #TCoef.SetName("coefficients")
    #TCoef.Write()
    #outFile.Close()

    return 0
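# `func` for the 'erf' option is not defined in this excerpt. A plausible
# three-parameter sketch, consistent in shape with the starting values
# p0 = [73, 0.0004, 2000] (plateau, slope, turn-on in GeV), might be:
from scipy.special import erf

def func_sketch(x, a, b, c):
    # Hypothetical error-function turn-on: plateau `a`, slope `b`, onset `c`
    return a * erf(b * (x - c))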
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data('data/' + args.input, test=True)

    # Common definitions
    # --------------------------------------------------------------------------

    # -- k-nearest neighbour
    #kNN_var = 'D2-k#minusNN'
    #kNN_var = 'C1_02-knn'
    #base_var = 'sub_jet_ntrk'
    #kNN_var = base_var.replace('sub_jet_', '') + '-knn'
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_var = ['jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]
    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    """
    # -- Adversarial neural network (ANN) scan
    lambda_reg  = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars    = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass
    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur  = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var  = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)
    """

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['lead_jet_C1_02', kNN_var]
    tagger_features = ['lead_' + base_var, 'lead_' + kNN_var,
                       'sub_'  + base_var, 'sub_'  + kNN_var]
    #tagger_features = base_var + kNN_var

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):
        #for i in range(len(base_var)):
        from run.knn.common import add_knn, MODEL as sigModel, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        pass

    # Remove unused variables
    used_variables = set(tagger_features + ['lead_jet_m', 'lead_jet_pt', 'dijetmass', 'weight'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features)

    return 0
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    mc, features, _   = load_data('data/djr_LCTopo_2.h5')     #, test=True)
    data, features, _ = load_data('data/djr_LCTopo_data.h5')  #, test=True)

    histstyle[True]['label']  = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, Model A, m = 2 TeV'

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = ['lead_'+base_var, 'sub_'+base_var]
    #kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    """
    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_'+kNN_var,  path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(mc,   newfeat='lead_'+kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(mc,   newfeat='sub_'+kNN_var,  path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
    """
    #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

    bins_pt = np.linspace(450, 5000, 50)

    # Useful masks
    msk_bkg_data = data['signal'] == 0
    msk_bkg_mc = (mc['signal'] == 0)  #& (mc['weight'] < 0.0002)
    msk_sig_mc = (mc['signal'] == 1)  #& (mc['weight'] < 0.0002)
    msk_CR = (mc['lead_jet_ungrtrk500'] < 20) | (mc['sub_jet_ungrtrk500'] < 20)

    scale = 139 * 1000000  # inverse nanobarn

    # pT distribution
    c = rp.canvas(batch=True)
    hist_incl_data = c.hist(data.loc[msk_bkg_data, 'jet_pt'].values, bins=bins_pt,
                            weights=data.loc[msk_bkg_data, 'weight'].values,
                            label="Data, control region", normalise=False,
                            linecolor=ROOT.kGreen + 2)
    hist_incl_mc = c.hist(mc.loc[msk_bkg_mc, 'sub_jet_pt'].values, bins=bins_pt,
                          weights=scale * mc.loc[msk_bkg_mc, 'weight'].values,
                          label="MC, scaled with lumi", normalise=False,
                          linecolor=ROOT.kViolet + 2)
    hist_incl_sig = c.hist(mc.loc[msk_sig_mc, 'sub_jet_pt'].values, bins=bins_pt,
                           weights=mc.loc[msk_sig_mc, 'weight'].values,
                           label="Combined Signal", normalise=False,
                           linecolor=ROOT.kOrange + 2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("Sub-leading jet pT [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    #c.save('figures/distributions/mjj_Bkg_CR20.pdf')
    #c.save('figures/distributions/mjj_Bkg_CR20.eps')
    c.save('figures/distributions/sub_pt_bkg_data_mc.pdf')
    c.save('figures/distributions/sub_pt_bkg_data_mc.eps')

    print "Data bkg effective entries: ", hist_incl_data.GetEffectiveEntries()
    print "MC bkg effective entries: ", hist_incl_mc.GetEffectiveEntries()
    print "Data bkg integral: ", hist_incl_data.Integral()
    print "MC bkg integral: ", hist_incl_mc.Integral()

    del c

    c = rp.canvas(batch=True)
    hist_bkg_CR = c.hist(mc.loc[(msk_bkg_mc & msk_CR), 'lead_jet_pt'].values, bins=bins_pt,
                         weights=scale * mc.loc[(msk_bkg_mc & msk_CR), 'weight'].values,
                         label="MC, control region", normalise=False,
                         linecolor=ROOT.kGreen + 2)
    hist_sig_CR = c.hist(mc.loc[(msk_sig_mc & msk_CR), 'lead_jet_pt'].values, bins=bins_pt,
                         weights=mc.loc[(msk_sig_mc & msk_CR), 'weight'].values,
                         label="MC, control region", normalise=False,
                         linecolor=ROOT.kGreen + 2)

    print "CR sig contamination (eff. entries): ", hist_sig_CR.GetEffectiveEntries() / (hist_bkg_CR.GetEffectiveEntries() + hist_sig_CR.GetEffectiveEntries())
    print "CR sig contamination (integral): ", hist_sig_CR.Integral() / (hist_bkg_CR.Integral() + hist_sig_CR.Integral())
    print "CR sig efficiency (eff. entries): ", hist_sig_CR.GetEffectiveEntries() / hist_incl_sig.GetEffectiveEntries()
    print "CR sig efficiency (integral): ", hist_sig_CR.Integral() / hist_incl_sig.Integral()
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    pt_bins = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = zip(pt_bins[:-1], pt_bins[1:])
    bins = np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True)
    for pt_bin in pt_bins:
        histstyle[True]['label']  = 'Inclusive'
        histstyle[False]['label'] = 'p_{{T}} #in [{:.0f}, {:.0f}] GeV'.format(*pt_bin)

        # Canvas
        c = rp.canvas(batch=True)

        # Plots
        msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        c.hist(data['m'].values,      bins=bins, weights=data['weight_adv'].values,
               normalise=True, **histstyle[True])
        c.hist(data['m'].values[msk], bins=bins, weights=data['weight_adv'].values[msk],
               normalise=True, **histstyle[False])
        c.hist(data['m'].values[msk], bins=bins, weights=data['weight_test'].values[msk],
               normalise=True, label="Testing weight", linewidth=2, linecolor=ROOT.kGreen)

        # Decorations
        c.legend()
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Fraction of jets")

        # Save
        c.save('figures/temp_mass_pT{:.0f}_{:.0f}.pdf'.format(*pt_bin))
        pass

    return  # @NOTE: Everything below is currently unreachable.

    # Perform selection  @NOTE: For Rel. 20.7 only
    #data = data[(data['m']  >  50) & (data['m']  <  300)]
    #data = data[(data['pt'] > 200) & (data['pt'] < 2000)]

    # Add variables  @NOTE: For Rel. 20.7 only
    #data['rho']    = pd.Series(np.log(np.square(data['m']) / np.square(data['pt'])), index=data.index)
    #data['rhoDDT'] = pd.Series(np.log(np.square(data['m']) / data['pt'] / 1.),       index=data.index)
    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Check variable distributions
    axes = {
        'pt':   (45, 200, 2000),
        'm':    (50, 50, 300),
        'rho':  (50, -8, 0),
        'logm': (50, np.log(50), np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)
    msk_pt = (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        bins = np.linspace(axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)
        for adv in [0, 1]:
            msk = data['signal'] == 0  # @TEMP signal
            msk &= msk_pt
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            if adv:
                h1 = c.hist(data.loc[msk, var].values, bins=bins,
                            weights=data.loc[msk, weight].values, **opts)
            else:
                h2 = c.hist(data.loc[msk, var].values, bins=bins,
                            weights=data.loc[msk, 'weight_test'].values, **opts)
                pass
            pass

        # Ratio
        c.pads()[1].ylim(0, 2)
        c.ratio_plot((h1, h2), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        #c.logy()
        c.text(TEXT + ['p_{{T}} #in [{:.0f}, {:.0f}] GeV'.format(pt_range[0], pt_range[1])],
               qualifier=QUALIFIER)

        # Save
        mkdir('figures/distributions')
        c.save('figures/distributions/incl_{}.pdf'.format(var))
        pass

    # 2D histograms
    msk = data['signal'] == 0
    axisvars = sorted(list(axes))
    for i, varx in enumerate(axisvars):
        for vary in axisvars[i + 1:]:

            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk, [varx, vary]].values,
                                 100. * data.loc[msk, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))
            pass
        pass

    return
def main(args):
    # Initialising
    # --------------------------------------------------------------------------
    args, cfg = initialise(args)

    # Loading data
    # --------------------------------------------------------------------------
    data, features, _ = load_data(args.input + 'data_1M_10M.h5')
    #data = data.sample(frac=0.5, random_state=32)  # @TEMP
    data = data[data['train'] == 1]

    # Reduce size of data
    drop_features = [feat for feat in list(data)
                     if feat not in features + ['m', 'signal', 'weight_adv']]
    data.drop(drop_features, axis=1, inplace=True)  # drop in place to actually reduce the size

    cfg['uBoost']['train_features'] = features
    cfg['uBoost']['random_state'] = SEED
    cfg['DecisionTreeClassifier']['random_state'] = SEED

    # Arrays
    X = data
    #print(X.head())
    w = np.array(data['weight_adv']).flatten()
    y = np.array(data['signal']).flatten()

    # Fit uBoost classifier
    # --------------------------------------------------------------------------
    with Profile("Fitting uBoost classifier"):

        # @NOTE: There might be an issue with the sample weights, because the
        #        local efficiencies computed using kNN do not seem to take the
        #        sample weights into account.
        #
        #        See:
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/uboost.py#L247-L248
        #        and
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/metrics_utils.py#L159-L176
        #        with `divided_weights` not set.
        #
        #        `sample_weight` seems to be used only as a starting point for
        #        the boosting, and so not for the efficiency calculation.
        #
        #        If this is indeed the case, it would be possible to simply
        #        sample MC events by their weight, and use `sample_weight = 1`
        #        for all samples passed to uBoost.
        #
        # @NOTE: I have gotten less sure of the above, so probably no panic.

        def train_uBoost(X, y, w, cfg, uniforming_rate):
            """Train a single uBoostBDT for the given uniforming rate."""

            # Create base classifier
            base_tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

            # Update training configuration
            these_cfg = dict(**cfg['uBoost'])
            these_cfg['uniforming_rate'] = uniforming_rate

            # Create uBoost classifier
            uboost = uBoostBDT(base_estimator=base_tree, **these_cfg)

            # Fit uBoost classifier
            uboost.fit(X, y, sample_weight=w)

            return uboost

        #uniforming_rates = [0.0, 0.01, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0]
        uniforming_rates = [0.0, 0.01, 0.1, 0.3, 0.5, 1.0]
        #uniforming_rates = [0.5, 1.0]
        n_jobs = min(7, len(uniforming_rates))

        jobs = [delayed(train_uBoost, check_pickle=False)(X, y, w, cfg, uniforming_rate)
                for uniforming_rate in uniforming_rates]

        result = Parallel(n_jobs=n_jobs, backend="threading")(jobs)
        pass

    # Saving classifiers
    # --------------------------------------------------------------------------
    for uboost, uniforming_rate in zip(result, uniforming_rates):
        with Profile("Saving classifiers"):

            # Ensure model directory exists
            mkdir('models/uboost/')

            suffix_ur = "ur_{:s}".format(("%.2f" % uniforming_rate).replace('.', 'p'))
            suffix_te = "te_{:d}".format(int(cfg['uBoost']['target_efficiency'] * 100))

            # Save uBoost classifier
            with gzip.open('models/uboost/uboost_{}_{}_rel21_fixed_def_cfg_1000boost.pkl.gz'.format(suffix_ur, suffix_te), 'w') as f:
                pickle.dump(uboost, f)
                pass
            pass
        pass

    return 0
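# Read-back sketch (assumed, not repository code): load one of the pickled
# classifiers and get per-boosting-stage outputs, as used by the learning-curve
# study earlier in this section. The path follows the naming scheme used when
# saving above; `staged_predict_proba` is hep_ml's sklearn-style iterator.
import gzip
import pickle

def load_uboost_sketch(path):
    with gzip.open(path, 'r') as f:
        return pickle.load(f)

# uboost = load_uboost_sketch('models/uboost/uboost_ur_0p30_te_92_rel21_fixed_def_cfg_1000boost.pkl.gz')
# stages = [p[:, 1] for p in uboost.staged_predict_proba(X)]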
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    #data = np.zeros(1, 95213009, 10)
    data, features, _ = load_data('data/djr_LCTopo_2.h5')  # + args.input) #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5')  # + args.input) #, test=True)
    #data = np.concatenate((data1, data2))
    #f1 = h5py.File('data/djr_LCTopo_1.h5', 'r')
    #f2 = h5py.File('data/djr_LCTopo_2.h5', 'r')

    knnCut = 0
    ntrkCut = 50
    emfracCut = 0.65
    scale = 139 * 1000000  # inverse nanobarn
    signal_to_plot = 7

    sigDict = {
        0:  'All Models',
        1:  'Model A, m = 2 TeV',
        2:  'Model A, m = 1 TeV',
        3:  'Model A, m = 1.5 TeV',
        4:  'Model A, m = 2.5 TeV',
        5:  'Model B, m = 1 TeV',
        6:  'Model B, m = 1.5 TeV',
        7:  'Model B, m = 2 TeV',
        8:  'Model B, m = 2.5 TeV',
        9:  'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    outHistFile = ROOT.TFile.Open(
        "figures/mjjHistograms_kNN{}_eff{}.root".format(knnCut, kNN_eff), "RECREATE")

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[signal_to_plot])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = ['lead_'+base_var, 'sub_'+base_var]
    #kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    print data.shape

    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, base_var, kNN_eff, sigModel))
        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)
        pass

    """
    base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    kNN_var = [var.replace('jet', 'knn') for var in base_var]
    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        for i in range(len(base_var)):
            add_knn(data, newfeat=kNN_var[i], path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL))
            print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL)
    """

    weight = 'weight'  # 'weight_test' / 'weight'
    bins_pt = np.linspace(450, 3500, 40)
    bins_mjj = np.linspace(0, 8000, 80)

    # Useful masks
    msk_bkg = data['signal'] == 0
    if signal_to_plot == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == signal_to_plot
    #msk_weight = data['weight'] < 0.2

    msk_knn = (data['lead_knn_ungrtrk500'] > knnCut) & (data['sub_knn_ungrtrk500'] > knnCut)
    msk_ungr = (data['lead_jet_ungrtrk500'] > ntrkCut) & (data['sub_jet_ungrtrk500'] > ntrkCut)
    msk_emfrac = (data['lead_jet_EMFrac'] < emfracCut) & (data['sub_jet_EMFrac'] < emfracCut)
    msk_knn_1 = data['lead_knn_ungrtrk500'] > knnCut
    msk_ungr_1 = data['lead_jet_ungrtrk500'] > ntrkCut
    #msk_knn = data['knn_ungrtrk500'] > knnCut
    #msk_ungr = data['jet_ungrtrk500'] > 90.0

    msk_ntrkBkg = msk_ungr & msk_emfrac & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_ntrkSig = msk_ungr & msk_emfrac & msk_sig  #& msk_pt & msk_m & msk_eta
    msk_knnBkg = msk_knn & msk_bkg
    msk_knnSig = msk_knn & msk_sig
    msk_ntrkBkg1 = msk_ungr_1 & msk_bkg  #& msk_weight
    msk_ntrkSig1 = msk_ungr_1 & msk_sig
    msk_knnBkg1 = msk_knn_1 & msk_bkg
    msk_knnSig1 = msk_knn_1 & msk_sig
    msk_inclBkg = msk_bkg
    msk_inclSig = msk_sig

    # Mjj dist with cut on ntrk, ungrtrk compared to inclusive selection
    c = rp.canvas(batch=True)
    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values,
                          label="Multijets, Inclusive", normalise=True,
                          linecolor=ROOT.kGreen + 2, linewidth=3)
    hist_knnBkg = c.hist(data.loc[msk_knnBkg, 'dijetmass'].values, bins=bins_mjj,
                         weights=scale * data.loc[msk_knnBkg, weight].values,
                         label="Multijets, n_{{trk}}^{{#epsilon}}>{}".format(knnCut),
                         normalise=True, linecolor=ROOT.kMagenta + 2, linestyle=2, linewidth=3)
    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, n_{{trk}}>{}".format(ntrkCut),
                          normalise=True, linecolor=ROOT.kOrange + 2, linestyle=2, linewidth=3)
    #hist_CRBkg = c.hist(data.loc[msk_CR_bkg, 'dijetmass'].values, bins=bins_mjj, weights=scale*data.loc[msk_CR_bkg, weight].values, label="CR Bkg, C<20", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    #c.save('figures/distributions/mjj_Bkg_CR20.pdf')
    #c.save('figures/distributions/mjj_Bkg_CR20.eps')
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.pdf'.format(ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.eps'.format(ntrkCut, knnCut, FIT))
    del c

    c = rp.canvas(batch=True)
    hist_Sig = c.hist(data.loc[msk_sig, 'dijetmass'].values, bins=bins_mjj,
                      weights=data.loc[msk_sig, weight].values,
                      label="Model A, m = 2 TeV, inclusive", normalise=True,
                      linecolor=ROOT.kGreen + 2)
    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values,
                         label="Model A, m = 2 TeV, #it{{n}}_{{trk}}^{{#epsilon}}>{}".format(knnCut),
                         normalise=True, linecolor=ROOT.kMagenta + 2, linestyle=2)
    hist_ntrkSig = c.hist(data.loc[msk_ntrkSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_ntrkSig, weight].values,
                          label="Model A, m = 2 TeV, #it{{n}}_{{trk}}>{}".format(ntrkCut),
                          normalise=True, linecolor=ROOT.kOrange + 2, linestyle=2)
    #hist_CRSig = c.hist(data.loc[msk_CR_sig, 'dijetmass'].values, bins=bins_mjj, weights=data.loc[msk_CR_sig, weight].values, label="Sig, CR", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.pdf'.format(ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.eps'.format(ntrkCut, knnCut, FIT))
    del c

    c = rp.canvas(batch=True)
    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values,
                         label="Model A, m = 2 TeV, knn_ntrk>{}".format(knnCut),
                         normalise=False, linecolor=ROOT.kBlue + 1, linestyle=1)
    hist_knnBkg = c.hist(data.loc[msk_knnBkg, 'dijetmass'].values, bins=bins_mjj,
                         weights=scale * data.loc[msk_knnBkg, weight].values,
                         label="Multijets, knn_ntrk>{}".format(knnCut),
                         normalise=False, linecolor=ROOT.kMagenta + 2, linestyle=2)
    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, ntrk>{}".format(ntrkCut),
                          normalise=False, linecolor=ROOT.kOrange + 2, linestyle=2)

    c.legend(width=0.4, xmin=0.3, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.pdf'.format(knnCut, FIT))
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.eps'.format(knnCut, FIT))

    bins_mjj = np.linspace(0, 10000, 50)

    # Unscaled histograms for calculating efficiencies
    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values, normalise=False)
    hist_inclSig = c.hist(data.loc[msk_inclSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_inclSig, weight].values, normalise=False)
    hist_ntrkSig = c.hist(data.loc[msk_ntrkSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_ntrkSig, weight].values, normalise=False)
    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values, normalise=False)
    hist_ntrkSig1 = c.hist(data.loc[msk_ntrkSig1, 'dijetmass'].values, bins=bins_mjj,
                           weights=data.loc[msk_ntrkSig1, weight].values, normalise=False)
    hist_ntrkBkg1 = c.hist(data.loc[msk_ntrkBkg1, 'dijetmass'].values, bins=bins_mjj,
                           weights=data.loc[msk_ntrkBkg1, weight].values, normalise=False)
    hist_knnBkg1 = c.hist(data.loc[msk_knnBkg1, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_knnBkg1, weight].values, normalise=False)
    hist_knnSig1 = c.hist(data.loc[msk_knnSig1, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_knnSig1, weight].values, normalise=False)

    print "Bkg inclusive integral: ", hist_inclBkg.GetEffectiveEntries()
    print "Sig inclusive integral: ", hist_inclSig.GetEffectiveEntries()
    print "Bkg pass kNN eff entries / integral: ", hist_knnBkg.GetEffectiveEntries(), hist_knnBkg.Integral()
    print "Sig pass kNN eff entries / integral: ", hist_knnSig.GetEffectiveEntries(), hist_knnSig.Integral()
    print "Bkg pass ntrk eff entries / integral: ", hist_ntrkBkg.GetEffectiveEntries(), hist_ntrkBkg.Integral()
    print "Sig pass ntrk eff entries / integral: ", hist_ntrkSig.GetEffectiveEntries(), hist_ntrkSig.Integral()

    print "Bkg Eff. knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnBkg.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnSig.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()
    print "Bkg Eff. knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnBkg.Integral() / hist_inclBkg.Integral()
    print "Sig Eff. knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnSig.Integral() / hist_inclSig.Integral()

    print "Bkg Eff. ntrk>{}, eff. entries: ".format(ntrkCut), 100 * hist_ntrkBkg.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. ntrk>{}, eff. entries: ".format(ntrkCut), 100 * hist_ntrkSig.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()

    print "Bkg Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnBkg1.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnSig1.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()
    print "Bkg Eff. 1 jet knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnBkg1.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnSig1.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()

    outHistFile.cd()
    hist_knnBkg.SetName("bkg_knn")
    hist_knnSig.SetName("sig_knn")
    hist_knnBkg.Write()
    hist_knnSig.Write()
    outHistFile.Close()

    # Mjj dist for CR compared to inclusive selection
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    pt_bins = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = [None] + zip(pt_bins[:-1], pt_bins[1:])
    vars = ['m', 'pt']
    for var, pt_bin, log in itertools.product(vars, pt_bins, [True, False]):
        if var == 'm':
            bins = np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True)
        else:
            bins = np.linspace(200, 2000, (2000 - 200) // 50 + 1, endpoint=True)
            pass

        histstyle[True]['label'] = 'Training weight'
        histstyle[False]['label'] = 'Testing weight'

        # Canvas
        c = rp.canvas(batch=True)

        # Plots
        if pt_bin is not None:
            msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        else:
            msk = np.ones(data.shape[0], dtype=bool)
            pass

        if pt_bin is not None:
            c.hist(data[var].values[msk], bins=bins, weights=data['weight_test'].values[msk],
                   normalise=True, **histstyle[False])
            c.hist(data[var].values[msk], bins=bins, weights=data['weight_adv'].values[msk],
                   normalise=True, **histstyle[True])
            #c.hist(data[var].values,      bins=bins, weights=data['weight_adv'].values,       normalise=True, **histstyle[True])
            #c.hist(data[var].values[msk], bins=bins, weights=data['weight_adv'].values[msk],  normalise=True, **histstyle[False])
            #c.hist(data[var].values[msk], bins=bins, weights=data['weight_test'].values[msk], normalise=True, label="Testing weight", linewidth=2, linecolor=ROOT.kGreen)
        else:
            c.hist(data[var].values[msk], bins=bins, weights=data['weight_test'].values[msk],
                   normalise=True, **histstyle[False])
            c.hist(data[var].values[msk], bins=bins, weights=data['weight_adv'].values[msk],
                   normalise=True, **histstyle[True])
            pass

        # Decorations
        c.text(TEXT + ["Multijets", "Training dataset"] +
               (['p_{{T}} #in [{:.0f}, {:.0f}] GeV'.format(*pt_bin)] if pt_bin is not None else []),
               qualifier='Simulation Internal')
        c.legend()
        c.xlabel("Large-#it{{R}} jet {:s} [GeV]".format('mass' if var == 'm' else 'p_{T}'))
        c.ylabel("Fraction of jets")
        if log:
            c.logy()
            pass

        # Save
        c.save('figures/weighting_{}{:s}{}.pdf'.format(
            'mass' if var == 'm' else var,
            '_pT{:.0f}_{:.0f}'.format(*pt_bin) if pt_bin is not None else '',
            '_log' if log else ''))
        pass

    return  # @NOTE: Everything below is currently unreachable.

    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Check variable distributions
    axes = {
        'pt':   (45, 200, 2000),
        'm':    (50, 50, 300),
        'rho':  (50, -8, 0),
        'logm': (50, np.log(50), np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)
    msk_pt = (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        bins = np.linspace(axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)
        for adv in [0, 1]:
            msk = data['signal'] == 0  # @TEMP signal
            msk &= msk_pt
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            if adv:
                h1 = c.hist(data.loc[msk, var].values, bins=bins,
                            weights=data.loc[msk, weight].values, **opts)
            else:
                h2 = c.hist(data.loc[msk, var].values, bins=bins,
                            weights=data.loc[msk, 'weight_test'].values, **opts)
                pass
            pass

        # Ratio
        c.pads()[1].ylim(0, 2)
        c.ratio_plot((h1, h2), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        #c.logy()
        c.text(TEXT + ['p_{{T}} #in [{:.0f}, {:.0f}] GeV'.format(pt_range[0], pt_range[1])],
               qualifier=QUALIFIER)

        # Save
        mkdir('figures/distributions')
        c.save('figures/distributions/incl_{}.pdf'.format(var))
        pass

    # 2D histograms
    msk = data['signal'] == 0
    axisvars = sorted(list(axes))
    for i, varx in enumerate(axisvars):
        for vary in axisvars[i + 1:]:

            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk, [varx, vary]].values,
                                 100. * data.loc[msk, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))
            pass
        pass

    return
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input)  #, test=True)

    EFF = 0.5
    VAR = 'jet_ungrtrk500'
    VARX = 'dijetmass'
    FIT_RANGE = (0, 6000)  # Necessary?

    outFile = ROOT.TFile.Open(
        "figures/knn_jet_ungrtrk500_eff{}_data.root".format(EFF), "RECREATE")

    #eff_sig = 0.50
    #fpr, tpr, thresholds = roc_curve(data['signal'], data[kNN_basevar], sample_weight=data['weight'])
    #idx = np.argmin(np.abs(tpr - eff_sig))
    #print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., fpr[idx] * 100., kNN_basevar, thresholds[idx])
    #print "Chosen target efficiency: {:.2f}%".format(kNN_eff)

    weight = 'weight'  # 'weight_test' / 'weight'
    bins_mjj = np.linspace(100, 8000, 20)
    fineBins = np.linspace(100, 8000, 7900)
    fineBinsRe = fineBins.reshape(-1, 1)

    # Weighted (100 - EFF)-percentile of VAR in each mjj bin, background only
    percs = []
    for i in range(1, len(bins_mjj)):
        msk = (data[VARX] > bins_mjj[i - 1]) & (data[VARX] <= bins_mjj[i]) & (data['signal'] == 0)
        if np.sum(msk) > 20:  # Ensure sufficient statistics for a meaningful percentile
            percs.append(wpercentile(data=data.loc[msk, VAR].values,
                                     percents=100 - EFF,
                                     weights=data.loc[msk, weight].values))
        else:
            percs.append(0)

    print "Length of percs: ", len(percs), percs

    percs = percs[0:-1]
    bins_mjj = bins_mjj[0:-1]
    X = bins_mjj.reshape(-1, 1)
    X = X[1:len(bins_mjj)]
    print len(X), len(percs)

    # Fit parameters
    knn_neighbors = 2
    knn_weights = 'uniform'
    fit_deg = 1

    knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    y_knn = knn.fit(X, percs).predict(fineBinsRe)

    c = rp.canvas(batch=True)
    knnFit = c.plot(y_knn, bins=fineBins, linecolor=ROOT.kRed + 2, linewidth=2,
                    linestyle=1, label="knn fit, distance-weighted", option='L')
    c.save('figures/distributions/percentile_test.pdf')

    outFile.cd()
    knnFit.SetName("kNNfit")
    knnFit.Write()
    outFile.Close()
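# `wpercentile` is a repository helper not shown in this excerpt; a minimal
# weighted-percentile sketch with the same call signature (assumed) could be:
import numpy as np

def wpercentile_sketch(data, percents, weights=None):
    """Weighted percentile, with `percents` in [0, 100] (signature assumed)."""
    data = np.asarray(data, dtype=float)
    weights = np.ones_like(data) if weights is None else np.asarray(weights, dtype=float)
    idx = np.argsort(data)
    d, w = data[idx], weights[idx]
    # Cumulative weight fraction at each sorted sample
    cdf = (np.cumsum(w) - 0.5 * w) / np.sum(w)
    return np.interp(float(percents) / 100., cdf, d)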
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    #initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    #initialise_config(args, cfg)

    # Keras import(s)
    #import keras.backend as K
    #from keras.models import load_model

    # Project import(s)
    #from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    #data, features, _ = load_data(args.input + 'data.h5', test=True)
    data, features, _ = load_data(args.input + 'data.h5', test_full_signal=True)

    # Common definitions
    # --------------------------------------------------------------------------

    # -- k-nearest neighbour
    kNN_var_N2 = 'N_{2}-k#minusNN'
    kNN_var_tau21 = 'tau_{21}-k#minusNN'

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    #lambda_reg = 10.
    #lambda_regs = sorted([1., 3., 10.])
    #ann_vars = list()
    #lambda_strs = list()
    #for lambda_reg_ in lambda_regs:
    #    lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
    #    lambda_strs.append(lambda_str)
    #    ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
    #    ann_vars.append(ann_var_)
    #    pass
    #ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    #uboost_eff = 92
    #uboost_ur = 0.3
    #uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    #uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    #uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    #uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title = "decDeep"
    tagger_features = ['DeepWvsQCD', 'DeepWvsQCDDDT', 'DeepWvsQCD', 'DeepWvsQCDkNN',
                       'DeepWvsQCD', 'DeepWvsQCDCSS']
    title = "Deep"

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        ## Tau21DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        ## N2DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepWvsQCD-DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        # DeepWvsQCD-DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        ## Tau21-kNN
        #from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_tau21)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## N2-kNN
        #from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # DeepWvsQCD-kNN
        from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        add_knn(data, feat=kNN_basevar,
                path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## Tau21-CSS
        #from run.css.common import add_css
        #add_css("tau21", data)

        ## N2-CSS
        #from run.css.common import add_css
        #add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        # DeepWvsQCD-CSS
        from run.css.common import add_css
        add_css("DeepWvsQCD", data)
        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])  # 'npv' kept for the robustness study
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    #perform_studies(data, args, tagger_features, ann_vars, uboost_vars)
    perform_studies(data, args, tagger_features, title=title)

    return 0
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    # (Disabled) Optional NN-based initialisation: initialise the Keras backend
    # and config, load 'models/adversarial/classifier/full/classifier.h5', and
    # add an 'NN' variable via run.adversarial.common.add_nn.
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data[msk_bkg])

    # Add k-NN variable
    knnfeat = 'knn'
    add_knn(data, newfeat=knnfeat, path='models/knn/knn_{}_{}.pkl.gz'.format(VAR, EFF))

    # Load kNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Fill fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                                  range(vbins + 1),
                                  np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Define colour palette
        if sig:
            rgbs = [(247/255., 251/255., 255/255.), (222/255., 235/255., 247/255.),
                    (198/255., 219/255., 239/255.), (158/255., 202/255., 225/255.),
                    (107/255., 174/255., 214/255.), ( 66/255., 146/255., 198/255.),
                    ( 33/255., 113/255., 181/255.), (  8/255.,  81/255., 156/255.),
                    (  8/255.,  48/255., 107/255.)]
            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            rgbs = [(255/255.,  51/255.,   4/255.), (247/255., 251/255., 255/255.),
                    (222/255., 235/255., 247/255.), (198/255., 219/255., 239/255.),
                    (158/255., 202/255., 225/255.), (107/255., 174/255., 214/255.),
                    ( 66/255., 146/255., 198/255.), ( 33/255., 113/255., 181/255.),
                    (  8/255.,  81/255., 156/255.), (  8/255.,  48/255., 107/255.)]
            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(np.linspace(0, 1, nb_cols - 1, endpoint=True)
                                        * (1. - EFF / 100.) + EFF / 100.))
            pass
        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue, NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1, endpoint=True)
                for var in VARS]
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            msk_pass = data[knnfeat] < 0
            num = data.loc[msk & msk_bin & msk_pass, 'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            effs.append(num / den)
            pass

        # Fill profile
        for i, j in itertools.product(*map(range, shape)):

            # Bin edges in x and y
            edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

            # Masks
            msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                    for dim, var in enumerate(VARS)]
            msk_bin = reduce(lambda x, y: x & y, msks)
            data_ = data[msk & msk_bin]

            # Set non-zero bin content
            if np.sum(msk & msk_bin):
                msk_pass = data_[knnfeat] < 0
                num = data.loc[msk & msk_bin & msk_pass, 'weight_test'].values.sum()
                den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                eff = num / den
                profile.SetBinContent(i + 1, j + 1, eff)
                pass
            pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True) + " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " + latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" % (latex(VAR, ROOT=True), EFF))
        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"], ATLAS=False)

        # -- Efficiencies
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(xt, yt, "%s%.1f%%" %
                             ("#bar{#varepsilon}^{rel}_{%s} = " % ('sig' if sig else 'bkg')
                              if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",  -4.5, BOUNDS[0].Eval(-4.5) + 30, align=21, angle=-37,
                textsize=13, textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV", -2.5, BOUNDS[1].Eval(-2.5) - 30, align=23, angle=-57,
                textsize=13, textcolor=ROOT.kGray + 3)

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/knn_eff_{}_{:s}_{:.0f}.pdf'.format('sig' if sig else 'bkg', VAR, EFF))
        pass

    return
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    # Common definitions
    # --------------------------------------------------------------------------

    # -- k-nearest neighbour
    kNN_var = 'D2-k#minusNN'

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    tagger_features = ['Tau21', 'Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, path='models/ddt/ddt.pkl.gz')

        # D2-kNN
        from run.knn.common import add_knn, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # D2-CSS
        from run.css.common import add_css
        add_css("D2", data)

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model('models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(gmm_dimensions=len(DECORRELATION_VARIABLES),
                                        **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights('models/adversarial/combined/full/combined_lambda{}.h5'.format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # Adaboost/uBoost
        with Profile("Adaboost/uBoost"):
            from run.uboost.common import add_bdt
            for var, ur in zip(uboost_vars, uboost_urs):
                var = ('Adaboost' if ur == 0 else var)
                path = 'models/uboost/' + uboost_pattern.format(ur).replace('.', 'p') + '.pkl.gz'
                print "== Loading model for {}".format(var)
                add_bdt(data, var, path)
                pass

            # Remove `Adaboost` from scan list
            uboost_vars.pop(0)
            pass
        pass

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop in place; the returned copy was previously discarded
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars, uboost_vars)

    return 0
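# ------------------------------------------------------------------------------
# Added illustration: `meaningful_digits` keeps exactly enough decimals to show
# the leading significant digit of sub-unity numbers, which is how the lambda
# and alpha labels in the scan lists above are rendered. For example:
#
#     >>> [meaningful_digits(x) for x in [0.01, 0.3, 1.0, 10.0]]
#     ['0.01', '0.3', '1', '10']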
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5')  #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5')  #, test=True)
    #data = np.concatenate((data1, data2))

    sigNumber = 0

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 1 TeV',
        2: 'Model A, m = 1.5 TeV',
        3: 'Model A, m = 2 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars = [kNN_var]
    base_vars = ['lead_' + base_var, 'sub_' + base_var]
    kNN_vars = ['lead_' + kNN_var, 'sub_' + kNN_var]

    with Profile("Add variables"):
        # `sigModel` was undefined in the original; MODEL is imported from
        # run.knn.common, as done in the sibling scripts.
        from run.knn.common import add_knn, EFF as kNN_eff, MODEL as sigModel
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)
        pass

    # Check variable distributions
    weight = 'weight'  # 'weight_test' / 'weight'
    scale = 139 * 1000000  # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == sigNumber

    knnBins = np.linspace(-100, 200, 75, endpoint=True)

    for var in kNN_vars:

        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)
        c2 = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values, normalise=False)
        #h1_CR = c_tmp.hist(data.loc[msk_CR_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_CR_bkg, weight].values, normalise=False)
        #h2_CR = c_tmp.hist(data.loc[msk_CR_sig, var].values, bins=knnBins, weights=data.loc[msk_CR_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()
        #print "bkg. CR efficiency: ", h1_CR.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()
        #print "sig. CR efficiency: ", h2_CR.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.GetEffectiveEntries()))
        print "Sensitivity with no cut: ", normFactor

        ### sensitivity ###
        sensitivity = []
        bkg_eff_1jet = []
        i = 0
        for cut in knnBins:
            msk_pass = (data[kNN_vars[0]] > cut) & (data[kNN_vars[1]] > cut)
            msk_pass1 = data[kNN_vars[0]] > cut  # fixed: was `data[kNN_vars[0]>cut)`, a syntax error
            #msk_pass = (data[var]>cut)

            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass
            msk_bkg_pass1 = msk_bkg & msk_pass1
            msk_sig_pass1 = msk_sig & msk_pass1

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=knnBins, weights=data.loc[msk_sig_pass, weight].values, normalise=False)
            # fixed: weights now use the same (background) mask as the values;
            # the original mixed `msk_sig_pass` weights with `msk_bkg_pass1` values
            h1_pass1 = c_tmp.hist(data.loc[msk_bkg_pass1, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg_pass1, weight].values, normalise=False)

            if h2_incl.GetEffectiveEntries() > 0:  # and h1_pass.GetEffectiveEntries()>0:
                sensitivity.append(((h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()) / (3. / 2 + np.sqrt(h1_pass.GetEffectiveEntries()))) / normFactor)
                #print "bkg. eff. @ ", cut, ": ", h1_pass.GetEffectiveEntries()/h1_incl.GetEffectiveEntries()
                #print "signal eff. @ ", cut, ": ", h2_pass.GetEffectiveEntries()/h2_incl.GetEffectiveEntries()
            else:
                sensitivity.append(0)

            if h1_incl.GetEffectiveEntries() > 0:
                bkg_eff_1jet.append(h1_pass1.GetEffectiveEntries() / h1_incl.GetEffectiveEntries())
            else:
                bkg_eff_1jet.append(0)

            i = i + 1
            pass

        #c.pads()[0].ylim(0,0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100, 200)
        c.pads()[1].ylim(0, 30)
        c.pads()[1].xlim(-100, 200)
        c.pads()[1].graph(sensitivity, bins=knnBins)  #, oob=False)

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff))  #latex(var, ROOT=True))
        c.xlabel("n_{trk}^{#epsilon}")  #latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")  #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})/"
        c.pads()[1].text(["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})", ], xmin=0.2, ymax=0.80, ATLAS=False)

        c2.graph(sensitivity, bkg_eff_1jet)
        c2.xlabel("Single jet #varepsilon_B")
        c2.ylabel("Sensitivity gain")
        c2.text(["#epsilon=0.5 %", ], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        # fixed path ('figure/distribution/' -> 'figures/distributions/') and dropped the unused format arguments
        c2.save('figures/distributions/sensitivity_1jEfficiency.pdf')
        print 'figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff)
        pass

    # Plot also the normal ntrk distribution for cross check with Roland's result
    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == sigNumber
    #msk_weight = data['weight']<0.0002
    #msk_bkg = msk_bkg & msk_pt & msk_m & msk_eta
    #msk_sig = msk_sig & msk_pt & msk_m & msk_eta

    baseBins = np.linspace(0, 200, 75, endpoint=True)  #axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)

    for var in base_vars:

        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c.pads()[0].logy()
        c_tmp = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=baseBins, weights=data.loc[msk_sig, weight].values, normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=baseBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=baseBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=baseBins, weights=data.loc[msk_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.Integral()))
        #print "Sensitivity with no cut: ", normFactor

        ### sensitivity ###
        sensitivity = []
        i = 0
        for cut in baseBins:
            #print cut
            msk_pass = (data[base_vars[0]] > cut) & (data[base_vars[1]] > cut)
            #msk_pass = data[var]>cut

            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=baseBins, weights=scale * data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=baseBins, weights=data.loc[msk_sig_pass, weight].values, normalise=False)

            if h2_incl.Integral() > 0:  # and h1_pass.Integral()>0:
                sensitivity.append((h2_pass.Integral() / h2_incl.Integral()) / (3. / 2. + np.sqrt(h1_pass.Integral())) / normFactor)
                #print "signal eff. at ", cut, ": ", (h2_pass.Integral()/h2_incl.Integral())
                #print "bkg eff. at ", cut, ": ", (h1_pass.Integral()/h1_incl.Integral())
            else:
                sensitivity.append(0)

            i = i + 1
            pass

        c.pads()[1].ylim(0, 80)
        c.pads()[1].xlim(0, 200)
        c.pads()[1].graph(sensitivity, bins=baseBins)  #, oob=False)

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.xlabel("n_{trk}")  #latex(var, ROOT=True))
        c.pads()[1].ylabel("sensitivity gain")  #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})"
        c.pads()[1].text(["sensitivity = #epsilon_{S}/(#frac{3}{2} + #sqrt{B})", ], xmin=0.2, ymax=0.80, ATLAS=False)

        ### Save ###
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        pass
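# ------------------------------------------------------------------------------
# Added sketch: both sensitivity loops above evaluate the same figure of merit,
# eps_S / (3/2 + sqrt(B)), normalised to its no-cut value. A hedged refactoring
# of that arithmetic into one helper; the counts are stand-ins for the
# GetEffectiveEntries()/Integral() calls in the source.
import numpy as np

def sensitivity_gain(sig_pass, sig_incl, bkg_pass, bkg_incl):
    """Gain of eps_S / (3/2 + sqrt(B)) relative to the no-cut baseline."""
    if sig_incl <= 0:
        return 0.
    baseline = 1. / (3. / 2. + np.sqrt(bkg_incl))
    return (sig_pass / sig_incl) / (3. / 2. + np.sqrt(bkg_pass)) / baseline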
def main(args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input)

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, Model A, m = 2 TeV'

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]
    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]
    """
    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        for i in range(len(base_var)):
            add_knn(data, newfeat=kNN_var[i], path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL))
            print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL)
    """

    # Check variable distributions
    axes = {
        'jet_ungrtrk500': (50, 0, 100),
        #'lead_knn_ungrtrk500': (50, -100, 50),
        'jet_pt': (50, 0, 3000),
        'dijetmass': (50, 0, 7000),
    }

    scale = 139 * 1000000
    weight = 'weight'  # 'weight_test' / 'weight'

    msk_bkg = data['signal'] == 0  # @TEMP signal
    msk_sig = data['sigType'] == 1  # @TEMP signal
    #msk_weight = data['weight']<0.002
    #msk_bkg = msk_bkg & msk_weight
    #msk_CR = (data['lead_jet_ungrtrk500']<20) | (data['sub_jet_ungrtrk500']<20)

    ###### 3D histograms #######
    vary = 'jet_pt'
    varx = 'dijetmass'
    varz = 'jet_ungrtrk500'

    #for i,varx in enumerate(axisvars):
    #    for vary in axisvars[i+1:]:

    # Canvas
    can4 = rp.canvas(batch=True)
    pad = can4.pads()[0]._bare()
    pad.cd()
    pad.SetRightMargin(0.20)
    #can4 = ROOT.TCanvas("canvas", "", 800, 600)
    #can4.SetRightMargin(0.20)

    # Create, fill histogram
    h2_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_bkg, data.loc[msk_bkg, [varx, vary]].values)  #, scale*data.loc[msk_bkg, weight].values)  #*data.loc[msk_bkg, varz].values)
    #h2_bkg.Scale(1./h2_bkg.Integral())
    print h2_bkg.Integral()

    # Draw
    h2_bkg.Draw("COLZ")

    # Decorations
    h2_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    #h2_bkg.GetZaxis().SetTitle(latex(varz, ROOT=True))
    #pad.SetLogz()
    #can4.zlim(0.0, 0.04)
    h2_bkg.GetZaxis().SetRangeUser(0.0, 300000)

    # Save
    can4.save('figures/distributions/3d_{}_{}_{}_bkg.pdf'.format(varx, vary, varz))
    can4.save('figures/distributions/3d_{}_{}_{}_bkg.eps'.format(varx, vary, varz))

    # ntrk distribution
    """
    can1 = rp.canvas(batch=True)
    bins1 = np.linspace(0, 150, 75)
    h_ungrB = can1.hist(data.loc[msk_bkg, 'lead_jet_ungrtrk500'].values, bins=bins1, weights=data.loc[msk_bkg, weight].values, label='ungrtrk, bkg', normalise=True, linecolor=ROOT.kGreen+2)
    h_ungeS = can1.hist(data.loc[msk_sig, 'lead_jet_ungrtrk500'].values, bins=bins1, weights=data.loc[msk_sig, weight].values, label='ungrtrk, sig', normalise=True, linecolor=ROOT.kGreen+2, linestyle=2)
    can1.legend(width=0.3, xmin=0.6, ymax=0.9)
    can1.save('figures/distributions/ungrtrk_dist.pdf')
    can1.save('figures/distributions/ungrtrk_dist.eps')

    # 2D histograms
    axisvars = sorted(list(axes))
    varx = 'lead_jet_ungrtrk500'
    vary = 'sub_jet_ungrtrk500'
    #for i,varx in enumerate(axisvars):
    #    for vary in axisvars[i+1:]:

    # Canvas
    can3 = ROOT.TCanvas()
    can3.SetRightMargin(0.20)

    # Create, fill histogram
    h2_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    h2_sig = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_bkg, data.loc[msk_bkg, [varx, vary]].values, data.loc[msk_bkg, weight].values)
    root_numpy.fill_hist(h2_sig, data.loc[msk_sig, [varx, vary]].values, data.loc[msk_sig, weight].values)

    # Draw
    h2_bkg.Draw("COLZ")

    # Decorations
    h2_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can3.SetLogz()

    # Save
    can3.SaveAs('figures/distributions/2d_{}_{}_bkg.pdf'.format(varx, vary))
    can3.SaveAs('figures/distributions/2d_{}_{}_bkg.eps'.format(varx, vary))

    can6 = ROOT.TCanvas()
    can6.SetRightMargin(0.20)

    h2_sig.Draw("COLZ")

    # Decorations
    h2_sig.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_sig.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can6.SetLogz()

    # Save
    can6.SaveAs('figures/distributions/2d_{}_{}_sig.pdf'.format(varx, vary))
    can6.SaveAs('figures/distributions/2d_{}_{}_sig.eps'.format(varx, vary))

    ### Subleading vs. leading knn_ntrk
    varx = 'lead_knn_ungrtrk500'
    vary = 'sub_knn_ungrtrk500'

    # Canvas
    can4 = ROOT.TCanvas()
    can4.SetRightMargin(0.20)

    h2_C1_bkg = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_C1_bkg, data.loc[msk_bkg, [varx, vary]].values, 100. * data.loc[msk_bkg, weight].values)
    h2_C1_sig = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
    root_numpy.fill_hist(h2_C1_sig, data.loc[msk_sig, [varx, vary]].values, 100. * data.loc[msk_sig, weight].values)

    # Draw
    h2_C1_bkg.Draw("COLZ")

    # Decorations
    h2_C1_bkg.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_C1_bkg.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can4.SetLogz()
    can4.SaveAs('figures/distributions/2d_{}_{}_bkg.pdf'.format(varx, vary))
    can4.SaveAs('figures/distributions/2d_{}_{}_bkg.eps'.format(varx, vary))

    # Canvas
    can5 = ROOT.TCanvas()
    can5.SetRightMargin(0.20)

    # Draw
    h2_C1_sig.Draw("COLZ")

    # Decorations
    h2_C1_sig.GetXaxis().SetTitle(latex(varx, ROOT=True))
    h2_C1_sig.GetYaxis().SetTitle(latex(vary, ROOT=True))
    can5.SetLogz()
    can5.SaveAs('figures/distributions/2d_{}_{}_sig.pdf'.format(varx, vary))
    can5.SaveAs('figures/distributions/2d_{}_{}_sig.eps'.format(varx, vary))
    """

    return
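# ------------------------------------------------------------------------------
# Added sketch: a numpy cross-check for the weighted 2D fills done with
# `root_numpy.fill_hist` above. With identical bin edges, `np.histogram2d`
# should reproduce the in-range ROOT bin contents (ROOT additionally keeps
# under-/overflow). The `axes` dict format (nbins, min, max) follows the one
# defined in this script.
import numpy as np

def crosscheck_hist2d(df, msk, varx, vary, axes, weights=None):
    bx = np.linspace(axes[varx][1], axes[varx][2], axes[varx][0] + 1)
    by = np.linspace(axes[vary][1], axes[vary][2], axes[vary][0] + 1)
    H, _, _ = np.histogram2d(df.loc[msk, varx].values,
                             df.loc[msk, vary].values,
                             bins=(bx, by), weights=weights)
    return H  # H[i, j]: count (or weight sum) in x-bin i, y-bin j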
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input)  #, test=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, (x, percs, err) = fill_profile_1D(data[msk_bkg])
    weights = 1 / err

    # Add k-NN variable
    knnfeat = 'knn'
    orgfeat = VAR
    add_knn(data, newfeat=knnfeat, path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    # Loading KNN classifier
    knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    X = x.reshape(-1, 1)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8

        # Short-hands
        vbins, vmin, vmax = AXIS[VARX]

        # Re-binned bin edges  @TODO: Make standardised right away?
        # edges = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
        #                   range(vbins + 1),
        #                   np.linspace(vmin, vmax, vbins + 1, endpoint=True))
        fineBins = np.linspace(vmin, vmax, vbins * rebin + 1, endpoint=True)
        orgBins = np.linspace(vmin, vmax, vbins + 1, endpoint=True)

        # Re-binned bin centres
        fineCentres = fineBins[:-1] + 0.5 * np.diff(fineBins)
        orgCentres = orgBins[:-1] + 0.5 * np.diff(orgBins)

        # Get predictions evaluated at re-binned bin centres
        if 'erf' in FIT:
            fit = func(fineCentres, knn[0], knn[1], knn[2])
            print "Check: ", func([1500, 2000], knn[0], knn[1], knn[2])
        else:
            fit = knn.predict(fineCentres.reshape(-1, 1))  #centres.reshape(-1,1))

        # Fill ROOT "profile"
        profile_fit = ROOT.TH1F('profile_fit', "", len(fineBins) - 1, fineBins.flatten('C'))
        root_numpy.array2hist(fit, profile_fit)

        knn1 = PolynomialFeatures(degree=2)
        X_poly = knn1.fit_transform(X)
        reg = LinearRegression(fit_intercept=False)
        reg.fit(X_poly, percs, weights)
        score = round(reg.score(X_poly, percs), 4)
        coef = reg.coef_
        intercept = reg.intercept_
        print "COEFFICIENTS: ", coef, intercept

        TCoef = ROOT.TVector3(coef[0], coef[1], coef[2])
        outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_{}.root".format(FIT, EFF, MIN_STAT, MODEL), "RECREATE")
        outFile.cd()
        TCoef.Write()
        profile_fit.SetName("kNNfit")
        profile_fit.Write()
        outFile.Close()

        # profile_meas2 = ROOT.TH1F('profile_meas', "", len(x) - 1, x.flatten('C'))
        # root_numpy.array2hist(percs, profile_meas2)
        profile_meas2 = ROOT.TGraph(len(x), x, percs)
        pass

    # Plotting
    with Profile("Plotting"):
        # Plot
        plot(profile_meas2, profile_fit)
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    # MC weights are scaled with lumi. This is just for better comparison
    #if INPUT == "mc":
    #    data.loc[:, 'TotalEventWeight'] /= 139000000.

    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Define arrays
        shape = AXIS[VARX][0]
        bins = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0] + 1, endpoint=True)
        #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True)
        #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000])
        print "HERE: ", bins
        #x, y = (np.zeros(shape) for _ in range(2))

        # Create profile histograms (distinct names; both were called 'profile'
        # in the original, which makes ROOT replace the first one)
        profile_knn = ROOT.TH1F('profile_knn', "", len(bins) - 1, bins)  #.flatten('C')
        profile_org = ROOT.TH1F('profile_org', "", len(bins) - 1, bins)  #.flatten('C')

        # Compute inclusive efficiency in bins of `VARX`
        effs = list()
        for i in range(shape):
            msk_bin = (data[VARX] > bins[i]) & (data[VARX] <= bins[i + 1])
            msk_pass = data[knnfeat] > 0  # <?
            msk_pass_org = data[orgfeat] > 70  # <?
            num = data.loc[msk & msk_bin & msk_pass, 'TotalEventWeight'].values.sum()
            num_org = data.loc[msk & msk_bin & msk_pass_org, 'TotalEventWeight'].values.sum()
            den = data.loc[msk & msk_bin, 'TotalEventWeight'].values.sum()
            if den > 0:
                eff = num / den * 100.
                eff_org = num_org / den * 100.
                profile_knn.SetBinContent(i + 1, eff)
                profile_org.SetBinContent(i + 1, eff_org)
                effs.append(eff)
            #else:
            #    print i, "Density = 0"
            pass

        c = rp.canvas(batch=True)
        leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)
        leg.AddEntry(profile_knn, "#it{n}_{trk}^{#varepsilon=%s%%} > 0" % (EFF), "l")
        leg.AddEntry(profile_org, "#it{n}_{trk} > 70", "l")

        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.10)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile_knn.SetLineColor(rp.colours[1])
        profile_org.SetLineColor(rp.colours[2])
        profile_knn.SetMarkerStyle(24)
        profile_knn.GetXaxis().SetTitle("#it{m}_{jj} [GeV]")  #latex(VARX, ROOT=True) + " [GeV]")
        #profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True))  # + " = log(m^{2}/p_{T}^{2})")
        profile_org.GetYaxis().SetTitle("Selection efficiency (%)")  # for #it{n}_{trk}^{#varepsilon=%s%%}>0" % (EFF)
        profile_knn.GetYaxis().SetNdivisions(505)
        #profile_knn.GetXaxis().SetNdivisions(505)
        profile_knn.GetXaxis().SetTitleOffset(1.4)
        profile_knn.GetYaxis().SetTitleOffset(1.8)
        profile_knn.GetXaxis().SetRangeUser(*XRANGE)
        profile_org.GetXaxis().SetRangeUser(*XRANGE)

        yrange = (0., EFF * 3)  # 2.0 percent
        if yrange:
            profile_knn.GetYaxis().SetRangeUser(*yrange)
            profile_org.GetYaxis().SetRangeUser(*yrange)
            pass

        # Draw (legend last, so it is not wiped by the histogram draw calls;
        # the original drew it first)
        profile_org.Draw()
        profile_knn.Draw("same")
        leg.Draw()

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.pdf'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL + INPUT, MIN_STAT))
        #c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.png'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL, MIN_STAT))
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.eps'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL + INPUT, MIN_STAT))
        del c
        pass

    return
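# ------------------------------------------------------------------------------
# Added sketch (assumptions flagged): the quadratic percentile fit above is
# persisted as an *unnamed* TVector3 of coefficients (constant, linear,
# quadratic -- with `fit_intercept=False` and a bias column, coef[0] is the
# constant term). Reading it back might look like this; the key 'TVector3' is
# an assumption based on ROOT's default of keying unnamed objects by class name.
import ROOT

def load_quadratic_cut(path):
    f = ROOT.TFile.Open(path, 'READ')
    vec = f.Get('TVector3')  # assumed default key; adjust if written with an explicit name
    c0, c1, c2 = vec.X(), vec.Y(), vec.Z()
    f.Close()
    return lambda x: c0 + c1 * x + c2 * x * x  # cut value as a function of m_jj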
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Common definitions
    experiment = 'classifier'
    paths = sorted(glob.glob('optimisation/{}/output/*.out'.format(experiment)))
    num_steps = 100

    # Loop all run outputs
    means, stds, results = list(), list(), list()
    for path in paths[:num_steps]:

        # Run-log
        with open(path, 'r') as f:
            lines = [l for l in f]
            pass

        # Number of training epochs, to identify the last one
        num_epochs = max(map(int, map(lambda l: l.split('/')[-1],
                                      filter(lambda l: re.search(r'^Epoch [\d]+/[\d]+ *$', l), lines))))

        # Indices of line holding the results for the last training epoch in each CV fold
        try:
            indices = np.array(zip(*filter(lambda t: 'Epoch {e}/{e}'.format(e=num_epochs) in t[1],
                                           enumerate(lines)))[0]) + 1
        except IndexError:
            continue

        # Validation losses for each CV fold
        val_losses = list()
        for idx in indices:
            fields = lines[idx].split()
            jdx = fields.index('val_loss:') + 1
            val_loss = float(fields[jdx])
            val_losses.append(val_loss)
            pass

        # Append results for current evaluation
        means.append(np.mean(val_losses))
        stds.append(np.std(val_losses))
        pass

    # Check losses
    print "Optimisation metrics, sorted by mean + 1 sigma, for robustness:"
    for i, m, s in sorted(zip(range(len(means)), means, stds), key=lambda t: t[1] + t[2]):
        print "  [{:3d}] {:7.4f} ± {:6.4f}".format(i + 1, m, s)
        pass

    # Compute running, best mean
    means = np.array(means)
    stds = np.array(stds)
    bins = np.arange(len(means), dtype=np.float) + 1
    best_mean = np.array([np.min(means[:i + 1]) for i in range(len(means))])
    idx_improvements = [0] + list(np.where(np.abs(np.diff(best_mean)) > 0)[0] + 1)

    # Create graph
    graph = TGraphErrors(len(bins), bins, means, bins * 0, stds)

    # Plot
    plot(experiment, means, graph, idx_improvements, best_mean, bins)

    return 0
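# ------------------------------------------------------------------------------
# Added illustration: the parsing above keys on the literal 'val_loss:' token
# of a Keras progress line. On a made-up line of that shape:
#
#     >>> line = "160000/160000 [====] - 5s - loss: 0.2134 - val_loss: 0.2201"
#     >>> fields = line.split()
#     >>> float(fields[fields.index('val_loss:') + 1])
#     0.2201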
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', test_full_signal=True)

    #variable = VAR_TAU21
    variable = VAR_N2
    #variable = VAR_DECDEEP
    #variable = VAR_DEEP

    if variable == VAR_N2:
        fit_range = FIT_RANGE_N2
    elif variable == VAR_TAU21:
        fit_range = FIT_RANGE_TAU21
    elif variable == VAR_DECDEEP:
        fit_range = FIT_RANGE_DECDEEP
    elif variable == VAR_DEEP:
        fit_range = FIT_RANGE_DEEP
    else:
        print "invalid variable"
        return 0

    # Add DDT variable
    add_ddt(data, feat=variable, path='models/ddt/ddt_{}.pkl.gz'.format(variable))

    # Load transform
    ddt = loadclf('models/ddt/ddt_{}.pkl.gz'.format(variable))

    # --------------------------------------------------------------------------
    # 1D plot

    # Define variable(s)
    msk = data['signal'] == 0

    # Fill profiles
    profiles = dict()
    for var in [variable, variable + 'DDT']:
        profiles[var] = fill_profile(data[msk], var)
        pass

    # Convert to graphs
    graphs = dict()
    for key, profile in profiles.iteritems():

        # Create arrays from profile
        arr_x, arr_y, arr_ex, arr_ey = array('d'), array('d'), array('d'), array('d')
        for ibin in range(1, profile.GetXaxis().GetNbins() + 1):
            if profile.GetBinContent(ibin) != 0. or profile.GetBinError(ibin) != 0.:
                arr_x.append(profile.GetBinCenter(ibin))
                arr_y.append(profile.GetBinContent(ibin))
                arr_ex.append(profile.GetBinWidth(ibin) / 2.)
                arr_ey.append(profile.GetBinError(ibin))
                pass
            pass

        # Create graph
        graphs[key] = ROOT.TGraphErrors(len(arr_x), arr_x, arr_y, arr_ex, arr_ey)
        pass

    # Plot 1D transform
    plot1D(graphs, ddt, arr_x, variable, fit_range)

    # --------------------------------------------------------------------------
    # 2D plot

    # Create contours
    binsx = np.linspace(1.5, 5.0, 40 + 1, endpoint=True)
    if variable == VAR_N2:
        binsy = np.linspace(0.0, 0.8, 40 + 1, endpoint=True)
    else:
        binsy = np.linspace(0.0, 1.4, 40 + 1, endpoint=True)

    contours = dict()
    for sig in [0, 1]:

        # Get signal/background mask
        msk = data['signal'] == sig

        # Normalise jet weights
        w = data.loc[msk, VAR_WEIGHT].values
        w /= math.fsum(w)

        # Prepare inputs
        X = data.loc[msk, [VAR_RHODDT, variable]].values

        # Fill, store contour
        contour = ROOT.TH2F('2d_{}'.format(sig), "", len(binsx) - 1, binsx, len(binsy) - 1, binsy)
        root_numpy.fill_hist(contour, X, weights=w)
        contours[sig] = contour
        pass

    # Linear discriminant analysis (LDA)
    lda = LinearDiscriminantAnalysis()
    X = data[[VAR_RHODDT, variable]].values
    y = data['signal'].values
    w = data[VAR_WEIGHT].values
    p = w / math.fsum(w)
    indices = np.random.choice(y.shape[0], size=int(1E+06), p=p, replace=True)
    lda.fit(X[indices], y[indices])  # Fit weighted sample

    # -- Linear fit to decision boundary
    xx, yy = np.meshgrid(binsx, binsy)
    Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    yboundary = binsy[np.argmin(np.abs(Z - 0.5), axis=0)]
    xboundary = binsx
    lda = LinearRegression()
    lda.fit(xboundary.reshape(-1, 1), yboundary)

    # Plot 2D scatter
    plot2D(data, ddt, lda, contours, binsx, binsy, variable)

    return
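# ------------------------------------------------------------------------------
# Added sketch: the 2D step above reduces the LDA decision surface to a line by
# locating, for each x-column of the grid, the y-bin where the signal
# probability crosses 0.5, and fitting a straight line to those points.
# Isolated here for clarity:
import numpy as np
from sklearn.linear_model import LinearRegression

def lda_boundary_line(lda, binsx, binsy):
    xx, yy = np.meshgrid(binsx, binsy)
    proba = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
    yboundary = binsy[np.argmin(np.abs(proba - 0.5), axis=0)]  # p = 0.5 crossing per column
    line = LinearRegression().fit(binsx.reshape(-1, 1), yboundary)
    return line  # line.coef_[0] (slope) and line.intercept_ define the boundary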
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test_full_signal=True)
    #data, features, _ = load_data(args.input + 'data.h5', train_full_signal=True)  # for faster checking; don't use for actual comparison

    # Common definitions
    # --------------------------------------------------------------------------
    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title = "ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title = "decDeep"
    #tagger_features = {'tau21': ['', 'DDT'], 'N2_B1': ['', 'kNN', 'CSS']}; title = 'ATLAS2'
    #tagger_features = {'tau21': ['', 'DDT'], 'N2_B1': ['', 'kNN'], 'decDeepWvsQCD': ['', 'kNN'], 'DeepWvsQCD': ['', 'kNN']}; title = 'Deep_vs_Analytic'
    #tagger_features = {'tau21': [''], 'N2_B1': [''], 'decDeepWvsQCD': [''], 'DeepWvsQCD': ['']}; title = 'Deep_Check2'
    tagger_features = {
        'tau21': ['', 'DDT', 'kNN', 'CSS'],
        'N2_B1': ['', 'DDT', 'kNN', 'CSS']
    }
    title = 'Corrected_Full_Analytic'
    #tagger_features = {'tau21': ['', 'DDT', 'kNN', 'CSS'], 'N2_B1': ['', 'DDT', 'kNN', 'CSS']}; title = 'Full_Analytic_vs_Atlas'

    extracted_features = []
    for basevar in tagger_features.keys():
        for suffix in tagger_features[basevar]:
            extracted_features.append(basevar + suffix)

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):
        # The selection of which variables to add could also be automated from
        # the `tagger_features` collection...

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        # N2DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        ## DeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        # Tau21-kNN
        from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'tau_{21}-k#minusNN')
        add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # N2-kNN
        from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'N_{2}-kNN')
        add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'decDeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## DeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'DeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # Tau21-CSS
        from run.css.common import add_css
        add_css("tau21", data)

        # N2-CSS
        from run.css.common import add_css
        add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        ## DeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("DeepWvsQCD", data)
        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])
    used_variables = set(extracted_features + ['m', 'pt', 'weight_test', 'npv'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # drop in place; the returned copy was previously discarded
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, extracted_features, title=title)

    return 0