def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', sample=0.01)  # @TEMP

    # Define classifier configuration(s)
    pattern = 'uboost_ur_{:4.2f}_te_92_rel21_fixed'
    urs = sorted([0.0, 0.01, 0.1, 0.3])
    classifiers = [
        ('AdaBoost' if ur == 0 else 'uBoost (#alpha={:4.2f})'.format(ur),
         pattern.format(ur).replace('.', 'p'))
        for ur in urs
    ]

    # Compute classifier variables in parallel
    njobs = min(7, len(classifiers))
    with Profile("Run tests in parallel"):
        ret = Parallel(n_jobs=njobs)(
            delayed(compute)(data, name) for _, name in classifiers)
        pass

    # Add classifier variables to data
    for name, staged_series in ret:
        for stage, series in enumerate(staged_series):
            data['{:s}__{:d}'.format(name, stage)] = series
            pass
        pass

    # Plot learning curves
    plot(data, urs, classifiers)

    return 0
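# `compute(data, name)` is defined elsewhere in this script; a minimal sketch of
# the assumed behaviour -- load the gzipped, pickled classifier saved by the
# training script and return its staged, per-epoch signal probabilities. The
# hep_ml/scikit-learn `staged_predict_proba` interface and the `models/uboost/`
# naming are assumptions based on the rest of this repo:
import gzip
import pickle

def compute(data, name):
    # Load the trained (u)BoostBDT classifier by its file-name tag
    with gzip.open('models/uboost/{}.pkl.gz'.format(name), 'r') as f:
        clf = pickle.load(f)
        pass

    # One P(signal) series per boosting stage/epoch
    staged_series = [p[:, 1] for p in clf.staged_predict_proba(data)]
    return name, staged_series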
def plot(data, urs, classifiers):
    """
    Common method to perform tests on named uBoost/AdaBoost classifiers.
    """

    # Plotting learning process
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    with Profile("Plotting learning process"):
        for alpha, (title, name) in zip(urs, classifiers):
            if title == 'AdaBoost':
                continue
            print "===", name, title

            # Get training/test split masks
            msk_train = data['train'] == 1
            msk_test = data['train'] == 0

            # Get target and weight arrays
            y_train = data.loc[msk_train, 'signal'].values.flatten()
            y_test = data.loc[msk_test, 'signal'].values.flatten()
            w_train = data.loc[msk_train, 'weight_adv'].values.flatten()
            w_test = data.loc[msk_test, 'weight_adv'].values.flatten()

            # Compute log-loss for each epoch
            ll_ab_train, ll_ab_test = list(), list()
            ll_ub_train, ll_ub_test = list(), list()
            nb_epochs = len(filter(lambda col: col.startswith(name), data.columns))
            x = np.arange(nb_epochs)
            for epoch in range(nb_epochs):

                # -- Get column names for current epoch
                col_ab = '{:s}__{:d}'.format(classifiers[0][1], epoch)  # Assuming `AdaBoost` is the first classifier
                col_ub = '{:s}__{:d}'.format(name, epoch)

                # -- Get classifier variables for current epoch
                p_ab_train = data.loc[msk_train, col_ab]
                p_ab_test = data.loc[msk_test, col_ab]
                p_ub_train = data.loc[msk_train, col_ub]
                p_ub_test = data.loc[msk_test, col_ub]

                # -- Compute log-loss for current epoch
                ll_ab_train.append(log_loss(y_train, p_ab_train, sample_weight=w_train))
                ll_ab_test.append(log_loss(y_test, p_ab_test, sample_weight=w_test))
                ll_ub_train.append(log_loss(y_train, p_ub_train, sample_weight=w_train))
                ll_ub_test.append(log_loss(y_test, p_ub_test, sample_weight=w_test))
                pass

            # Plot log-loss curves
            c = rp.canvas(batch=True)

            # -- Common plotting options
            opts = dict(linewidth=2, legend_option='L')
            c.graph(ll_ab_train, bins=x, linecolor=rp.colours[5], linestyle=1, option='AL', label='AdaBoost', **opts)
            c.graph(ll_ab_test, bins=x, linecolor=rp.colours[5], linestyle=2, option='L', **opts)
            c.graph(ll_ub_train, bins=x, linecolor=rp.colours[1], linestyle=1, option='L', label='uBoost', **opts)
            c.graph(ll_ub_test, bins=x, linecolor=rp.colours[1], linestyle=2, option='L', **opts)

            # -- Decorations
            c.pad()._yaxis().SetNdivisions(505)
            c.xlabel("Training epoch")
            c.ylabel("BDT classifier loss")
            c.xlim(0, len(x))
            c.ylim(0.3, 1.4)
            c.legend(width=0.28)
            c.legend(header='Dataset:',
                     categories=[('Training', {'linestyle': 1}),
                                 ('Testing',  {'linestyle': 2})],
                     width=0.28, ymax=0.69)
            for leg in c.pad()._legends:
                leg.SetFillStyle(0)
                pass
            c.text(["#sqrt{s} = 13 TeV",
                    "#it{W} jet tagging",
                    "Uniforming rate #alpha = {:3.1f}".format(alpha)],
                   qualifier="Simulation Internal")

            # -- Save
            c.save('figures/loss_uboost__alpha{:4.2f}'.format(alpha).replace('.', 'p') + '.pdf')
            pass
        pass
    return
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 100.
    lambda_regs = sorted([100.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass
    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    print "ann_var:", ann_var

    # Tagger feature collection
    #tagger_features = ['NN', ann_var]
    tagger_features = ['NN', ann_var, 'MV2c10', 'XbbScoreHiggs']
    #tagger_features = ['MV2c10']

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model('models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(gmm_dimensions=len(DECORRELATION_VARIABLES),
                                        **cfg['adversary']['model'])
            combined = combined_model(classifier, adversary, **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights('models/adversarial/combined/full/combined_lambda{}.h5'.format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # MV2c10: take the more b-jet-like of the two subjet discriminants
        with Profile("MV2c10"):
            data["MV2c10"] = pd.concat([data["MV2c10_discriminant_1"],
                                        data["MV2c10_discriminant_2"]],
                                       axis=1).min(axis=1)

        # Add MV2 and XbbScore here
        # e.g. min(MV2_sj1, MV2_sj2)

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars + ['mass', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars)

    return 0
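# `add_nn` is imported from run.adversarial.common; for reference, a minimal
# sketch of the assumed behaviour -- evaluate the (possibly re-weighted) Keras
# classifier on the input features and attach the output as a new column. The
# `features` argument and batch size are illustrative assumptions, not the
# actual signature:
def add_nn(data, model, name, features=None):
    # `features` stands in for the training feature list of the classifier
    X = data[features].values
    data[name] = model.predict(X, batch_size=2048).flatten()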
def main(args):

    # Initialising
    # --------------------------------------------------------------------------
    args, cfg = initialise(args)

    # Loading data
    # --------------------------------------------------------------------------
    data, features, _ = load_data(args.input + 'data_1M_10M.h5')
    #data = data.sample(frac=0.5, random_state=32)  # @TEMP
    data = data[data['train'] == 1]

    # Reduce size of data
    drop_features = [feat for feat in list(data)
                     if feat not in features + ['m', 'signal', 'weight_adv']]
    data.drop(drop_features, axis=1, inplace=True)

    cfg['uBoost']['train_features'] = features
    cfg['uBoost']['random_state'] = SEED
    cfg['DecisionTreeClassifier']['random_state'] = SEED

    # Arrays
    X = data
    #print(X.head())
    w = np.array(data['weight_adv']).flatten()
    y = np.array(data['signal']).flatten()

    # Fit uBoost classifier
    # --------------------------------------------------------------------------
    with Profile("Fitting uBoost classifier"):

        # @NOTE: There might be an issue with the sample weights, because the
        #        local efficiencies computed using kNN do not seem to take the
        #        sample weights into account.
        #
        #        See:
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/uboost.py#L247-L248
        #        and
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/metrics_utils.py#L159-L176
        #        with `divided_weights` not set.
        #
        #        `sample_weight` seems to be used only as a starting point for
        #        the boosting, and so is not used for the efficiency calculation.
        #
        #        If this is indeed the case, it would be possible to simply
        #        sample MC events by their weight, and use `sample_weight = 1`
        #        for all samples passed to uBoost.
        #
        # @NOTE: I have gotten less sure of the above, so probably no panic.

        def train_uBoost(X, y, w, cfg, uniforming_rate):
            """
            Train a single uBoostBDT classifier for the given uniforming rate.
            """

            # Create base classifier
            base_tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

            # Update training configuration
            these_cfg = dict(**cfg['uBoost'])
            these_cfg['uniforming_rate'] = uniforming_rate

            # Create uBoost classifier
            uboost = uBoostBDT(base_estimator=base_tree, **these_cfg)

            # Fit uBoost classifier
            uboost.fit(X, y, sample_weight=w)

            return uboost

        #uniforming_rates = [0.0, 0.01, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0]
        uniforming_rates = [0.0, 0.01, 0.1, 0.3, 0.5, 1.0]
        #uniforming_rates = [0.5, 1.0]
        n_jobs = min(7, len(uniforming_rates))

        jobs = [delayed(train_uBoost, check_pickle=False)(X, y, w, cfg, uniforming_rate)
                for uniforming_rate in uniforming_rates]

        result = Parallel(n_jobs=n_jobs, backend="threading")(jobs)
        pass

    # Saving classifiers
    # --------------------------------------------------------------------------
    for uboost, uniforming_rate in zip(result, uniforming_rates):
        with Profile("Saving classifiers"):

            # Ensure model directory exists
            mkdir('models/uboost/')

            suffix_ur = "ur_{:s}".format(("%.2f" % uniforming_rate).replace('.', 'p'))
            suffix_te = "te_{:d}".format(int(cfg['uBoost']['target_efficiency'] * 100))

            # Save uBoost classifier
            with gzip.open('models/uboost/uboost_{}_{}_rel21_fixed_def_cfg_1000boost.pkl.gz'.format(suffix_ur, suffix_te), 'w') as f:
                pickle.dump(uboost, f)
                pass
            pass
        pass

    return 0
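# The gzipped pickles written above can be read back for evaluation; a short
# usage sketch (the concrete file name is just one of the ones produced above,
# and `X` refers to the feature array built in `main`):
import gzip
import pickle

with gzip.open('models/uboost/uboost_ur_0p01_te_92_rel21_fixed_def_cfg_1000boost.pkl.gz', 'r') as f:
    uboost = pickle.load(f)

# Per-event signal probabilities
p_sig = uboost.predict_proba(X)[:, 1]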
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    #data = np.zeros(1, 95213009, 10)
    data, features, _ = load_data('data/djr_LCTopo_2.h5')  # + args.input) #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5')  # + args.input) #, test=True)
    #data = np.concatenate((data1, data2))
    #f1 = h5py.File('data/djr_LCTopo_1.h5', 'r')
    #f2 = h5py.File('data/djr_LCTopo_2.h5', 'r')

    knnCut = 0
    ntrkCut = 50
    emfracCut = 0.65
    scale = 139 * 1000000  # (inverse nanobarn)
    signal_to_plot = 7

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 2 TeV',
        2: 'Model A, m = 1 TeV',
        3: 'Model A, m = 1.5 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    outHistFile = ROOT.TFile.Open(
        "figures/mjjHistograms_kNN{}_eff{}.root".format(knnCut, kNN_eff), "RECREATE")

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[signal_to_plot])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = ['lead_'+base_var, 'sub_'+base_var]
    #kNN_vars = ['lead_'+kNN_var, 'sub_'+kNN_var]

    print data.shape

    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var,
                path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, base_var, kNN_eff, sigModel))
        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)

    """
    base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    kNN_var = [var.replace('jet', 'knn') for var in base_var]

    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        for i in range(len(base_var)):
            add_knn(data, newfeat=kNN_var[i], path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL))
            print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var[i], kNN_eff, MODEL)
    """

    weight = 'weight'  # 'weight_test' / 'weight'
    bins_pt = np.linspace(450, 3500, 40)
    bins_mjj = np.linspace(0, 8000, 80)

    # Useful masks
    msk_bkg = data['signal'] == 0
    if signal_to_plot == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == signal_to_plot
    #msk_weight = data['weight'] < 0.2

    msk_knn = (data['lead_knn_ungrtrk500'] > knnCut) & (data['sub_knn_ungrtrk500'] > knnCut)
    msk_ungr = (data['lead_jet_ungrtrk500'] > ntrkCut) & (data['sub_jet_ungrtrk500'] > ntrkCut)
    msk_emfrac = (data['lead_jet_EMFrac'] < emfracCut) & (data['sub_jet_EMFrac'] < emfracCut)

    msk_knn_1 = (data['lead_knn_ungrtrk500'] > knnCut)
    msk_ungr_1 = (data['lead_jet_ungrtrk500'] > ntrkCut)
    #msk_knn = (data['knn_ungrtrk500'] > knnCut)
    #msk_ungr = (data['jet_ungrtrk500'] > 90.0)

    msk_ntrkBkg = msk_ungr & msk_emfrac & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_ntrkSig = msk_ungr & msk_emfrac & msk_sig  #& msk_pt & msk_m & msk_eta
    msk_knnBkg = msk_knn & msk_bkg
    msk_knnSig = msk_knn & msk_sig

    msk_ntrkBkg1 = msk_ungr_1 & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_ntrkSig1 = msk_ungr_1 & msk_sig  #& msk_pt & msk_m & msk_eta
    msk_knnBkg1 = msk_knn_1 & msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_knnSig1 = msk_knn_1 & msk_sig  #& msk_pt & msk_m & msk_eta

    msk_inclBkg = msk_bkg  #& msk_weight #& msk_pt & msk_m & msk_eta
    msk_inclSig = msk_sig  #& msk_pt & msk_m & msk_eta

    # Mjj dist with cut on ntrk, ungrtrk compared to inclusive selection
    c = rp.canvas(batch=True)

    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values,
                          label="Multijets, Inclusive",
                          normalise=True, linecolor=ROOT.kGreen + 2, linewidth=3)
    hist_knnBkg = c.hist(data.loc[msk_knnBkg, 'dijetmass'].values, bins=bins_mjj,
                         weights=scale * data.loc[msk_knnBkg, weight].values,
                         label="Multijets, n_{{trk}}^{{#epsilon}}>{}".format(knnCut),
                         normalise=True, linecolor=ROOT.kMagenta + 2, linestyle=2, linewidth=3)
    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, n_{{trk}}>{}".format(ntrkCut),
                          normalise=True, linecolor=ROOT.kOrange + 2, linestyle=2, linewidth=3)
    #hist_CRBkg = c.hist(data.loc[msk_CR_bkg, 'dijetmass'].values, bins=bins_mjj, weights=scale*data.loc[msk_CR_bkg, weight].values, label="CR Bkg, C<20", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    #c.save('figures/distributions/mjj_Bkg_CR20.pdf'.format(knnCut))
    #c.save('figures/distributions/mjj_Bkg_CR20.eps'.format(knnCut))
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.pdf'.format(ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_BkgDist_ntrk{}_knn{}_{}.eps'.format(ntrkCut, knnCut, FIT))
    del c

    c = rp.canvas(batch=True)

    hist_Sig = c.hist(data.loc[msk_sig, 'dijetmass'].values, bins=bins_mjj,
                      weights=data.loc[msk_sig, weight].values,
                      label="Model A, m = 2 TeV, inclusive",
                      normalise=True, linecolor=ROOT.kGreen + 2)
    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values,
                         label="Model A, m = 2 TeV, #it{{n}}_{{trk}}^{{#epsilon}}>{}".format(knnCut),
                         normalise=True, linecolor=ROOT.kMagenta + 2, linestyle=2)
    hist_ntrkSig = c.hist(data.loc[msk_ntrkSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_ntrkSig, weight].values,
                          label="Model A, m = 2 TeV, #it{{n}}_{{trk}}>{}".format(ntrkCut),
                          normalise=True, linecolor=ROOT.kOrange + 2, linestyle=2)
    #hist_CRSig = c.hist(data.loc[msk_CR_sig, 'dijetmass'].values, bins=bins_mjj, weights=data.loc[msk_CR_sig, weight].values, label="Sig, CR", normalise=True, linecolor=ROOT.kGray+2, linestyle=2)

    c.legend(width=0.4, xmin=0.5, ymax=0.9)
    c.ylabel("Fraction of jets")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.pdf'.format(ntrkCut, knnCut, FIT))
    c.save('figures/distributions/mjj_SigDist_ntrk{}_knn{}_{}.eps'.format(ntrkCut, knnCut, FIT))
    del c

    c = rp.canvas(batch=True)

    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values,
                         label="Model A, m = 2 TeV, knn_ntrk>{}".format(knnCut),
                         normalise=False, linecolor=ROOT.kBlue + 1, linestyle=1)
    hist_knnBkg = c.hist(data.loc[msk_knnBkg, 'dijetmass'].values, bins=bins_mjj,
                         weights=scale * data.loc[msk_knnBkg, weight].values,
                         label="Multijets, knn_ntrk>{}".format(knnCut),
                         normalise=False, linecolor=ROOT.kMagenta + 2, linestyle=2)
    hist_ntrkBkg = c.hist(data.loc[msk_ntrkBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_ntrkBkg, weight].values,
                          label="Multijets, ntrk>{}".format(ntrkCut),
                          normalise=False, linecolor=ROOT.kOrange + 2, linestyle=2)

    c.legend(width=0.4, xmin=0.3, ymax=0.9)
    c.ylabel("Number of events")
    c.xlabel("m_{jj} [GeV]")
    c.logy()
    #c.ylim(0.00005, 5)
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.pdf'.format(knnCut, FIT))
    c.save('figures/distributions/mjj_Dist_noNorm_knn{}_{}.eps'.format(knnCut, FIT))

    bins_mjj = np.linspace(0, 10000, 50)

    # Unscaled histograms for calculating efficiencies
    hist_inclBkg = c.hist(data.loc[msk_inclBkg, 'dijetmass'].values, bins=bins_mjj,
                          weights=scale * data.loc[msk_inclBkg, weight].values, normalise=False)
    hist_inclSig = c.hist(data.loc[msk_inclSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_inclSig, weight].values, normalise=False)
    hist_ntrkSig = c.hist(data.loc[msk_ntrkSig, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_ntrkSig, weight].values, normalise=False)
    hist_knnSig = c.hist(data.loc[msk_knnSig, 'dijetmass'].values, bins=bins_mjj,
                         weights=data.loc[msk_knnSig, weight].values, normalise=False)
    hist_ntrkSig1 = c.hist(data.loc[msk_ntrkSig1, 'dijetmass'].values, bins=bins_mjj,
                           weights=data.loc[msk_ntrkSig1, weight].values, normalise=False)
    hist_ntrkBkg1 = c.hist(data.loc[msk_ntrkBkg1, 'dijetmass'].values, bins=bins_mjj,
                           weights=data.loc[msk_ntrkBkg1, weight].values, normalise=False)
    hist_knnBkg1 = c.hist(data.loc[msk_knnBkg1, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_knnBkg1, weight].values, normalise=False)
    hist_knnSig1 = c.hist(data.loc[msk_knnSig1, 'dijetmass'].values, bins=bins_mjj,
                          weights=data.loc[msk_knnSig1, weight].values, normalise=False)

    print "Bkg inclusive integral: ", hist_inclBkg.GetEffectiveEntries()
    print "Sig inclusive integral: ", hist_inclSig.GetEffectiveEntries()

    print "Bkg pass kNN eff entries / integral: ", hist_knnBkg.GetEffectiveEntries(), hist_knnBkg.Integral()
    print "Sig pass kNN eff entries / integral: ", hist_knnSig.GetEffectiveEntries(), hist_knnSig.Integral()
    print "Bkg pass ntrk eff entries / integral: ", hist_ntrkBkg.GetEffectiveEntries(), hist_ntrkBkg.Integral()
    print "Sig pass ntrk eff entries / integral: ", hist_ntrkSig.GetEffectiveEntries(), hist_ntrkSig.Integral()

    print "Bkg Eff. knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnBkg.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnSig.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()
    print "Bkg Eff. knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnBkg.Integral() / hist_inclBkg.Integral()
    print "Sig Eff. knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnSig.Integral() / hist_inclSig.Integral()

    print "Bkg Eff. ntrk>{}, eff. entries: ".format(ntrkCut), 100 * hist_ntrkBkg.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. ntrk>{}, eff. entries: ".format(ntrkCut), 100 * hist_ntrkSig.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()  #, hist_ntrkSig.GetEffectiveEntries()

    print "Bkg Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnBkg1.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, eff. entries: ".format(knnCut), 100 * hist_knnSig1.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()
    print "Bkg Eff. 1 jet knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnBkg1.GetEffectiveEntries() / hist_inclBkg.GetEffectiveEntries()
    print "Sig Eff. 1 jet knn_ntrk> {}, integral: ".format(knnCut), 100 * hist_knnSig1.GetEffectiveEntries() / hist_inclSig.GetEffectiveEntries()

    outHistFile.cd()
    hist_knnBkg.SetName("bkg_knn")
    hist_knnSig.SetName("sig_knn")
    hist_knnBkg.Write()
    hist_knnSig.Write()
    outHistFile.Close()

    # Mjj dist for CR compared to inclusive selection
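# The efficiency printouts above all take the same ratio of a "pass" histogram
# to the inclusive one; a small helper capturing that pattern (illustrative
# only, not part of the original script):
def efficiency_pct(h_pass, h_incl, use_integral=False):
    """Selection efficiency in percent from a pair of filled ROOT histograms."""
    num = h_pass.Integral() if use_integral else h_pass.GetEffectiveEntries()
    den = h_incl.Integral() if use_integral else h_incl.GetEffectiveEntries()
    return 100. * num / den if den > 0 else 0.

# e.g.:  print "Bkg Eff. knn_ntrk> {}: ".format(knnCut), efficiency_pct(hist_knnBkg, hist_inclBkg)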
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data[msk_bkg])

    # Add k-NN variable
    knnfeat = 'knn'
    add_knn(data, newfeat=knnfeat, path='models/knn/knn_{}_{}.pkl.gz'.format(VAR, EFF))

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                                  range(vbins + 1),
                                  np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        if sig:
            rgbs = [(247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            rgbs = [(255 / 255., 51 / 255., 4 / 255.),
                    (247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(np.linspace(0, 1, nb_cols - 1, endpoint=True)
                                        * (1. - EFF / 100.) + EFF / 100.))
            pass
        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue, NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1, endpoint=True)
                for var in VARS]
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            msk_pass = data[knnfeat] < 0
            num = data.loc[msk & msk_bin & msk_pass, 'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            effs.append(num / den)
            pass

        # Fill profile
        for i, j in itertools.product(*map(range, shape)):

            # Bin edges in x and y
            edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

            # Masks
            msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                    for dim, var in enumerate(VARS)]
            msk_bin = reduce(lambda x, y: x & y, msks)
            data_ = data[msk & msk_bin]

            # Set non-zero bin content
            if np.sum(msk & msk_bin):
                msk_pass = data_[knnfeat] < 0
                num = data.loc[msk & msk_bin & msk_pass, 'weight_test'].values.sum()
                den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                eff = num / den
                profile.SetBinContent(i + 1, j + 1, eff)
                pass
            pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True) + " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " + latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" % (latex(VAR, ROOT=True), EFF))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"], ATLAS=False)

        # -- Efficiencies
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(xt, yt, "%s%.1f%%" %
                             ("#bar{#varepsilon}^{rel}_{%s} = " % ('sig' if sig else 'bkg')
                              if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV", -4.5, BOUNDS[0].Eval(-4.5) + 30,
                align=21, angle=-37, textsize=13, textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV", -2.5, BOUNDS[1].Eval(-2.5) - 30,
                align=23, angle=-57, textsize=13, textcolor=ROOT.kGray + 3)

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/knn_eff_{}_{:s}_{:.0f}.pdf'.format('sig' if sig else 'bkg', VAR, EFF))
        pass

    return
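# `standardise` is imported from elsewhere in the repo; the kNN classifier was
# fitted on standardised (rho, pt) coordinates, so the evaluation grid above is
# transformed the same way before `knn.predict`. A plausible sketch, assuming
# simple min/max scaling over the AXIS ranges (the actual definition may differ):
def standardise(x, y):
    x = (x - AXIS[VARX][1]) / (AXIS[VARX][2] - AXIS[VARX][1])
    y = (y - AXIS[VARY][1]) / (AXIS[VARY][2] - AXIS[VARY][1])
    return x, y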
def perform_studies(data, args, tagger_features, extracted_features, title=None):
    """
    Method delegating performance studies.
    """
    #masscuts = [True, False]
    masscuts = [False]
    pt_ranges = [None, (200, 500), (500, 1000), (1000, 2000)]
    #pt_ranges = [(1000, 2000)]
    #pt_ranges = [None]

    ## Perform combined robustness study
    #with Profile("Study: Robustness"):
    #    for masscut in masscuts:
    #        studies.robustness_full(data, args, tagger_features, masscut=masscut, title=title)
    #        pass
    #    pass

    ## Perform jet mass distribution comparison study
    #with Profile("Study: Jet mass comparison"):
    #    for pt_range in pt_ranges:
    #        print "pt_range =", pt_range
    #        studies.jetmasscomparison(data, args, tagger_features, pt_range, title=title)
    #        pass

    # Perform summary plot study
    with Profile("Study: Summary plot"):
        scan_features = dict()
        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.summary(data, args, tagger_features, scan_features,
                            masscut=masscut, pt_range=pt_range, title=title)
            pass
        pass

    ## Perform distributions study
    #with Profile("Study: Substructure tagger distributions"):
    #    mass_ranges = np.linspace(50, 300, 5 + 1, endpoint=True)
    #    mass_ranges = [None] + zip(mass_ranges[:-1], mass_ranges[1:])
    #    #for feat, pt_range, mass_range in itertools.product(tagger_features, pt_ranges, mass_ranges):  # tagger_features
    #    for feat, pt_range, mass_range in itertools.product(extracted_features, pt_ranges, mass_ranges):  # tagger_features
    #        studies.distribution(data, args, feat, pt_range, mass_range, title=title)
    #        pass
    #    pass

    ## Perform ROC study
    #with Profile("Study: ROC"):
    #    #masscuts = [(65,105)]
    #    #pt_ranges = [(None), (300,500), (1000,1500)]
    #    for masscut, pt_range in itertools.product(masscuts, pt_ranges):
    #        studies.roc(data, args, tagger_features, masscut=masscut, pt_range=pt_range, title=title)
    #        pass
    #    pass

    ## Perform JSD study
    #with Profile("Study: JSD"):
    #    for pt_range in pt_ranges:
    #        studies.jsd(data, args, tagger_features, pt_range, title=title)
    #        pass
    #    pass

    ## Perform efficiency study
    #with Profile("Study: Efficiency"):
    #    #for feat in tagger_features:
    #    for feat in extracted_features:
    #        studies.efficiency(data, args, feat, title=title)
    #        pass

    return
def perform_optimisation(var, bins, data):
    """
    Optimise the CSS shape parameter and the per-mass-bin omega for `var`.
    """

    # Fill 2D substructure profile
    profile2d = fill_2d_profile(data, var, bins, "m", MASS_BINS)

    # Get 1D profile for lowest mass bin
    profile0 = profile2d.ProjectionY("%s_lowMass" % profile2d.GetName(), 1, 1)
    profile0 = kde(profile0)
    normalise(profile0, density=True)

    # Perform the optimisation
    bestShapeVal = 0
    bestSumChi2 = 1e20
    for shapeVal in SHAPEVAL_RANGE:
        print "Shape value: ", shapeVal
        sumChi2 = 0.

        # Each mass bin needs to be optimised over omega
        for mass in range(len(MASS_BINS) - 1):
            print "  Mass bin: ", mass

            # Get 1D profile for current mass bin
            profile = profile2d.ProjectionY("%s_bin_%i" % (profile2d.GetName(), mass),
                                            mass + 1, mass + 1)

            # Fit current profile to low-mass profile
            chi2, bestOmega, _, _ = fit(profile, shapeVal, profile0, "%.2f" % mass)

            # Accumulate chi2
            sumChi2 += chi2
            pass

        # Update chi2 for current `shapeVal`
        print "-- sumChi2: {} (cp. {})".format(sumChi2, bestSumChi2)
        if sumChi2 < bestSumChi2:
            bestSumChi2 = sumChi2
            bestShapeVal = shapeVal
            pass
        pass

    # Saving CSS transforms
    with Profile("Saving CSS transform"):

        # Ensure model directories exist
        mkdir('models/css/')
        mkdir('figures/css/')  # Added because errors were returned when saving the pdfs

        # Get the optimal, measured `omega`s for each mass-bin
        bestOmegas = list()
        for mass in range(len(MASS_BINS) - 1):
            profile = profile2d.ProjectionY("%s_bin_%i_final" % (profile2d.GetName(), mass),
                                            mass + 1, mass + 1)
            sumChi2, bestOmega, profile_css, profile0rebin = fit(profile, bestShapeVal,
                                                                 profile0, "%.2f" % mass)

            # Test-plot distributions used for fitting!
            # -- Canvas
            c = rp.canvas(batch=True)

            # -- Plot
            profile = kde(profile)
            normalise(profile, density=True)
            lowmassbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(MASS_BINS[0], MASS_BINS[1]).replace('.0', '')
            massbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(MASS_BINS[mass], MASS_BINS[mass + 1]).replace('.0', '')
            c.hist(profile0rebin, label=latex(var, ROOT=True) + ", {}".format(lowmassbin),
                   linecolor=rp.colours[1], fillcolor=rp.colours[1], alpha=0.5,
                   option='HISTL', legend_option='FL')
            c.hist(profile, label=latex(var, ROOT=True) + ", {}".format(massbin),
                   linecolor=rp.colours[4], linestyle=2, option='HISTL')
            c.hist(profile_css, label=latex(var + 'CSS', ROOT=True) + ", {}".format(massbin),
                   linecolor=rp.colours[3], option='HISTL')

            # -- Decorations
            c.xlabel(latex(var, ROOT=True) + ", " + latex(var + 'CSS', ROOT=True))
            c.ylabel("Number of jets p.d.f.")
            c.legend(xmin=0.45, ymax=0.76, width=0.25)
            c.text(["#sqrt{s} = 13 TeV, Multijets", "KDE smoothed"],
                   qualifier=QUALIFIER, ATLAS=False)
            c.pad()._xaxis().SetTitleOffset(1.3)
            c.pad()._yaxis().SetNdivisions(105)
            c.pad()._primitives[-1].Draw('SAME AXIS')
            c.padding(0.50)

            # -- Save
            c.save('figures/css/css_test_{}_mass{}.pdf'.format(var, mass))

            # Store best-fit omega in array
            print mass, bestOmega
            bestOmegas.append(bestOmega)
            pass

        # Fit best omega vs. mass
        x = MASS_BINS[:-1] + 0.5 * np.diff(MASS_BINS)
        y = np.array(bestOmegas)
        h = ROOT.TH1F('hfit', "", len(MASS_BINS) - 1, MASS_BINS)
        root_numpy.array2hist(y, h)
        for ibin in range(1, len(x) + 1):
            h.SetBinError(ibin, 0.02)  # Just some value to ensure equal errors on all points
            pass

        m0 = 0.5 * (MASS_BINS[0] + MASS_BINS[1])
        f = ROOT.TF1("fit", "[0] * (1./{m0} - 1./x) + [1] * TMath::Log(x/{m0})".format(m0=m0), m0, 300)
        f.SetLineColor(rp.colours[4])
        f.SetLineStyle(2)
        h.Fit(f)

        # Write out the optimal configuration for each mass bin
        for mass in range(len(MASS_BINS) - 1):
            profile = profile2d.ProjectionY("%s_bin_%i_final" % (profile2d.GetName(), mass),
                                            mass + 1, mass + 1)
            profile = kde(profile)
            normalise(profile, density=True)
            bestOmegaFitted_ = f.Eval(h.GetBinCenter(mass + 1)) + np.finfo(float).eps
            bestOmegaFitted = max(bestOmegaFitted_, 1E-04)
            #bestOmegaFitted = h.GetBinContent(mass + 1)
            print "bestOmegaFitted[{}] = {} --> {}".format(mass, bestOmegaFitted_, bestOmegaFitted)
            F, Ginv = get_css_fns(bestShapeVal, bestOmegaFitted, profile, "")

            # Save classifier
            saveclf(F, 'models/css/css_%s_F_%i.pkl.gz' % (var, mass))
            saveclf(Ginv, 'models/css/css_%s_Ginv_%i.pkl.gz' % (var, mass))
            pass

        # Plot best omega vs. mass
        # -- Canvas
        c = rp.canvas(batch=True)

        # -- Plots
        #c.hist(bestOmegas, bins=MASS_BINS, linecolor=rp.colours[1])
        c.hist(h, linecolor=rp.colours[1], option='HIST', label="Measured")
        f.Draw('SAME')

        # -- Decorations
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Best-fit #Omega_{D}")
        c.text(["#sqrt{s} = 13 TeV, Multijets",
                "CSS applied to {}".format(latex(var, ROOT=True)),
                "Best-fit #alpha = {:.1f}".format(bestShapeVal)],
               qualifier=QUALIFIER, ATLAS=False)
        c.legend(categories=[('Functional fit', {'linewidth': 2, 'linestyle': 2, 'linecolor': rp.colours[4]})])

        # Save
        c.save('figures/css/cssBestOmega_{}.pdf'.format(var))
        pass

    return 0
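# `get_css_fns` returns, per mass bin, a pair (F, Ginv) that `add_css` in
# run.css.common later applies to the substructure variable. A minimal sketch
# of the assumed CDF-matching application (the callables' interface is an
# assumption):
def apply_css(x, F, Ginv):
    # F: CDF of the observable in the omega-convolved distribution;
    # Ginv: inverse CDF of the low-mass reference. Mapping through both
    # yields the mass-decorrelated ("CSS") value.
    return Ginv(F(x))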
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True, background=True)

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data)

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                                  range(vbins + 1),
                                  np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    return
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data('data/' + args.input, test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    #kNN_var = 'D2-k#minusNN'
    #kNN_var = 'C1_02-knn'
    #base_var = 'sub_jet_ntrk'
    #kNN_var = base_var.replace('sub_jet_', '') + '-knn'
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_var = ['jet_ungrtrk500']
    #kNN_var = [var.replace('jet', 'knn') for var in base_var]
    #base_var = ['ntrk_sum']
    #kNN_var = [var + '-knn' for var in base_var]

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    """
    # -- Adversarial neural network (ANN) scan
    lambda_reg = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)
        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass
    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)
    """

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['lead_jet_C1_02', kNN_var]
    tagger_features = ['lead_' + base_var, 'lead_' + kNN_var,
                       'sub_' + base_var, 'sub_' + kNN_var]
    #tagger_features = base_var + kNN_var

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):
        #for i in range(len(base_var)):
        from run.knn.common import add_knn, MODEL as sigModel, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var,
                path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))

    # Remove unused variables
    used_variables = set(tagger_features + ['lead_jet_m', 'lead_jet_pt', 'dijetmass', 'weight'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features)

    return 0
def main(args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5')  #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5')  #, test=True)
    #data = np.concatenate((data1, data2))

    sigNumber = 0

    sigDict = {
        0: 'All Models',
        1: 'Model A, m = 1 TeV',
        2: 'Model A, m = 1.5 TeV',
        3: 'Model A, m = 2 TeV',
        4: 'Model A, m = 2.5 TeV',
        5: 'Model B, m = 1 TeV',
        6: 'Model B, m = 1.5 TeV',
        7: 'Model B, m = 2 TeV',
        8: 'Model B, m = 2.5 TeV',
        9: 'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
    }

    histstyle[True]['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars = [kNN_var]
    base_vars = ['lead_' + base_var, 'sub_' + base_var]
    kNN_vars = ['lead_' + kNN_var, 'sub_' + kNN_var]

    with Profile("Add variables"):
        from run.knn.common import add_knn, MODEL as sigModel, EFF as kNN_eff
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var,
                path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_' + kNN_var,
                path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)

    # Check variable distributions
    weight = 'weight'  # 'weight_test' / 'weight'
    scale = 139 * 1000000  # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == sigNumber

    knnBins = np.linspace(-100, 200, 75, endpoint=True)

    for var in kNN_vars:

        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)
        c2 = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=knnBins,
                              weights=data.loc[msk_sig, weight].values,
                              normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=knnBins,
                              weights=scale * data.loc[msk_bkg, weight].values,
                              normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=knnBins,
                             weights=scale * data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=knnBins,
                             weights=data.loc[msk_sig, weight].values, normalise=False)
        #h1_CR = c_tmp.hist(data.loc[msk_CR_bkg, var].values, bins=knnBins, weights=scale*data.loc[msk_CR_bkg, weight].values, normalise=False)
        #h2_CR = c_tmp.hist(data.loc[msk_CR_sig, var].values, bins=knnBins, weights=data.loc[msk_CR_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()
        #print "bkg. CR efficiency: ", h1_CR.GetEffectiveEntries() / h1_incl.GetEffectiveEntries()
        #print "sig. CR efficiency: ", h2_CR.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.GetEffectiveEntries()))
        print "Sensitivity with no cut: ", normFactor

        ### Sensitivity ###
        sensitivity = []
        bkg_eff_1jet = []
        i = 0
        for cut in knnBins:
            msk_pass = (data[kNN_vars[0]] > cut) & (data[kNN_vars[1]] > cut)
            msk_pass1 = (data[kNN_vars[0]] > cut)
            #msk_pass = (data[var] > cut)
            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass
            msk_bkg_pass1 = msk_bkg & msk_pass1
            msk_sig_pass1 = msk_sig & msk_pass1

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=knnBins,
                                 weights=scale * data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=knnBins,
                                 weights=data.loc[msk_sig_pass, weight].values, normalise=False)
            h1_pass1 = c_tmp.hist(data.loc[msk_bkg_pass1, var].values, bins=knnBins,
                                  weights=scale * data.loc[msk_bkg_pass1, weight].values, normalise=False)

            if h2_incl.GetEffectiveEntries() > 0:  #and h1_pass.GetEffectiveEntries() > 0:
                sensitivity.append(((h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries())
                                    / (3. / 2 + np.sqrt(h1_pass.GetEffectiveEntries()))) / normFactor)
                #print "bkg. eff. @ ", cut, ": ", h1_pass.GetEffectiveEntries() / h1_incl.GetEffectiveEntries()
                #print "signal eff. @ ", cut, ": ", h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()
                #print "Sensitivity gain @ ", cut, ": ", ((h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()) / (3./2 + np.sqrt(h1_pass.GetEffectiveEntries()))) / normFactor
            else:
                sensitivity.append(0)

            if h1_incl.GetEffectiveEntries() > 0:
                bkg_eff_1jet.append(h1_pass1.GetEffectiveEntries() / h1_incl.GetEffectiveEntries())
            else:
                bkg_eff_1jet.append(0)
            i = i + 1

        #c.pads()[0].ylim(0, 0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100, 200)
        c.pads()[1].ylim(0, 30)
        c.pads()[1].xlim(-100, 200)
        c.pads()[1].graph(sensitivity, bins=knnBins)  #, oob=False)

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff))  #latex(var, ROOT=True))
        c.xlabel("n_{trk}^{#epsilon}")  #latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")  #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})"
        c.pads()[1].text(["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})", ],
                         xmin=0.2, ymax=0.80, ATLAS=False)

        c2.graph(sensitivity, bkg_eff_1jet)
        c2.xlabel("Single jet #varepsilon_B")
        c2.ylabel("Sensitivity gain")
        c2.text(["#epsilon=0.5 %", ], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        c2.save('figures/distributions/sensitivity_1jEfficiency_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        print 'figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff)
        pass

    # Plot also the normal ntrk distribution for cross-check with Roland's result
    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1  # data['sigType'] == sigNumber
    else:
        msk_sig = data['sigType'] == sigNumber  # data['sigType'] == sigNumber
    #msk_weight = data['weight'] < 0.0002
    #msk_bkg = msk_bkg & msk_pt & msk_m & msk_eta
    #msk_sig = msk_sig & msk_pt & msk_m & msk_eta

    baseBins = np.linspace(0, 200, 75, endpoint=True)  #axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)

    for var in base_vars:

        ### Canvas ###
        c = rp.canvas(num_pads=2, batch=True)
        c.pads()[0].logy()
        c_tmp = rp.canvas(batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=baseBins,
                              weights=data.loc[msk_sig, weight].values,
                              normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=baseBins,
                              weights=scale * data.loc[msk_bkg, weight].values,
                              normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=baseBins,
                             weights=scale * data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=baseBins,
                             weights=data.loc[msk_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.Integral()))
        #print "Sensitivity with no cut: ", normFactor

        ### Sensitivity ###
        sensitivity = []
        i = 0
        for cut in baseBins:
            #print cut
            msk_pass = (data[base_vars[0]] > cut) & (data[base_vars[1]] > cut)
            #msk_pass = data[var] > cut
            msk_bkg_pass = msk_bkg & msk_pass
            msk_sig_pass = msk_sig & msk_pass

            h1_pass = c_tmp.hist(data.loc[msk_bkg_pass, var].values, bins=baseBins,
                                 weights=scale * data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass = c_tmp.hist(data.loc[msk_sig_pass, var].values, bins=baseBins,
                                 weights=data.loc[msk_sig_pass, weight].values, normalise=False)

            if h2_incl.Integral() > 0:  #and h1_pass.Integral() > 0:
                sensitivity.append((h2_pass.Integral() / h2_incl.Integral())
                                   / (3. / 2. + np.sqrt(h1_pass.Integral())) / normFactor)
                #print "signal eff. at ", cut, ": ", (h2_pass.Integral() / h2_incl.Integral())
                #print "bkg eff. at ", cut, ": ", (h1_pass.Integral() / h1_incl.Integral())
                #print "sensitivity gain at ", cut, ": ", (h2_pass.Integral() / h2_incl.Integral()) / (3./2. + np.sqrt(h1_pass.Integral())) / normFactor
            else:
                sensitivity.append(0)
            i = i + 1

        c.pads()[1].ylim(0, 80)
        c.pads()[1].xlim(0, 200)
        c.pads()[1].graph(sensitivity, bins=baseBins)  #, oob=False)

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.xlabel("n_{trk}")  #latex(var, ROOT=True))
        c.pads()[1].ylabel("Sensitivity gain")  #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})"
        c.pads()[1].text(["Sensitivity = #epsilon_{S}/(#frac{3}{2} + #sqrt{B})", ],
                         xmin=0.2, ymax=0.80, ATLAS=False)

        ### Save ###
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        pass
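# Both scan loops above compute the same figure of merit,
# sensitivity = eps_S / (3/2 + sqrt(B)), normalised to the no-cut value; a
# helper making that explicit (illustrative only, not part of the original
# script):
def sensitivity_gain(n_sig_pass, n_sig_incl, n_bkg_pass, normFactor):
    if n_sig_incl <= 0:
        return 0.
    eps_sig = n_sig_pass / n_sig_incl
    return (eps_sig / (3. / 2 + np.sqrt(n_bkg_pass))) / normFactor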
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    #initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    #initialise_config(args, cfg)

    # Keras import(s)
    #import keras.backend as K
    #from keras.models import load_model

    # Project import(s)
    #from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    #data, features, _ = load_data(args.input + 'data.h5', test=True)
    data, features, _ = load_data(args.input + 'data.h5', test_full_signal=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    kNN_var_N2 = 'N_{2}-k#minusNN'
    kNN_var_tau21 = 'tau_{21}-k#minusNN'

    def meaningful_digits(number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    #lambda_reg = 10.
    #lambda_regs = sorted([1., 3., 10.])
    #ann_vars = list()
    #lambda_strs = list()
    #for lambda_reg_ in lambda_regs:
    #    lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
    #    lambda_strs.append(lambda_str)
    #    ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
    #    ann_vars.append(ann_var_)
    #    pass
    #ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    #uboost_eff = 92
    #uboost_ur = 0.3
    #uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    #uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    #uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    #uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT',
                       'decDeepWvsQCD', 'decDeepWvsQCDkNN',
                       'decDeepWvsQCD', 'decDeepWvsQCDCSS']
    title = "decDeep"
    tagger_features = ['DeepWvsQCD', 'DeepWvsQCDDDT',
                       'DeepWvsQCD', 'DeepWvsQCDkNN',
                       'DeepWvsQCD', 'DeepWvsQCDCSS']
    title = "Deep"

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        ## Tau21DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        ## N2DDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        # DeepQvsQCDDDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        ## Tau21-kNN
        #from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_tau21)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## N2-kNN
        #from run.knn.common import add_knn, VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import add_knn, VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # DeepWvsQCD-kNN
        from run.knn.common import add_knn, VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var_N2)
        add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## Tau21-CSS
        #from run.css.common import add_css
        #add_css("tau21", data)

        ## N2-CSS
        #from run.css.common import add_css
        #add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #from run.css.common import add_css
        #add_css("decDeepWvsQCD", data)

        # DeepWvsQCD-CSS
        from run.css.common import add_css
        add_css("DeepWvsQCD", data)
        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])  # need to put 'npv' back in for robustness study
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    #perform_studies(data, args, tagger_features, ann_vars, uboost_vars)
    perform_studies(data, args, tagger_features, title=title)

    return 0
def perform_studies(data, args, tagger_features, title=None):
    """
    Method delegating performance studies.
    """
    #masscuts = [True, False]
    masscuts = [False]
    pt_ranges = [None, (200, 500), (500, 1000), (1000, 2000)]

    ## Perform combined robustness study
    #with Profile("Study: Robustness"):
    #    for masscut in masscuts:
    #        studies.robustness_full(data, args, tagger_features, masscut=masscut, title=title)
    #        pass
    #    pass

    ## Perform jet mass distribution comparison study
    #with Profile("Study: Jet mass comparison"):
    #    for pt_range in pt_ranges:
    #        print "pt_range =", pt_range
    #        studies.jetmasscomparison(data, args, tagger_features, pt_range, title=title)
    #        pass

    # Perform summary plot study
    with Profile("Study: Summary plot"):
        #regex_nn = re.compile('\#lambda=[\d\.]+')
        #regex_ub = re.compile('\#alpha=[\d\.]+')
        #scan_features = {'NN': map(lambda feat: (feat, regex_nn.search(feat).group(0)), ann_vars),
        #                 'Adaboost': map(lambda feat: (feat, regex_ub.search(feat).group(0)), uboost_vars)
        #                 }
        scan_features = dict()

        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.summary(data, args, tagger_features, scan_features,
                            masscut=masscut, pt_range=pt_range, title=title)
            pass
        pass

    ## Perform distributions study
    #with Profile("Study: Substructure tagger distributions"):
    #    mass_ranges = np.linspace(50, 300, 5 + 1, endpoint=True)
    #    mass_ranges = [None] + zip(mass_ranges[:-1], mass_ranges[1:])
    #    for feat, pt_range, mass_range in itertools.product(tagger_features, pt_ranges, mass_ranges):
    #        studies.distribution(data, args, feat, pt_range, mass_range, title=title)
    #        pass
    #    pass

    # Perform ROC study
    with Profile("Study: ROC"):
        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.roc(data, args, tagger_features, masscut=masscut, pt_range=pt_range, title=title)
            pass
        pass
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test_full_signal=True)
    #data, features, _ = load_data(args.input + 'data.h5', train_full_signal=True)  # Faster for quick checks; don't use for actual comparisons

    # Common definitions
    # --------------------------------------------------------------------------
    def meaningful_digits (number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # Tagger feature collection
    #tagger_features = ['Tau21','Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]
    #tagger_features = ['tau21', 'tau21DDT', 'tau21', 'tau21kNN', 'tau21', 'tau21CSS', 'N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="tau21_vs_N2_B1"
    #tagger_features = ['N2_B1', 'N2_B1DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="N2_B1"
    #tagger_features = ['tau21', 'tau21DDT', 'N2_B1', 'N2_B1kNN', 'N2_B1', 'N2_B1CSS']; title="ATLAS"
    #tagger_features = ['decDeepWvsQCD', 'decDeepWvsQCDDDT', 'decDeepWvsQCD', 'decDeepWvsQCDkNN', 'decDeepWvsQCD', 'decDeepWvsQCDCSS']; title="decDeep"
    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN','CSS']}; title='ATLAS2'
    #tagger_features = {'tau21':['','DDT'], 'N2_B1':['','kNN',], 'decDeepWvsQCD':['','kNN'], 'DeepWvsQCD':['','kNN']}; title='Deep_vs_Analytic'
    #tagger_features = {'tau21':[''], 'N2_B1':[''], 'decDeepWvsQCD':[''], 'DeepWvsQCD':['']}; title='Deep_Check2'
    tagger_features = {
        'tau21': ['', 'DDT', 'kNN', 'CSS'],
        'N2_B1': ['', 'DDT', 'kNN', 'CSS'],
        }
    title = 'Corrected_Full_Analytic'
    #tagger_features = {'tau21':['', 'DDT', 'kNN', 'CSS'], 'N2_B1':['', 'DDT', 'kNN','CSS']}; title='Full_Analytic_vs_Atlas'

    # Flatten the {base variable: [suffixes]} mapping into a list of column names
    extracted_features = []
    for basevar, suffixes in tagger_features.items():
        for suffix in suffixes:
            extracted_features.append(basevar + suffix)
            pass
        pass

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):
        # The selection of which variables to add could also be automated from the `tagger_features` mapping.

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, feat='tau21', path='models/ddt/ddt_tau21.pkl.gz')

        # N2DDT
        add_ddt(data, feat='N2_B1', path='models/ddt/ddt_N2_B1.pkl.gz')

        ## decDeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='decDeepWvsQCD', path='models/ddt/ddt_decDeepWvsQCD.pkl.gz')

        ## DeepQvsQCDDDT
        #from run.ddt.common import add_ddt
        #add_ddt(data, feat='DeepWvsQCD', path='models/ddt/ddt_DeepWvsQCD.pkl.gz')

        # Tau21-kNN
        from run.knn.common import add_knn, VAR_TAU21 as kNN_basevar, TAU21_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'tau_{21}-k#minusNN')
        add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # N2-kNN
        from run.knn.common import VAR_N2 as kNN_basevar, N2_EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'N_{2}-kNN')
        add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## decDeepWvsQCD-kNN
        #from run.knn.common import VAR_DECDEEP as kNN_basevar, DECDEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'decDeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        ## DeepWvsQCD-kNN
        #from run.knn.common import VAR_DEEP as kNN_basevar, DEEP_EFF as kNN_eff
        #print "k-NN base variable: {} (cp. {})".format(kNN_basevar, 'DeepWvsQCD')
        #add_knn(data, feat=kNN_basevar, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # Tau21-CSS
        from run.css.common import add_css
        add_css("tau21", data)

        # N2-CSS
        add_css("N2_B1", data)

        ## decDeepWvsQCD-CSS
        #add_css("decDeepWvsQCD", data)

        ## DeepWvsQCD-CSS
        #add_css("DeepWvsQCD", data)
        pass

    # Remove unused variables
    #used_variables = set(tagger_features + ['m', 'pt', 'weight_test', 'npv'])
    used_variables   = set(extracted_features + ['m', 'pt', 'weight_test', 'npv'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # `drop` returns a copy unless `inplace=True`
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, extracted_features, title=title)

    return 0
def perform_studies (data, args, tagger_features, ann_vars, uboost_vars):
    """
    Method delegating performance studies.
    """
    masscuts  = [True, False]
    pt_ranges = [None, (200, 500), (500, 1000)]

    # Perform combined robustness study
    with Profile("Study: Robustness"):
        for masscut in masscuts:
            studies.robustness_full(data, args, tagger_features, masscut=masscut)
            pass
        pass

    # Perform jet mass distribution comparison study
    with Profile("Study: Jet mass comparison"):
        studies.jetmasscomparison(data, args, tagger_features)
        pass

    # Perform summary plot study
    with Profile("Study: Summary plot"):
        regex_nn = re.compile('\#lambda=[\d\.]+')
        regex_ub = re.compile('\#alpha=[\d\.]+')

        scan_features = {'NN':       map(lambda feat: (feat, regex_nn.search(feat).group(0)), ann_vars),
                         'Adaboost': map(lambda feat: (feat, regex_ub.search(feat).group(0)), uboost_vars)
                         }

        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.summary(data, args, tagger_features, scan_features,
                            masscut=masscut, pt_range=pt_range)
            pass
        pass

    # Perform distributions study
    with Profile("Study: Substructure tagger distributions"):
        mass_ranges = np.linspace(50, 300, 5 + 1, endpoint=True)
        mass_ranges = [None] + zip(mass_ranges[:-1], mass_ranges[1:])
        for feat, pt_range, mass_range in itertools.product(tagger_features, pt_ranges, mass_ranges):
            studies.distribution(data, args, feat, pt_range, mass_range)
            pass
        pass

    # Perform ROC study
    with Profile("Study: ROC"):
        for masscut, pt_range in itertools.product(masscuts, pt_ranges):
            studies.roc(data, args, tagger_features, masscut=masscut, pt_range=pt_range)
            pass
        pass

    # Perform JSD study
    with Profile("Study: JSD"):
        studies.jsd(data, args, tagger_features)
        pass

    # Perform efficiency study
    with Profile("Study: Efficiency"):
        for feat in tagger_features:
            studies.efficiency(data, args, feat)
            pass
        pass

    return
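# Illustration (not part of the pipeline): the scan-feature regexes above pull
# the regularisation label out of the variable names constructed in main(),
# e.g. 'ANN(#lambda=10)' and 'uBoost(#alpha=0.3)'.
#
#   >>> import re
#   >>> re.compile('\#lambda=[\d\.]+').search('ANN(#lambda=10)').group(0)
#   '#lambda=10'
#   >>> re.compile('\#alpha=[\d\.]+').search('uBoost(#alpha=0.3)').group(0)
#   '#alpha=0.3'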
def main (args):
    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input)  #, test=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####    # Initialise Keras backend
    ####    initialise_backend(args)
    ####
    ####    # Neural network-specific initialisation of the configuration dict
    ####    initialise_config(args, cfg)
    ####
    ####    # Keras import(s)
    ####    from keras.models import load_model
    ####
    ####    # NN
    ####    from run.adversarial.common import add_nn
    ####    with Profile("NN"):
    ####        classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####        add_nn(data, classifier, 'NN')
    ####        pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, (x, percs, err) = fill_profile_1D(data[msk_bkg])
    weights = 1 / err  # Weight each point by its inverse uncertainty in the fit below

    # Add k-NN variable
    knnfeat = 'knn'
    orgfeat = VAR
    add_knn(data, newfeat=knnfeat, path='models/knn/{}_{}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    # Loading k-NN classifier
    knn = loadclf('models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL))

    X = x.reshape(-1, 1)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8

        # Short-hands
        vbins, vmin, vmax = AXIS[VARX]

        # Re-binned bin edges  @TODO: Make standardised right away?
        # For a uniform axis, interpolating the original edges is equivalent to a finer linspace:
        # edges = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
        #                   range(vbins + 1),
        #                   np.linspace(vmin, vmax, vbins + 1, endpoint=True))
        fineBins = np.linspace(vmin, vmax, vbins * rebin + 1, endpoint=True)
        orgBins  = np.linspace(vmin, vmax, vbins + 1,         endpoint=True)

        # Re-binned bin centres
        fineCentres = fineBins[:-1] + 0.5 * np.diff(fineBins)
        orgCentres  = orgBins[:-1]  + 0.5 * np.diff(orgBins)

        # Get predictions evaluated at re-binned bin centres
        if 'erf' in FIT:
            fit = func(fineCentres, knn[0], knn[1], knn[2])
            print "Check: ", func([1500, 2000], knn[0], knn[1], knn[2])
        else:
            fit = knn.predict(fineCentres.reshape(-1, 1))
            pass

        # Fill ROOT "profile"
        profile_fit = ROOT.TH1F('profile_fit', "", len(fineBins) - 1, fineBins.flatten('C'))
        root_numpy.array2hist(fit, profile_fit)

        # Smooth parametrisation: weighted quadratic fit to the measured percentiles
        poly   = PolynomialFeatures(degree=2)   # Design matrix columns: [1, x, x^2]
        X_poly = poly.fit_transform(X)
        reg    = LinearRegression(fit_intercept=False)  # The bias column supplies the intercept
        reg.fit(X_poly, percs, weights)         # Third positional argument is `sample_weight`
        score     = round(reg.score(X_poly, percs), 4)
        coef      = reg.coef_
        intercept = reg.intercept_
        print "COEFFICIENTS: ", coef, intercept

        TCoef   = ROOT.TVector3(coef[0], coef[1], coef[2])
        outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_{}.root".format(FIT, EFF, MIN_STAT, MODEL), "RECREATE")
        outFile.cd()
        TCoef.Write()
        profile_fit.SetName("kNNfit")
        profile_fit.Write()
        outFile.Close()

        #profile_meas2 = ROOT.TH1F('profile_meas', "", len(x) - 1, x.flatten('C'))
        #root_numpy.array2hist(percs, profile_meas2)
        profile_meas2 = ROOT.TGraph(len(x), x, percs)
        pass

    # Plotting
    with Profile("Plotting"):
        # Plot
        plot(profile_meas2, profile_fit)
        pass

    # Plotting local selection efficiencies for the kNN-corrected variable (cut: knnfeat > 0)
    # -- Compute signal efficiency
    # MC weights are scaled with lumi. This is just for better comparison
    #if INPUT == "mc":
    #    data.loc[:, 'TotalEventWeight'] /= 139000000.

    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Define arrays
        shape = AXIS[VARX][0]
        bins  = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0] + 1, endpoint=True)
        #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True)
        #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000])
        print "Bin edges: ", bins
        #x, y = (np.zeros(shape) for _ in range(2))

        # Create `profile` histograms (distinct names, to avoid ROOT replacing same-named objects)
        profile_knn = ROOT.TH1F('profile_knn', "", len(bins) - 1, bins)
        profile_org = ROOT.TH1F('profile_org', "", len(bins) - 1, bins)

        # Compute inclusive efficiency in bins of `VARX`
        effs = list()
        for i in range(shape):
            msk_bin      = (data[VARX] > bins[i]) & (data[VARX] <= bins[i + 1])
            msk_pass     = data[knnfeat] > 0
            msk_pass_org = data[orgfeat] > 70
            num     = data.loc[msk & msk_bin & msk_pass,     'TotalEventWeight'].values.sum()
            num_org = data.loc[msk & msk_bin & msk_pass_org, 'TotalEventWeight'].values.sum()
            den     = data.loc[msk & msk_bin,                'TotalEventWeight'].values.sum()
            if den > 0:
                eff     = num     / den * 100.
                eff_org = num_org / den * 100.
                profile_knn.SetBinContent(i + 1, eff)
                profile_org.SetBinContent(i + 1, eff_org)
                effs.append(eff)
                pass
            #else:
            #    print i, "Density = 0"
            pass

        c = rp.canvas(batch=True)

        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.10)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        # (Axis styling goes on `profile_org`, which is drawn first and owns the frame.)
        profile_knn.SetLineColor(rp.colours[1])
        profile_org.SetLineColor(rp.colours[2])
        profile_knn.SetMarkerStyle(24)
        profile_org.GetXaxis().SetTitle("#it{m}_{jj} [GeV]")  #latex(VARX, ROOT=True) + " [GeV]"
        #profile_org.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True))  # + " = log(m^{2}/p_{T}^{2})"
        profile_org.GetYaxis().SetTitle("Selection efficiency (%)")  # for #it{n}_{trk}^{#varepsilon=%s%%} > 0" % (EFF)
        profile_org.GetYaxis().SetNdivisions(505)
        #profile_org.GetXaxis().SetNdivisions(505)
        profile_org.GetXaxis().SetTitleOffset(1.4)
        profile_org.GetYaxis().SetTitleOffset(1.8)
        profile_knn.GetXaxis().SetRangeUser(*XRANGE)
        profile_org.GetXaxis().SetRangeUser(*XRANGE)

        yrange = (0., EFF * 3)  # 2.0 percent
        if yrange:
            profile_knn.GetYaxis().SetRangeUser(*yrange)
            profile_org.GetYaxis().SetRangeUser(*yrange)
            pass

        # Draw
        profile_org.Draw()
        profile_knn.Draw("same")

        # Legend (drawn after the histograms, so it is painted on top)
        leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)
        leg.AddEntry(profile_knn, "#it{n}_{trk}^{#varepsilon=%s%%} > 0" % (EFF), "l")
        leg.AddEntry(profile_org, "#it{n}_{trk} > 70", "l")
        leg.Draw()

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.pdf'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL + INPUT, MIN_STAT))
        #c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.png'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL, MIN_STAT))
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.eps'.format(FIT, 'sig' if sig else 'bkg', VAR, EFF, MODEL + INPUT, MIN_STAT))
        del c
        pass

    return
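# Minimal readback sketch (not part of the original script): the block above
# writes the quadratic-fit coefficients as an unnamed TVector3 -- which ROOT
# stores under the default key 'TVector3' (an assumption worth checking with
# TFile::ls()) -- plus the fine-binned fit histogram under the key 'kNNfit'.
# Something like this could re-evaluate the parametrised percentile,
# percs(x) ~ c0 + c1*x + c2*x^2, from the saved file.
import ROOT

def read_knn_fit (path):
    """Return (fit histogram, callable percentile parametrisation)."""
    f = ROOT.TFile.Open(path, "READ")
    hist = f.Get("kNNfit")
    hist.SetDirectory(0)          # Detach, so the histogram survives f.Close()
    coef = f.Get("TVector3")      # Assumed default key name for the unnamed TVector3
    c0, c1, c2 = coef.X(), coef.Y(), coef.Z()
    f.Close()
    return hist, lambda v: c0 + c1 * v + c2 * v * v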
def main(args):
    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
    kNN_var = 'D2-k#minusNN'

    def meaningful_digits (number):
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg  = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars    = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur  = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var  =  'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = ['uBoost(#alpha={:s})'.format(meaningful_digits(ur)) for ur in uboost_urs]
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(uboost_eff)

    # Tagger feature collection
    tagger_features = ['Tau21', 'Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var, 'Adaboost', uboost_var]

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, path='models/ddt/ddt.pkl.gz')

        # D2-kNN
        from run.knn.common import add_knn, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}.pkl.gz'.format(kNN_basevar, kNN_eff))

        # D2-CSS
        from run.css.common import add_css
        add_css("D2", data)

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model('models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(gmm_dimensions=len(DECORRELATION_VARIABLES),
                                        **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights('models/adversarial/combined/full/combined_lambda{}.h5'.format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # Adaboost/uBoost
        with Profile("Adaboost/uBoost"):
            from run.uboost.common import add_bdt
            for var, ur in zip(uboost_vars, uboost_urs):
                var  = ('Adaboost' if ur == 0 else var)
                path = 'models/uboost/' + uboost_pattern.format(ur).replace('.', 'p') + '.pkl.gz'
                print "== Loading model for {}".format(var)
                add_bdt(data, var, path)
                pass

            # Remove `Adaboost` from scan list
            uboost_vars.pop(0)
            pass
        pass

    # Remove unused variables
    used_variables   = set(tagger_features + ann_vars + uboost_vars + ['m', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)  # `drop` returns a copy unless `inplace=True`
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars, uboost_vars)

    return 0
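# Worked example (illustrative only) of how the uBoost model paths above are
# resolved: the uniforming rate is formatted to two decimals and dots are
# swapped for 'p', matching the training-job naming convention.
#
#   >>> uboost_pattern = 'uboost_ur_{:4.2f}_te_92_rel21_fixed'
#   >>> 'models/uboost/' + uboost_pattern.format(0.3).replace('.', 'p') + '.pkl.gz'
#   'models/uboost/uboost_ur_0p30_te_92_rel21_fixed.pkl.gz'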
def test (data, variable, bg_eff, signal_above=False):
    # Shout out to Cynthia Brewer and Mark Harrower
    # [http://colorbrewer2.org]. Palette is colorblind-safe.
    rgbs = [(247 / 255., 251 / 255., 255 / 255.), (222 / 255., 235 / 255., 247 / 255.),
            (198 / 255., 219 / 255., 239 / 255.), (158 / 255., 202 / 255., 225 / 255.),
            (107 / 255., 174 / 255., 214 / 255.), ( 66 / 255., 146 / 255., 198 / 255.),
            ( 33 / 255., 113 / 255., 181 / 255.), (  8 / 255.,  81 / 255., 156 / 255.),
            (  8 / 255.,  48 / 255., 107 / 255.)]

    red, green, blue = map(np.array, zip(*rgbs))
    nb_cols = len(rgbs)
    stops = np.linspace(0, 1, nb_cols, endpoint=True)
    ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue, NB_CONTOUR)

    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # Fill measured profile
    with Profile("filling profile"):
        profile_meas, _ = fill_profile(data[msk_bkg], variable, bg_eff, signal_above=signal_above)
        pass

    # Add k-NN variable
    with Profile("adding variable"):
        knnfeat = 'knn'
        #add_knn(data, feat=variable, newfeat=knnfeat, path='knn_fitter/models/knn_{}_{}.pkl.gz'.format(variable, bg_eff))
        add_knn(data, feat=variable, newfeat=knnfeat, path=args.output + '/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))
        pass

    # Loading k-NN classifier
    with Profile("loading model"):
        #knn = loadclf('knn_fitter/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))
        knn = loadclf(args.output + '/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff))
        pass

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges
            edges[ax] = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                                  range(vbins + 1),
                                  np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    for fit in [False, True]:

        # Select correct profile
        profile = profile_fit if fit else profile_meas

        # Plot
        plot(profile, fit, variable, bg_eff)
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):
        if sig:
            print "working on signal"
        else:
            print "working on bg"
            pass

        if sig:
            rgbs = [(247 / 255., 251 / 255., 255 / 255.), (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.), (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.), ( 66 / 255., 146 / 255., 198 / 255.),
                    ( 33 / 255., 113 / 255., 181 / 255.), (  8 / 255.,  81 / 255., 156 / 255.),
                    (  8 / 255.,  48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            rgbs = [(255 / 255.,  51 / 255.,   4 / 255.), (247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.), (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.), (107 / 255., 174 / 255., 214 / 255.),
                    ( 66 / 255., 146 / 255., 198 / 255.), ( 33 / 255., 113 / 255., 181 / 255.),
                    (  8 / 255.,  81 / 255., 156 / 255.), (  8 / 255.,  48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(np.linspace(0, 1, nb_cols - 1, endpoint=True) * (1. - bg_eff / 100.) + bg_eff / 100.))
            pass
        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue, NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1, endpoint=True) for var in VARS]
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            if signal_above:
                msk_pass = data[knnfeat] > 0  # ensure correct cut direction
            else:
                msk_pass = data[knnfeat] < 0
                pass
            num_msk = msk * msk_bin * msk_pass
            num = data.loc[num_msk, 'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            effs.append(num / den)
            pass

        # Fill profile
        with Profile("Fill profile"):
            for i, j in itertools.product(*map(range, shape)):
                #print "Fill profile - (i, j) = ({}, {})".format(i, j)

                # Bin edges in x and y
                edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

                # Masks
                msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1]) for dim, var in enumerate(VARS)]
                msk_bin = reduce(lambda x, y: x & y, msks)

                # Set non-zero bin content
                if np.sum(msk & msk_bin):
                    if signal_above:
                        msk_pass = data[knnfeat] > 0  # ensure correct cut direction
                    else:
                        msk_pass = data[knnfeat] < 0
                        pass
                    num_msk = msk * msk_bin * msk_pass
                    num = data.loc[num_msk, 'weight_test'].values.sum()
                    den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                    eff = num / den
                    profile.SetBinContent(i + 1, j + 1, eff)
                    pass
                pass
            pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " + latex(VARX, ROOT=True) + " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " + latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" % (latex(variable, ROOT=True), bg_eff))
        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)

        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass

        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15, ATLAS=False)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"], ATLAS=False)

        # -- Efficiencies
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(xt, yt, "%s%.1f%%" % ("#bar{#varepsilon}^{rel}_{%s} = " % ('sig' if sig else 'bkg') if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",  -4.5, BOUNDS[0].Eval(-4.5) + 30, align=21, angle=-37, textsize=13, textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV", -2.5, BOUNDS[1].Eval(-2.5) - 30, align=23, angle=-57, textsize=13, textcolor=ROOT.kGray + 3)

        # Save
        mkdir('knn_fitter/figures/')
        c.save('knn_fitter/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format('sig' if sig else 'bkg', variable, bg_eff))
        mkdir(args.output + '/figures/')
        c.save(args.output + '/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format('sig' if sig else 'bkg', variable, bg_eff))
        pass

    return
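# Sanity-check sketch (illustrative only): for the uniform axes used here, the
# np.interp re-binning in `test` above reduces to a plain np.linspace with
# rebin-times finer spacing -- the same short-cut taken in the 1D fitting
# script earlier in this file collection.
import numpy as np

vbins, vmin, vmax, rebin = 10, 0., 100., 8
edges_interp = np.interp(np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                         range(vbins + 1),
                         np.linspace(vmin, vmax, vbins + 1, endpoint=True))
edges_direct = np.linspace(vmin, vmax, vbins * rebin + 1, endpoint=True)
assert np.allclose(edges_interp, edges_direct)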
def main ():
    # For reproducibility
    np.random.seed(21)

    # Parse command-line argument
    args = parser.parse_args()

    # Modify directory name to conform to convention
    if not args.dir.endswith('/'):
        args.dir += '/'
        pass

    print "Reading, reweighting, and splitting files in:\n  {}".format(args.dir)
    #paths = sorted(glob.glob(args.dir + '*/*_slim.h5'))
    paths = sorted(glob.glob("./extractedHbbTopDatasets/*.h5"))
    print "Found {} files.".format(len(paths))

    # Reading input HDF5 file(s)
    data = None
    with Profile("Reading input HDF5 file(s)"):

        # Run batched conversion in parallel
        queue = multiprocessing.Queue()
        parts = run_batched(FileLoader, list(enumerate(paths)), queue=queue, max_processes=args.max_processes)

        # Stack parts in sorted order, for reproducibility
        data = np.lib.recfunctions.stack_arrays(zip(*sorted(parts, key=lambda t: t[0]))[1], autoconvert=True, usemask=False)
        #data = np.concatenate(zip(*sorted(parts, key=lambda t: t[0]))[1])
        pass

    print "Found {} samples.".format(data.shape[0])

    # Subsample
    with Profile("Subsample"):
        for sig in [0, 1]:

            # Select samples belonging to current category
            if sig == 0:
                msk = (data['signal'] == 0) & (data["dsid"] > 360000)
            else:
                msk = (data["signal"] == 1)
                pass

            # Store reference of samples belonging to other category
            other = np.array(~msk).astype(bool)

            # Subsample current category
            num_sample = int((args.train + args.test) * 1E+06)
            if num_sample <= msk.sum():
                idx = np.random.choice(np.where(msk)[0], num_sample, replace=False)
                sample = np.zeros_like(msk).astype(bool)
                sample[idx] = True
            else:
                print "[WARNING] Requested {:.1e} samples, but only {:.1e} are available in current mask. Using all available samples.".format(num_sample, msk.sum())
                sample = np.ones_like(msk).astype(bool)
                pass

            # Select subsample, and all samples from other categories
            data = data[sample | other]
            pass
        pass

    # Re-weighting
    with Profile("Re-weighting"):

        # Add new data columns
        data = append_fields(data, 'weight_train', np.ones_like(data['weight_test']))
        data = append_fields(data, 'weight_adv',   np.ones_like(data['weight_test']))

        # Reweight signal and background separately
        for sig in [0, 1]:

            # Prepare data arrays
            msk = data['signal'] == sig

            # Flat pT
            # ------------------------------------------------------------------
            original = data['pt'][msk]
            xmin, xmax = original.min(), original.max()
            target = np.random.rand(original.size) * (xmax - xmin) + xmin

            # Fit bins-reweighter
            reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            reweighter.fit(original, target=target)

            # Predict new, flat-pT weight
            data['weight_train'][msk] = reweighter.predict_weights(original)

            # (Flat-pT, physical-m) reweighted
            # ------------------------------------------------------------------
            original        = data['pt'][msk]
            original_weight = data['weight_test'][msk]
            ptmin, ptmax = data['pt'].min(), data['pt'].max()
            target = np.random.rand(msk.sum()) * (ptmax - ptmin) + ptmin

            # Fit bins-reweighter
            reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            reweighter.fit(original, original_weight=original_weight, target=target)

            # Compute new weights
            data['weight_adv'][msk] = reweighter.predict_weights(original, original_weight=original_weight)

            # Standardise weight variables
            # ------------------------------------------------------------------
            weight_variables = filter(lambda name: name.startswith('weight_'), data.dtype.names)
            for var in weight_variables:
                print "  Ensuring unit mean for {}".format(var)
                data[var][msk] /= data[var][msk].mean()
                pass
            pass
        pass

    # Train/test split
    with Profile("Performing train/test split"):
        msk_sig = data['signal'] == 1
        num_sig =   msk_sig .sum()
        num_bkg = (~msk_sig).sum()
        num_train = int(args.train * 1E+06)
        print "Found {:.1e} signal and {:.1e} background samples.".format(num_sig, num_bkg)
        print "Using {:.1e} samples for training for each class, leaving {:.1e} signal and {:.1e} background samples for testing.".format(num_train, num_sig - num_train, num_bkg - num_train)

        idx_sig = np.where( msk_sig)[0]
        idx_bkg = np.where(~msk_sig)[0]
        idx_sig_train = np.random.choice(idx_sig, num_train, replace=False)
        idx_bkg_train = np.random.choice(idx_bkg, num_train, replace=False)

        data = append_fields(data, 'train', np.zeros_like(data['signal']).astype(int))
        data['train'][idx_sig_train] = 1
        data['train'][idx_bkg_train] = 1
        pass

    # Shuffle
    with Profile("Shuffling samples"):
        idx = np.arange(data.shape[0])
        np.random.shuffle(idx)
        data = data[idx]
        pass

    # Writing output HDF5 file
    with Profile("Writing output HDF5 file"):
        save_hdf5(data, './reweightDatasets/extractedData.h5')
        pass

    return
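# Optional cross-check (a sketch, not part of the original script): after the
# flat-pT reweighting above, the weighted pT spectrum should be roughly uniform
# within each class. A quick histogram-based check could look like this; the
# function name and binning are illustrative.
import numpy as np

def check_flatness (pt, weights, nbins=20):
    """Print the relative spread of the weighted pT spectrum across bins."""
    counts, _ = np.histogram(pt, bins=nbins, weights=weights)
    print "Relative std. of weighted pT bins: {:.3f}".format(counts.std() / counts.mean())
    return

# Example usage on one class, after the re-weighting step:
#check_flatness(data['pt'][msk], data['weight_train'][msk])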
def main (args):
    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/djr_LCTopo_1.h5')  #, test=True)
    #data2, features, _ = load_data('data/djr_LCTopo_2.h5')  #, test=True)
    #data = np.concatenate((data1, data2))

    sigNumber = 0

    sigDict = {
        0:  'All Models',
        1:  'Model A, m = 1 TeV',
        2:  'Model A, m = 1.5 TeV',
        3:  'Model A, m = 2 TeV',
        4:  'Model A, m = 2.5 TeV',
        5:  'Model B, m = 1 TeV',
        6:  'Model B, m = 1.5 TeV',
        7:  'Model B, m = 2 TeV',
        8:  'Model B, m = 2.5 TeV',
        9:  'Model C, m = 1 TeV',
        10: 'Model C, m = 1.5 TeV',
        11: 'Model C, m = 2 TeV',
        12: 'Model C, m = 2.5 TeV',
        13: 'Model D, m = 1 TeV',
        14: 'Model D, m = 1.5 TeV',
        15: 'Model D, m = 2 TeV',
        16: 'Model D, m = 2.5 TeV',
        }

    outFile = ROOT.TFile.Open("figures/sensitivity_targetEff{}.root".format(kNN_eff), "RECREATE")

    histstyle[True] ['label'] = 'Multijets'
    histstyle[False]['label'] = 'Dark jets, {}'.format(sigDict[sigNumber])

    # Add knn variables
    #base_var = ['lead_jet_ungrtrk500', 'sub_jet_ungrtrk500']
    base_var = 'jet_ungrtrk500'
    kNN_var  = base_var.replace('jet', 'knn')
    #base_vars = [base_var]
    #kNN_vars  = [kNN_var]
    base_vars = ['lead_' + base_var, 'sub_' + base_var]
    kNN_vars  = ['lead_' + kNN_var,  'sub_' + kNN_var]

    with Profile("Add variables"):
        #for i in range(len(base_var)):
        print "k-NN base variable: {} (cp. {})".format(base_var, kNN_var)
        add_knn(data, newfeat='lead_' + kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        add_knn(data, newfeat='sub_'  + kNN_var, path='models/knn/knn1D_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        #add_knn(data, newfeat=kNN_var, path='models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel))
        print 'models/knn/knn_{}_{}_{}.pkl.gz'.format(base_var, kNN_eff, sigModel)
        pass

    # Check variable distributions
    weight = 'weight'       # 'weight_test' / 'weight'
    scale  = 139 * 1000000  # (inverse nanobarn)

    msk_bkg = data['signal'] == 0
    if sigNumber == 0:
        msk_sig = data['signal'] == 1
    else:
        msk_sig = data['sigType'] == sigNumber
        pass

    knnBins = np.linspace(-100, 200, 75, endpoint=True)
    effBins = np.linspace(0, 1, 100, endpoint=True)

    for var in kNN_vars:

        ### Canvas ###
        c     = rp.canvas(num_pads=2, batch=True)
        c_tmp = rp.canvas(num_pads=1, batch=True)

        ### Plot ###
        h2 = c.pads()[0].hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values,         normalise=True, **histstyle[False])
        h1 = c.pads()[0].hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=True, **histstyle[True])

        h1_incl = c_tmp.hist(data.loc[msk_bkg, var].values, bins=knnBins, weights=scale * data.loc[msk_bkg, weight].values, normalise=False)
        h2_incl = c_tmp.hist(data.loc[msk_sig, var].values, bins=knnBins, weights=data.loc[msk_sig, weight].values,         normalise=False)
        #h1_CR = c_tmp.hist(data.loc[msk_CR_bkg, var].values, bins=knnBins, weights=scale * data.loc[msk_CR_bkg, weight].values, normalise=False)
        #h2_CR = c_tmp.hist(data.loc[msk_CR_sig, var].values, bins=knnBins, weights=data.loc[msk_CR_sig, weight].values, normalise=False)

        print "bkg. incl integral: ", h1_incl.GetEffectiveEntries()
        print "sig. incl integral: ", h2_incl.GetEffectiveEntries()
        #print "bkg. CR efficiency: ", h1_CR.GetEffectiveEntries() / h1_incl.GetEffectiveEntries()
        #print "sig. CR efficiency: ", h2_CR.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()

        normFactor = 1.0 / (3. / 2 + np.sqrt(h1_incl.GetEffectiveEntries()))
        print "Sensitivity with no cut: ", normFactor

        ### Sensitivity scan ###
        sensitivity, bkg_eff_1jet = array('d'), array('d')
        #sensitivity  = []
        #bkg_eff_1jet = []
        for cut in knnBins:
            msk_pass  = (data[kNN_vars[0]] > cut) & (data[kNN_vars[1]] > cut)  # both jets pass
            msk_pass1 = data[var] > cut                                        # single-jet cut
            #msk_pass = (data[var] > cut)

            msk_bkg_pass  = msk_bkg & msk_pass
            msk_sig_pass  = msk_sig & msk_pass
            msk_bkg_pass1 = msk_bkg & msk_pass1
            msk_sig_pass1 = msk_sig & msk_pass1

            h1_pass  = c_tmp.hist(data.loc[msk_bkg_pass,  var].values, bins=knnBins, weights=scale * data.loc[msk_bkg_pass, weight].values, normalise=False)
            h2_pass  = c_tmp.hist(data.loc[msk_sig_pass,  var].values, bins=knnBins, weights=data.loc[msk_sig_pass, weight].values,         normalise=False)
            h1_pass1 = c_tmp.hist(data.loc[msk_bkg_pass1, var].values, bins=knnBins, weights=data.loc[msk_bkg_pass1, weight].values,        normalise=False)

            if h2_incl.GetEffectiveEntries() > 0:  # and h1_pass.GetEffectiveEntries() > 0:
                sensitivity.append(((h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()) / (3. / 2 + np.sqrt(h1_pass.GetEffectiveEntries()))) / normFactor)
                #print "bkg. eff. @ ",        cut, ": ", h1_pass.GetEffectiveEntries() / h1_incl.GetEffectiveEntries()
                #print "signal eff. @ ",      cut, ": ", h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()
                #print "Sensitivity gain @ ", cut, ": ", ((h2_pass.GetEffectiveEntries() / h2_incl.GetEffectiveEntries()) / (3. / 2 + np.sqrt(h1_pass.GetEffectiveEntries()))) / normFactor
            else:
                sensitivity.append(0)
                pass

            if h1_incl.GetEffectiveEntries() > 0:
                bkg_eff_1jet.append(h1_pass1.GetEffectiveEntries() / h1_incl.GetEffectiveEntries())
            else:
                bkg_eff_1jet.append(0)
                pass
            pass

        #c.pads()[0].ylim(0, 0.25)
        c.pads()[0].logy()
        c.pads()[0].xlim(-100, 200)
        c.pads()[1].ylim(0, 30)
        c.pads()[1].xlim(-100, 200)
        c.pads()[1].graph(sensitivity, bins=knnBins)  #, oob=False)

        ### Decorations ###
        c.legend(width=0.4, xmin=0.3, ymax=0.9)
        #c.xlabel("n_{trk}^{#epsilon={}\%}".format(kNN_eff))  #latex(var, ROOT=True)
        c.xlabel("n_{trk}^{#epsilon}")  #latex(var, ROOT=True)
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("Sensitivity gain")  #"#epsilon_{S}/(#frac{3}{2} + #sqrt{B})"
        c.pads()[1].text(["Sensitivity = #varepsilon_{S}/(#frac{3}{2} + #sqrt{B})",], xmin=0.2, ymax=0.80, ATLAS=False)

        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff))
        c.save('figures/distributions/sensitivity_{}_sig{}_eff{}.eps'.format(var, sigNumber, kNN_eff))
        del c

        gr_sen  = ROOT.TGraph(len(sensitivity),  knnBins, sensitivity)
        gr_eff  = ROOT.TGraph(len(bkg_eff_1jet), knnBins, bkg_eff_1jet)
        gr_more = ROOT.TGraph(len(sensitivity),  bkg_eff_1jet, sensitivity)

        gr_sen.GetXaxis().SetTitle("#it{n}_{trk}^{#epsilon}-cut")
        gr_sen.GetYaxis().SetTitle("Sensitivity gain")
        gr_eff.GetYaxis().SetTitle("Single jet #varepsilon_{B}")
        gr_sen.GetYaxis().SetAxisColor(ROOT.kOrange + 2)
        gr_eff.GetYaxis().SetAxisColor(ROOT.kGreen + 2)
        gr_sen.SetMarkerColor(ROOT.kOrange + 2)
        gr_eff.SetMarkerColor(ROOT.kGreen + 2)
        gr_eff.SetDrawOption("Y+")

        c2 = rp.canvas(batch=True)
        c2.pads()[0].logx()
        c2.pads()[0].cd()
        #c2.pads()[0].graph(sensitivity, bkg_eff_1jet)
        gr_more.GetXaxis().SetTitle("Single jet #varepsilon_{B}")
        gr_more.GetYaxis().SetTitle("Sensitivity gain")
        #gr_more.GetXaxis().SetRangeUser(0, 0.02)
        gr_more.Draw("AP")

        #c2 = ROOT.TCanvas("can2", "", 200, 10, 700, 500)  #(batch=True)
        #pad1 = ROOT.TPad("pad1", "", 0, 0, 1, 1)  #c2.pads()[0]._bare()
        #pad1.Draw()
        #pad1.cd()
        #gr_sen.Draw("AP")
        #c2.cd()
        #pad2 = ROOT.TPad("pad2", "", 0, 0, 1, 1)  #c2.pads()[0]._bare()
        #pad2.SetFillStyle(4000)
        #pad2.Draw()
        #pad2.cd()
        #gr_eff.Draw("PY+")
        #gr_eff.Draw("APY+")
        #gr_sen.Draw("SAME")
        #gr_sen = c2.graph(sensitivity, bins=knnBins, markercolor=ROOT.kOrange+2)
        #gr_eff = c2.graph(bkg_eff_1jet, bins=knnBins, markercolor=ROOT.kGreen+2, option='Y+')
        #gr_eff.GetYaxis.SetRange(0, 1)
        #gr_eff.Draw("SAME Y+")
        #c2.xlabel("Single jet #varepsilon_{B}")
        #c2.ylabel("Sensitivity gain")
        #c2.text(["#epsilon=0.5 %",], xmin=0.2, ymax=0.8, ATLAS=False)

        ### Save ###
        #mkdir('figures/distributions')
        c2.save('figures/distributions/sensitivity_{}_eff{}_1jet.pdf'.format(var, kNN_eff))
        del c2

        outFile.cd()
        gr_more.SetName("sensitivity_eff{}".format(kNN_eff))
        gr_more.Write()
        #print 'figures/distributions/sensitivity_{}_sig{}_eff{}.pdf'.format(var, sigNumber, kNN_eff)
        pass

    # Close the output file only after the loop over `kNN_vars`, so the second
    # variable's graph is not written to an already-closed file.
    outFile.Close()

    # Plot also the normal ntrk distribution for cross check with Roland's result
    """