# -- Fragment: load the feature list and pre-process the dataset, optionally --
# -- including multi-dimensional (ECAL energy-matrix) branches.              --
# NOTE(review): top-level script code reflowed from a collapsed line; the
# statement grouping inside the if/else was reconstructed and should be
# confirmed against the original file. `args`, `dataset`, `labeling`,
# `pre_process_data` and `plots` are defined elsewhere (not visible here).
from features import *
features, additional = get_features(args.what)  # additional features, used somewhere in logic below
features += ['gsf_pt', 'gsf_eta', 'preid_bdtout1']
multi_dim_branches = []
data = None
multi_dim = None
if args.multi_dim:
    # The energy matrices are multi-dimensional branches: pre_process_data
    # returns them separately from the flat dataframe in this mode.
    multi_dim_branches = [
        'gsf_ecal_cluster_ematrix', 'ktf_ecal_cluster_ematrix'
    ]
    data, multi_dim = pre_process_data(
        dataset,
        features + labeling + additional + multi_dim_branches,
        for_seeding=('seeding' in args.what))
else:
    data = pre_process_data(
        dataset,
        features + labeling + additional,
        for_seeding=('seeding' in args.what))
#@@data['eid_sc_Nclus'] = data['sc_Nclus']
#@@features+= ['eid_sc_Nclus']

print 'making plots in dir: ', plots

# Plot each multi-dimensional branch (loop body continues past this chunk).
for feat in multi_dim_branches:
    vals = {}
    # NOTE(review): SOURCE is truncated here -- the list literal opened below
    # (and the rest of this loop body) continues beyond this chunk.
    for dataset in [ {
# -- Fragment: register the debug input file, pre-process it, and load the --
# -- two pre-trained seeding BDTs (pt/eta-biased and unbiased).            --
# NOTE(review): this chunk starts mid list-literal -- the opening bracket of
# the field list below lives in the previous chunk. `dsets`,
# `biased_features`, `unbiased_features`, `to_dump` and `labeling` are
# defined elsewhere (not visible here).
    'gen_phi',
    'trk_pt', 'trk_eta', 'trk_phi',
    'gsf_pt', 'gsf_eta', 'gsf_phi',
]
dsets.input_files['debug'] = [
    '/afs/cern.ch/work/m/mverzett/RK102v3/src/LowPtElectrons/LowPtElectrons/run/george_synch_all.root'
]
#fields = set(biased_features+additional+unbiased_features+to_dump+labeling)
fields = set(biased_features + unbiased_features + to_dump + labeling)
data = dsets.pre_process_data('debug', list(fields),
                              for_seeding=False, keep_nonmatch=True)

from sklearn.externals import joblib
import xgboost as xgb

biased_model = joblib.load(
    '/afs/cern.ch/work/m/mverzett/RecoEgamma-ElectronIdentification/LowPtElectrons/RunII_Fall17_LowPtElectrons_displaced_pt_eta_biased.pkl'
)
# Expose the wrapped xgboost Booster via a .booster() accessor (the pickled
# sklearn wrapper stores it as the private _Booster attribute).
biased_model.booster = lambda: biased_model._Booster
#'models/2018Nov01/bdt_bo_displaced_improvedfullseeding_noweight/model_18.pkl')
unbiased_model = joblib.load(
    '/afs/cern.ch/work/m/mverzett/RecoEgamma-ElectronIdentification/LowPtElectrons/RunII_Fall17_LowPtElectrons_unbiased.pkl'
)
unbiased_model.booster = lambda: unbiased_model._Booster
# -- Fragment: feature-correlation study for the combined ID. Creates the  --
# -- model/plot output directories, loads the 'combined_id' feature set,   --
# -- and draws a Pearson correlation heat map of the features.             --
# NOTE(review): reflowed from a collapsed line; `os`, `np`, `plt`, `tag`,
# `dataset`, `labeling` and `pre_process_data` are defined elsewhere.
mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (
    os.environ['CMSSW_BASE'], tag)
if not os.path.isdir(mods):
    os.makedirs(mods)

plots = '%s/src/LowPtElectrons/LowPtElectrons/macros/plots/%s/feature_selection/' % (
    os.environ['CMSSW_BASE'], tag)
if not os.path.isdir(plots):
    os.makedirs(plots)

from features import *
features, additional = get_features('combined_id')
fields = features + labeling + additional
data = pre_process_data(dataset, fields, False)

# Down-sample: keep ~30% of signal electrons and ~10% of background tracks.
# NOTE(review): np.random is not seeded here, so the sampled subsets differ
# between runs -- confirm this is intended.
electrons = data[data.is_e & (np.random.rand(data.shape[0]) < 0.3)]
tracks = data[data.is_other & (np.random.rand(data.shape[0]) < 0.1)]
# Remove trk_high_purity from the list of features to be plotted.
features.pop(features.index('trk_high_purity'))

plt.clf()
plt.figure(figsize=[11, 11])
corrmat = data[features].corr(method='pearson', min_periods=1)
heatmap = plt.pcolor(corrmat, cmap="RdBu", vmin=-1, vmax=+1)
plt.colorbar()  #heatmap1, ax=ax1)
plt.xlim(0, len(features) + 1)
plt.ylim(0, len(features) + 1)
plt.gca().set_xticks(np.arange(len(features)) + 0.5, minor=False)
# -- Fragment: load the evaluation ("test") dataset, either from a          --
# -- pre-skimmed HDF file or from a full ROOT dataset, then apply optional  --
# -- event-count and query filters.                                         --
# NOTE(review): reflowed from a collapsed line; statement grouping inside the
# if-blocks (in particular which prints sit inside them) is reconstructed --
# confirm against the original. `args`, `dataset`, `features`, `labeling`,
# `additional`, `seed_94X_additional` and `pd` are defined elsewhere.
print 'Reading data'
from datasets import pre_process_data
if '.hdf' in dataset:
    # Pre-skimmed HDF input; an optional ':key' suffix selects the HDF key,
    # otherwise the default key 'data' is read.
    test = pd.read_hdf(dataset, key='data') \
        if ':' not in dataset else \
        pd.read_hdf(dataset.split(':')[0], key=dataset.split(':')[1])
else:
    print 'Reading a FULL dataset, this may be dangerous as the train dataset might be included!'
    fields = features + labeling
    if args.SW94X and 'seeding' in args.what:
        fields += seed_94X_additional
    else:
        fields += additional
    print 'Fields:\n', fields
    test = pre_process_data(
        dataset, fields,
        for_seeding=('seeding' in args.what),
        keep_nonmatch=args.usenomatch)

from pdb import set_trace

# Optional truncation to the first N events.
if args.events > 0:
    test = test[:args.events]
    print "test.shape:", test.shape

# Optional pandas-query selection on the test sample.
if args.query:
    test = test.query(args.query)
    print "test.shape:", test.shape

# Default the per-event weight to 1 when the input carries none.
if 'original_weight' not in test.columns:
    test['original_weight'] = 1.
else:
    # NOTE(review): SOURCE is truncated here -- the else branch continues
    # beyond this chunk.
# -- Fragment: set up job/plot directories, load and split the dataset, and --
# -- import the Keras building blocks for the NN training that follows.     --
# NOTE(review): this chunk starts mid-statement -- the `if` whose body the
# first two lines belong to (and the origin of `jobdir`/`mods`) lives in the
# previous chunk; the indentation here is reconstructed. `os`, `tag`, `args`,
# `dataset`, `labeling` and `pre_process_data` are defined elsewhere.
    if not os.path.isdir(jobdir):
        os.makedirs(jobdir)
else:
    jobdir = mods

plots = '%s/src/LowPtElectrons/LowPtElectrons/macros/plots/%s/' % (
    os.environ['CMSSW_BASE'], tag)
if not os.path.isdir(plots):
    os.makedirs(plots)

from features import *
features, additional = get_features(args.what)
fields = features + labeling + additional
# gsf_pt is needed downstream even when the feature set does not include it.
if 'gsf_pt' not in fields:
    fields += ['gsf_pt']
data = pre_process_data(dataset, fields,
                        args.what in ['seeding', 'fullseeding'])

# Fixed-seed 80/20 train/test split for reproducibility.
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)

#
# Train NN
#
from keras.models import Model
from keras.layers import Input
from keras.metrics import binary_accuracy
from keras.initializers import RandomNormal
from keras.layers import Dense, Dropout, Multiply, Add, \
    Concatenate, Reshape, LocallyConnected1D, Flatten
from keras.layers.normalization import BatchNormalization