def main(options):
    """Run the tagging sequence over the configured samples and fill the tagged trees.

    Sample locations and training variables are read from the YAML file at
    ``options.config``; classifier models and tag boundaries are read from the
    YAML file at ``options.mva_config``. Signal is always loaded; background is
    either simulation or data depending on ``options.data_as_bkg``. One
    classifier is evaluated per entry of the ``models`` config, after which the
    ``taggerBase`` object decides tags and priorities and fills the trees.

    Args:
        options: parsed command-line options. Attributes read here: ``config``,
            ``mva_config``, ``syst_name``, ``dump_weight_systs``, ``data_only``,
            ``data_as_bkg`` and ``reload_samples``.

    Raises:
        IOError: if tree systematics and weight systematics are requested
            together, if systematics are requested together with ``data_only``,
            if more than one year was read in, or if a model entry in the MVA
            config is neither a dict nor a str.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # objects; acceptable for trusted configs, otherwise consider yaml.safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, but still specified in the config for compatibility
    # with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    # collect every training variable; DNN (LSTM) entries are dicts holding
    # nested object-level variables plus event-level variables, BDT entries are
    # plain lists
    proc_to_train_vars = config['train_vars']
    all_train_vars = []
    for proc, varrs in proc_to_train_vars.items():
        if isinstance(varrs, dict):
            flat_obj_vars = [
                var for i_object in varrs['object_vars'] for var in i_object
            ]
            all_train_vars += (flat_obj_vars + varrs['event_vars'])
        else:
            all_train_vars += varrs

    vars_to_add = config['vars_to_add']

    read_syst = options.syst_name is not None

    if read_syst and options.dump_weight_systs:
        raise IOError(
            'Cannot dump weight variations and tree systematics at the same time. Please run separately for each.'
        )
    if options.data_only and (read_syst or options.dump_weight_systs):
        raise IOError('Cannot read Data and apply sysetmatic shifts')

    # Data handling stuff #

    # Apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since all cuts are looser than VBF (not
    # removing events with higher priority). Also note we norm the MC before
    # applying this cut; in data we apply it when reading in.
    # A tighter cut such as
    #   'dielectronMass > 110 and dielectronMass < 150 and leadElectronPtOvM > 0.333 and subleadElectronPtOvM > 0.25'
    # cannot be used since these variables change with systematics.
    loosest_selection = 'dielectronMass > 100'

    # load the mc dataframe for all years. Do not apply any specific
    # preselection to sim samples
    root_obj = ROOTHelpers(output_tag,
                           mc_dir,
                           mc_fnames,
                           data_dir,
                           data_fnames,
                           proc_to_tree_name,
                           all_train_vars,
                           vars_to_add,
                           loosest_selection,
                           read_systs=(read_syst or options.dump_weight_systs))
    root_obj.no_lumi_scale()

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)

    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    # get year of samples for root obj and check we didn't accidentally read in
    # more than 1 year
    if len(root_obj.years) != 1:
        raise IOError(
            'Reading in more than one year at a time! Tagging should be split by year'
        )
    year = list(root_obj.years)[0]

    if ("2016" in year) and (not options.data_only):
        root_obj.scale_sig_partial_2016()  # FIXME: check this is actually called

    # Using only root_obj.mc_df_sig when reading systematics does not work with
    # the DNN set up, since it needs the bkg class in its constructor.
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # Tag sequence stuff #

    # specify sequence of tags and preselection targeting each
    tag_sequence = ['VBF', 'ggH']  # categories targeted
    true_procs = ['VBF', 'ggH', 'ttH']  # procs to run through cats
    if (not read_syst) and (not options.dump_weight_systs):
        # MC and data may be run together in a stat-only config
        true_procs.append('Data')
    if options.data_only:
        true_procs = ['Data']  # do data on its own (for memory really)

    # create tag object
    tag_obj = taggerBase(tag_sequence,
                         true_procs,
                         combined_df,
                         syst_name=options.syst_name)
    if read_syst:
        tag_obj.relabel_syst_vars()  # not run if reading weight systematics

    # get models and tag boundaries from the MVA config
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.load(mva_config_file)
    proc_to_model = mva_config['models']
    tag_boundaries = mva_config['boundaries']

    # evaluate MVA scores used in categorisation.
    # For BDT - proc: [var list].
    # For DNN - proc: {var_type1: [var_list_type1], var_type2: [...], ...}
    for proc, model in proc_to_model.items():
        if isinstance(model, dict):
            object_vars = proc_to_train_vars[proc]['object_vars']
            flat_obj_vars = [
                var for i_object in object_vars for var in i_object
            ]
            event_vars = proc_to_train_vars[proc]['event_vars']

            dnn_loaded = tag_obj.load_dnn(proc, model)
            train_tag = model['architecture'].split('_model')[0]
            tag_obj.eval_lstm(dnn_loaded, train_tag, root_obj, proc,
                              object_vars, flat_obj_vars, event_vars)
        elif isinstance(model, str):
            tag_obj.eval_bdt(proc, model, proc_to_train_vars[proc])
        else:
            raise IOError(
                'Did not get a classifier models in correct format in config'
            )

    # need to do this after evaluating the MVAs, since the LSTM class used in
    # eval_lstm needs some Data in the df for its constructor
    del root_obj
    if read_syst or options.dump_weight_systs:
        # .copy() avoids copy warnings later
        tag_obj.combined_df = tag_obj.combined_df[
            tag_obj.combined_df.proc != 'Data'].copy()

    # set up tag boundaries for each process being targeted
    tag_preselection = tag_obj.get_tag_preselection()
    tag_obj.decide_tag(tag_preselection, tag_boundaries)
    tag_obj.decide_priority()
    branch_names = tag_obj.get_tree_names(tag_boundaries, year)
    tag_obj.set_tree_names(tag_boundaries, options.dump_weight_systs, year)
    tag_obj.fill_trees(branch_names, year, print_yields=not read_syst)
    # TODO: tag_obj.plot_matrix(branch_names, output_tag) is disabled for
    # nominal runs (struct error?)
def main(options):
    """Evaluate a trained LSTM DNN and optimise category boundaries on its score.

    Sample locations, training variables and the preselection come from the
    YAML file at ``options.config``. The network architecture (JSON) and
    weights are loaded from ``options.model_architecture`` and
    ``options.model``. Predicted probabilities for signal and background are
    passed to ``CatOptim``, either to compute a single cut-based AMS (when
    ``options.cut_based_str`` is non-empty) or to scan 1-4 categories and plot
    the resulting AMS per category count.

    Args:
        options: parsed command-line options. Attributes read here: ``config``,
            ``reload_samples``, ``data_as_bkg``, ``cut_based_str``,
            ``model_architecture``, ``model`` and ``n_iters``.
    """
    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # objects; acceptable for trusted configs, otherwise consider yaml.safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    train_vars = flat_obj_vars + event_vars

    # a truthiness test also copes with the option being None
    cut_based = bool(options.cut_based_str)

    # Data handling stuff #

    # load the mc dataframe for all years (hand over a copy of the variable
    # list so the local one cannot be modified by the helper)
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, list(train_vars),
                           vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    # apply cut-based selection if not optimising the score (pred probs are
    # still evaluated for compatibility with the existing CatOptim constructor)
    if cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

    # DNN evaluation stuff #

    # load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open(options.model_architecture, 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights(options.model)

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y matrices. Log variables that have GeV units
    # (bkg=data here; the do_data option is for plotting purposes)
    LSTM.var_transform(do_data=False)
    X_tot, y_tot = LSTM.create_X_y()
    X_tot = X_tot[train_vars]  # filter unused vars

    # scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot = LSTM.X_scaler.transform(X_tot)

    # make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=train_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

    # predict probs
    pred_prob_tot = model.predict([X_tot_high_level, X_tot_low_level],
                                  batch_size=1024).flatten()

    sig_weights = root_obj.mc_df_sig['weight'].values
    sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = pred_prob_tot[y_tot == 1]

    # NOTE(review): background weights/mass are always taken from data_df, even
    # when options.data_as_bkg is off and the background was loaded as MC -
    # confirm this is intended.
    bkg_weights = root_obj.data_df['weight'].values
    bkg_m_ee = root_obj.data_df['dielectronMass'].values
    pred_prob_bkg = pred_prob_tot[y_tot == 0]

    # category optimisation stuff #

    # set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.3, 1.]]
    names = ['{} score'.format(output_tag)]  # arbitrary
    cats = [1, 2, 3, 4]

    if cut_based:
        # optimiser built just to use its class methods here
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0, ranges,
                             names)
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))
    else:
        AMS = []
        report = []
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            # set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            report.append('Results for {} categories : \n'.format(n_cats))
            report.append(optimiser.getPrintableResult())
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(''.join(report)))

        # make nCat vs AMS plots. Only done for the scan, where AMS holds one
        # entry per tested category count (the cut-based branch yields a single
        # value that cannot be paired with the entries of cats).
        Plotter.cats_vs_ams(cats, AMS, output_tag)
def main(options):
    """Tag events as VBF or ggH from their MVA scores and write one ROOT tree per tag.

    Sample locations and training variables come from the YAML file at
    ``options.config``; model file names and tag boundaries come from the YAML
    file at ``options.mva_config``. A pickled ggH classifier and a Keras VBF
    LSTM are evaluated on the combined signal+background frame, each event is
    given a per-process analysis tag and a priority tag, and the events are
    written to ``output_trees/<tree name>.root``.

    Args:
        options: parsed command-line options. Attributes read here: ``config``,
            ``mva_config``, ``reload_samples`` and ``data_as_bkg``.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # objects; acceptable for trusted configs, otherwise consider yaml.safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    object_vars = proc_to_train_vars['VBF']['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = proc_to_train_vars['VBF']['event_vars']
    lstm_vars = flat_obj_vars + event_vars

    # used to check all vars we need for categorisation are in our dfs
    all_train_vars = proc_to_train_vars['ggH'] + flat_obj_vars + event_vars
    vars_to_add = config['vars_to_add']

    # Data handling stuff #

    # apply loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since all cuts are looser than VBF (not
    # removing events with higher priority)
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    # Tag sequence stuff #

    # NOTE: these must be concatenated in the same way they are concatenated in
    # LSTM.create_X_y(), else predictions are misaligned
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # sequence of tags, highest priority first, and the preselection mask of
    # each; the VBF mask is the ggH mask plus the dijet requirements
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    vbf_presel = (ggh_presel
                  & combined_df['dijetMass'].gt(350)
                  & combined_df['leadJetPt'].gt(40)
                  & combined_df['subleadJetPt'].gt(30))
    proc_to_preselection = {'VBF': vbf_presel, 'ggH': ggh_presel}

    # GET MVA SCORES #

    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.load(mva_config_file)
    proc_to_model = mva_config['models']
    proc_to_tags = mva_config['boundaries']

    # evaluate ggH BDT scores
    print('evaluating ggH classifier: {}'.format(proc_to_model['ggH']))
    # NOTE(review): pickle.load executes arbitrary code from the file; only use
    # model files from a trusted source.
    with open('models/{}'.format(proc_to_model['ggH']), 'rb') as clf_file:
        clf = pickle.load(clf_file)
    train_vars = proc_to_train_vars['ggH']
    combined_df['ggH_mva'] = clf.predict_proba(
        combined_df[train_vars].values)[:, 1:].ravel()

    # evaluate VBF LSTM
    print('loading VBF DNN:')
    with open('models/{}'.format(proc_to_model['VBF']['architecture']),
              'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('models/{}'.format(proc_to_model['VBF']['model']))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y matrices. Log variables that have GeV units
    LSTM.var_transform(do_data=False)
    X_tot, y_tot = LSTM.create_X_y()
    X_tot = X_tot[lstm_vars]  # filter unused vars
    print(np.isnan(X_tot).any())  # debug: flags columns containing NaNs

    # scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
    LSTM.load_X_scaler(out_tag='VBF_DNN')
    X_tot = LSTM.X_scaler.transform(X_tot)

    # make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=lstm_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

    # predict probs. Corresponds to the same events, since the dfs are
    # concatenated internally in the same order
    combined_df['VBF_mva'] = model.predict(
        [X_tot_high_level, X_tot_low_level], batch_size=1).flatten()

    # TAG NUMBER #

    # An event gets tag i if it passes the preselection and its score lies
    # between boundary i and boundary i-1 (tag 0 has no upper boundary).
    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        score = combined_df['{}_mva'.format(proc)]
        # NOTE(review): this relies on the boundaries mapping iterating from
        # the highest cut to the lowest; plain dicts give no ordering guarantee
        # on older Pythons - confirm against the config loader.
        tag_bounds = list(proc_to_tags[proc].values())
        tag_masks = []
        for i_bound, bound in enumerate(tag_bounds):
            if i_bound == 0:  # first bound, tag 0
                tag_masks.append(presel & score.gt(bound))
            else:  # intermediate bound
                tag_masks.append(presel
                                 & score.lt(tag_bounds[i_bound - 1])
                                 & score.gt(bound))
        mask_key = list(range(len(tag_bounds)))
        combined_df['{}_analysis_tag'.format(proc)] = np.select(tag_masks,
                                                                mask_key,
                                                                default=-999)

    # PROC PRIORITY #

    # If two or more tags are satisfied the final tag is the highest priority
    # one. np.select takes the first condition that holds, so listing the
    # "was tagged" masks in tag_sequence order gives exactly that.
    tag_priority_filter = [
        combined_df['{}_analysis_tag'.format(proc)].ne(-999)
        for proc in tag_sequence
    ]
    combined_df['priority_tag'] = np.select(tag_priority_filter,
                                            tag_sequence,
                                            default='NOTAG')

    # FILL TREES BASED ON BOTH OF ABOVE #

    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # get tree names: one per (true proc, targeted proc, tag index)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                # compare by value; identity checks against a string literal
                # only work by accident of interning
                if true_proc != 'Data':
                    branch_names[true_proc].append(
                        '{}_125_13TeV_{}cat{}'.format(true_proc.lower(),
                                                      target_proc.lower(),
                                                      i_tag))
                else:
                    branch_names[true_proc].append('{}_13TeV_{}cat{}'.format(
                        true_proc, target_proc.lower(), i_tag))

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply loops through events; vectorise eventually
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees then hadd procs together on the command line
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')