def main(options):
    """Plot every training variable for the loaded signal, background and data.

    The analysis setup is read from the YAML file at ``options.config``. The
    samples are loaded through ``ROOTHelpers`` and ``Plotter.plot_input`` is
    then called once per entry of the ``train_vars`` config list.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``n_bins`` and
            ``ratio_plot``.
    """
    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config only holds plain scalars, lists and mappings.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, could use this for validation later.
    # kept for compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'red'

    # Data handling stuff #

    # load the mc dataframe for all years; when pT reweighting is requested
    # the samples are read with the reweighting control-region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # the reweighting is only (re)derived when the samples are re-read
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # Plotter stuff #
    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour,
                      norm_to_data=True)
    for var in train_vars:
        plotter.plot_input(var, options.n_bins, output_tag,
                           options.ratio_plot, norm_to_data=True)
def main(options):
    """Load the samples and train, tune or evaluate the BDT.

    The mode is picked from ``options``:

    * ``hp_perm`` is not None: train and validate one hyper-parameter
      permutation over ``k_folds`` folds and append the ROC comparison to
      ``<mc_dir>/bdt_hp_opt_<output_tag>.txt``.
    * ``opt_hps``: delete any existing results file and call
      ``batch_gs_cv`` to launch the hyper-parameter search.
    * ``train_best``: read the parameters from the last line of the results
      file, train and save that model, then plot the ROC and output score.
    * otherwise: train and save a model with the default hyper-parameters.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_train``, ``hp_perm``, ``opt_hps``, ``train_best`` and
            ``k_folds``.

    Raises:
        Exception: if ``hp_perm`` is combined with ``opt_hps``.
        ValueError: if ``opt_hps`` is requested with ``k_folds < 2``, or if
            the results file read by ``train_best`` is empty.
    """
    # local import: only needed to delete a stale results file
    import os

    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config only holds plain scalars, lists and mappings.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, but still specified in the config for
    # compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    # Data handling stuff #

    # load the mc dataframe for all years; when pT reweighting is requested
    # the samples are read with the reweighting control-region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # reweight samples in bins of pT (and maybe Njets), for each year
    # separately. Note the targeted selection is applied here and all
    # dataframes are re-saved for a smaller memory footprint.
    # FIXME: when files are read for the first time with pt_reweight set but
    # without reload_samples, nothing is reweighted and no reweighted
    # dataframes are saved.
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # BDT stuff #

    # set up X, w and y, train-test
    bdt_hee = BDTHelpers(root_obj, train_vars, options.train_frac,
                         eq_train=options.eq_train)
    bdt_hee.create_X_and_y(mass_res_reweight=True)

    if options.hp_perm is not None:
        # this branch runs inside a job submitted by the HP search
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        if options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )

        print('About to train + validate on dataset with {} fold splitting'
              .format(options.k_folds))
        bdt_hee.set_hyper_parameters(options.hp_perm)
        bdt_hee.set_k_folds(options.k_folds)
        for i_fold in range(options.k_folds):
            bdt_hee.set_i_fold(i_fold)
            bdt_hee.train_classifier(root_obj.mc_dir, save=False)
            bdt_hee.validation_rocs.append(bdt_hee.compute_roc())

        # the with-statement closes the file; no explicit close() is needed
        results_path = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(results_path, 'a+') as val_roc_file:
            bdt_hee.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        # FIXME: add warning that many jobs are about to be submitted
        if options.k_folds < 2:
            raise ValueError('K-folds option must be at least 2')

        results_path = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        if path.isfile(results_path):
            # os.remove avoids building a shell command from an unquoted path
            os.remove(results_path)
            print('deleting: {}/bdt_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        bdt_hee.batch_gs_cv(k_folds=options.k_folds,
                            pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the file name read here carries the '_best' suffix,
        # while the hp_perm / opt_hps branches use the un-suffixed tag;
        # confirm which name the results file really has.
        results_path = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(results_path, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        if not hp_roc:
            raise ValueError(
                'No hyper-parameter results found in {}'.format(results_path))

        # the parameters are the text before the first ';' on the last line
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        bdt_hee.set_hyper_parameters(best_params)
        bdt_hee.train_classifier(root_obj.mc_dir, save=True,
                                 model_name=output_tag)
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(
            output_tag, ratio_plot=True,
            norm_to_data=(not options.pt_reweight))

    else:
        # just train the BDT with default HPs
        bdt_hee.train_classifier(root_obj.mc_dir, save=True,
                                 model_name=output_tag + '_clf')
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag, ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight),
                                  log=True)
def main(options):
    """Evaluate a trained DNN and optimise category boundaries on its score.

    Steps, in order:

    1. read the YAML config at ``options.config`` and load the samples
       (data replaces the simulated background when ``options.data_as_bkg``
       is set);
    2. optionally apply the extra cuts in ``options.cut_based_str``;
    3. load the Keras architecture and weights, build the inputs with
       ``LSTM_DNN`` and predict a score per event;
    4. either compute a single cut-based AMS, or scan 1 to 4 categories with
       ``CatOptim`` and plot the AMS against the number of categories.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``data_as_bkg``,
            ``cut_based_str``, ``model_architecture``, ``model`` and
            ``n_iters``.
    """
    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config only holds plain scalars, lists and mappings.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    object_vars = config['object_vars']
    # object_vars is a list of per-object variable lists; flatten it
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    all_vars = flat_obj_vars + event_vars

    # Data handling stuff #

    # load the mc dataframe for all years; when pT reweighting is requested
    # the samples are read with the reweighting control-region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # apply cut-based selection if not optimising the score (predicted
    # probabilities are still evaluated for compatibility with the existing
    # CatOptim constructor). Truthiness also covers an unset (None) option.
    cut_based = bool(options.cut_based_str)
    if cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

    # DNN evaluation stuff #

    # load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    dnn = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y matrices. Log variables that have GeV units
    dnn.var_transform(do_data=False)  # bkg=data here; option is for plotting
    X_tot, y_tot = dnn.create_X_y()
    X_tot = X_tot[all_vars]  # filter unused vars

    # scale X_vars to mean=0 and std=1, using the scaler fitted during the
    # previous DNN training
    dnn.load_X_scaler(out_tag=output_tag)
    X_tot = dnn.X_scaler.transform(X_tot)

    # make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=all_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = dnn.join_objects(X_tot[flat_obj_vars])

    # predict probs
    pred_prob_tot = model.predict([X_tot_high_level, X_tot_low_level],
                                  batch_size=1024).flatten()

    sig_weights = root_obj.mc_df_sig['weight'].values
    sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = pred_prob_tot[y_tot == 1]

    # NOTE(review): the background weights and masses are read from data_df,
    # which is only loaded above when options.data_as_bkg is set — confirm
    # this script is never run with simulated background.
    bkg_weights = root_obj.data_df['weight'].values
    bkg_m_ee = root_obj.data_df['dielectronMass'].values
    pred_prob_bkg = pred_prob_tot[y_tot == 0]

    # category optimisation stuff #

    # set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.15, 1.]]
    names = ['{} score'.format(output_tag)]  # arbitrary

    if cut_based:
        # CatOptim is built with zero categories just to use its methods
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                             ranges, names)
        cut_based_ams = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(
            cut_based_ams))
    else:
        cats = [1, 2, 3, 4]
        AMS = []
        print_str = ''
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            # set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

        # make nCat vs AMS plots; only the scan yields one AMS per entry
        # of cats, so the plot is made in this branch only
        Plotter.cats_vs_ams(cats, AMS, output_tag)
def main(options):
    """Load the samples and train, tune or evaluate the LSTM-based DNN.

    The mode is picked from ``options``:

    * ``hp_perm`` is not None: train one hyper-parameter permutation and
      append the ROC comparison to
      ``<mc_dir>/lstm_hp_opt_<output_tag>.txt``.
    * ``opt_hps``: delete any existing results file and call
      ``batch_gs_cv`` to launch the hyper-parameter search.
    * ``train_best``: read the parameters from the last line of the results
      file, train that model, then plot the ROC and output score.
    * otherwise: train the default architecture, with batch boosting when
      ``batch_boost`` is set.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_weights``, ``batch_boost``, ``hp_perm``, ``opt_hps`` and
            ``train_best``.

    Raises:
        Exception: if ``hp_perm`` is combined with ``opt_hps``.
        ValueError: if the results file read by ``train_best`` is empty.
    """
    # local import: only needed to delete a stale results file
    import os

    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config only holds plain scalars, lists and mappings.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    object_vars = config['object_vars']
    # object_vars is a list of per-object variable lists; flatten it
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    # Data handling stuff #

    # when pT reweighting is requested the samples are read with the
    # reweighting control-region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add,
                           selection)

    # load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:  # for plotting
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # reweight samples in bins of pT (and maybe Njets), for each year
    # separately.
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # LSTM stuff #

    dnn = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                   options.eq_weights, options.batch_boost)

    # the opt_hps mode only submits jobs, so it skips the input preparation
    if not options.opt_hps:
        dnn.var_transform(do_data=True)
        X_tot, y_tot = dnn.create_X_y(mass_res_reweight=True)
        dnn.split_X_y(X_tot, y_tot, do_data=True)
        if options.hp_perm is not None:
            # the scaler is not saved when running one HP permutation
            dnn.get_X_scaler(dnn.all_vars_X_train, out_tag=output_tag,
                             save=False)
        else:
            dnn.get_X_scaler(dnn.all_vars_X_train, out_tag=output_tag)
        dnn.X_scale_train_test(do_data=True)
        dnn.set_low_level_2D_test_train(do_data=True,
                                        ignore_train=options.batch_boost)

    if options.hp_perm is not None:
        # functions called in the submitted job, if options.opt_hps was true
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        if options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )

        dnn.set_hyper_parameters(options.hp_perm)
        dnn.model.summary()
        dnn.train_w_batch_boost(out_tag=output_tag, save=False)

        # the with-statement closes the file; no explicit close() is needed
        results_path = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(results_path, 'a+') as val_roc_file:
            dnn.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        # FIXME: add warning that many jobs are about to be submitted
        results_path = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        if path.isfile(results_path):
            # os.remove avoids building a shell command from an unquoted path
            os.remove(results_path)
            print('deleting: {}/lstm_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        dnn.batch_gs_cv(pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the file name read here carries the '_best' suffix,
        # while the hp_perm / opt_hps branches use the un-suffixed tag;
        # confirm which name the results file really has.
        results_path = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(results_path, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        if not hp_roc:
            raise ValueError(
                'No hyper-parameter results found in {}'.format(results_path))

        # the parameters are the text before the first ';' on the last line
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        dnn.set_hyper_parameters(best_params)
        dnn.model.summary()
        dnn.train_w_batch_boost(out_tag=output_tag)

        # compute final roc on test set
        dnn.compute_roc(batch_size=1024)
        dnn.plot_roc(output_tag)
        dnn.plot_output_score(output_tag, batch_size=1024, ratio_plot=True,
                              norm_to_data=(not options.pt_reweight))

    else:
        # train with basic parameters/architecture
        dnn.model.summary()
        if options.batch_boost:
            # type of model selection, so a validation set is needed;
            # train_w_batch_boost handles creating the validation set,
            # the 2D vars and sequential saving
            dnn.train_w_batch_boost(out_tag=output_tag)
        else:
            dnn.train_network(epochs=5, batch_size=1024)
            dnn.save_model(out_tag=output_tag)

        # compute final roc on test set
        dnn.compute_roc(batch_size=1024)
        dnn.plot_roc(output_tag)
        dnn.plot_output_score(output_tag, batch_size=1024, ratio_plot=True,
                              norm_to_data=(not options.pt_reweight))
def main(options):
    """Evaluate a pickled classifier and plot the mass in each MVA category.

    The samples are loaded as configured in the YAML file at
    ``options.config``. The model file name and the category boundaries for
    ``options.mva_proc`` are read from a second YAML file at
    ``options.mva_config``. The classifier score is added to the signal,
    background and data dataframes as the column ``<mva_proc>_mva``, and
    ``Plotter.plot_input`` is called once per boundary with the matching
    score window as an extra cut.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``mva_config``,
            ``mva_proc``, ``mass_var_name``, ``n_bins`` and ``ratio_plot``.

    Raises:
        IOError: if the configured model entry is not a string.
    """
    # take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config only holds plain scalars, lists and mappings.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, could use this for validation later.
    # kept for compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'red'

    # Data handling stuff #

    # load the mc dataframe for all years; when pT reweighting is requested
    # the samples are read with the reweighting control-region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # load MVA; kept under its own name so the analysis config is not
    # overwritten
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.load(mva_config_file)
    model = mva_config['models'][options.mva_proc]
    boundaries = mva_config['boundaries'][options.mva_proc]

    mva_col = options.mva_proc + '_mva'

    # add DNN later
    if isinstance(model, str):
        print('evaluating BDT: {}'.format(model))
        # NOTE(review): pickle.load executes code from the file; only load
        # model files from a trusted source.
        with open('models/{}'.format(model), 'rb') as model_file:
            clf = pickle.load(model_file)
        # keep the second predict_proba column onwards, flattened to 1D
        for df in (root_obj.mc_df_sig, root_obj.mc_df_bkg, root_obj.data_df):
            df[mva_col] = clf.predict_proba(
                df[train_vars].values)[:, 1:].ravel()
    else:
        raise IOError(
            'Did not get a classifier models in correct format in config'
        )

    # Plotter stuff #
    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour,
                      norm_to_data=True)

    # boundaries is indexed by the keys 'tag_0', 'tag_1', ...; category 0 is
    # everything above the first boundary, category i sits between
    # boundaries i-1 and i
    for cat_counter in range(len(boundaries)):
        lower_edge = str(boundaries['tag_' + str(cat_counter)])
        if cat_counter == 0:
            extra_cuts = mva_col + ' >' + lower_edge
        else:
            upper_edge = str(boundaries['tag_' + str(cat_counter - 1)])
            extra_cuts = (mva_col + ' <' + upper_edge + ' and ' +
                          mva_col + ' >' + lower_edge)
        plotter.plot_input(options.mass_var_name, options.n_bins, output_tag,
                           options.ratio_plot, norm_to_data=True,
                           extra_cuts=extra_cuts, extra_tag=cat_counter,
                           blind=True)