def do_bayesian_opt(self):
    """Run Bayesian optimisation for every classifier that provides one.

    Collects the Bayesian-optimisation handlers from the scikit, XGBoost and
    Keras configuration helpers, drops classifiers that have no optimiser
    configured, and runs the optimisation for each remaining classifier,
    saving and plotting the results under
    ``<dirmlplot>/bayesian_opt/<name>/<name><suffix>``.

    If any of the output directories already exists, nothing is overwritten
    and the method returns after warning the user.
    """
    self.logger.info("Do Bayesian optimisation for all classifiers")
    _, names_scikit, _, bayes_opt_scikit = getclf_scikit(self.db_model)
    _, names_xgboost, _, bayes_opt_xgboost = getclf_xgboost(self.db_model)
    _, names_keras, _, bayes_opt_keras = getclf_keras(
        self.db_model, len(self.df_xtrain.columns))
    clfs_all = bayes_opt_scikit + bayes_opt_xgboost + bayes_opt_keras
    clfs_names_all = names_scikit + names_xgboost + names_keras
    # Keep only classifiers which actually have a Bayesian optimiser.
    # Names are filtered first, against the still-unfiltered clfs_all.
    clfs_names_all = [
        name for name, clf in zip(clfs_names_all, clfs_all) if clf
    ]
    clfs_all = [clf for clf in clfs_all if clf]
    out_dirs = [os.path.join(self.dirmlplot, "bayesian_opt", name,
                             f"{name}{self.s_suffix}")
                for name in clfs_names_all]
    if checkdirlist(out_dirs):
        # Only draw results if any can be found.
        # Fixed: the message used to say "grid search" (copy-paste from
        # do_grid) although this step is the Bayesian optimisation.
        self.logger.warning("Not overwriting anything, just plotting if possible. " \
                "Please remove corresponding directories if you are certain you want " \
                "to do Bayesian optimisation again")
        return
    checkmakedirlist(out_dirs)

    # Now, do it
    for opt, out_dir in zip(clfs_all, out_dirs):
        opt.x_train = self.df_xtrain
        opt.y_train = self.df_ytrain
        opt.optimise(ncores=self.p_ncorescross)
        opt.save(out_dir)
        opt.plot(out_dir)
def do_bayesian_opt(self):
    """Run Bayesian optimisation for every classifier that provides one.

    Skips entirely if the "bayesian_opt" step was already done. Otherwise
    gathers the optimiser handlers from the scikit, XGBoost and Keras
    configuration helpers, discards classifiers without one, and runs,
    saves and plots each optimisation under
    ``<dirmlplot>/bayesian_opt/<name>/<name><suffix>``.
    """
    if self.step_done("bayesian_opt"):
        return
    self.logger.info("Do Bayesian optimisation for all classifiers")

    _, names_scikit, _, opts_scikit = getclf_scikit(self.db_model)
    _, names_xgboost, _, opts_xgboost = getclf_xgboost(self.db_model)
    _, names_keras, _, opts_keras = getclf_keras(
        self.db_model, len(self.df_xtrain.columns))

    # Pair every classifier name with its optimiser and drop the pairs
    # for which no Bayesian optimiser is configured.
    pairs = [(name, opt)
             for name, opt in zip(names_scikit + names_xgboost + names_keras,
                                  opts_scikit + opts_xgboost + opts_keras)
             if opt]
    names = [name for name, _ in pairs]
    optimisers = [opt for _, opt in pairs]

    out_dirs = [os.path.join(self.dirmlplot, "bayesian_opt", name,
                             f"{name}{self.s_suffix}")
                for name in names]
    checkmakedirlist(out_dirs)

    # Run each optimisation on the training sample and persist the outcome.
    for optimiser, out_dir in zip(optimisers, out_dirs):
        optimiser.x_train = self.df_xtrain
        optimiser.y_train = self.df_ytrain
        optimiser.optimise(ncores=self.p_ncorescross)
        optimiser.save(out_dir)
        optimiser.plot(out_dir)
def do_grid(self):
    """Run a hyper-parameter grid search for all configured classifiers.

    Collects classifiers, names and grid-parameter sets from the scikit,
    XGBoost and Keras configuration helpers, drops classifiers without grid
    parameters, and runs the search via ``do_gridsearch``, plotting results
    under ``<dirmlplot>/grid_search/<name>/<name><suffix>``.

    If any of the output directories already exists, the previous results
    are only re-plotted and nothing is overwritten.
    """
    self.logger.info("Do grid search")
    clfs_scikit, names_scikit, grid_params_scikit = getclf_scikit(
        self.db_model)
    clfs_xgboost, names_xgboost, grid_params_xgboost = getclf_xgboost(
        self.db_model)
    clfs_keras, names_keras, grid_params_keras = getclf_keras(
        self.db_model, len(self.df_xtrain.columns))
    clfs_grid_params_all = grid_params_scikit + grid_params_xgboost + grid_params_keras
    clfs_all = clfs_scikit + clfs_xgboost + clfs_keras
    clfs_names_all = names_scikit + names_xgboost + names_keras
    # Keep only classifiers which actually have grid parameters configured.
    # Classifiers and names are filtered against the still-unfiltered
    # parameter list before that list itself is filtered.
    clfs_all = [
        clf for clf, gps in zip(clfs_all, clfs_grid_params_all) if gps
    ]
    clfs_names_all = [
        name for name, gps in zip(clfs_names_all, clfs_grid_params_all) if gps
    ]
    clfs_grid_params_all = [gps for gps in clfs_grid_params_all if gps]
    out_dirs = [os.path.join(self.dirmlplot, "grid_search", name,
                             f"{name}{self.s_suffix}")
                for name in clfs_names_all]
    if checkdirlist(out_dirs):
        # Only draw results if any can be found.
        # Fixed typo in the message: "want do do" -> "want to do".
        self.logger.warning("Not overwriting anything, just plotting again what was done " \
                "before and returning. Please remove corresponding directories " \
                "if you are certain you want to do grid search again")
        perform_plot_gridsearch(clfs_names_all, out_dirs)
        return
    checkmakedirlist(out_dirs)
    do_gridsearch(clfs_names_all, clfs_all, clfs_grid_params_all,
                  self.df_xtrain, self.df_ytrain, self.p_nkfolds,
                  out_dirs, self.p_ncorescross)
    perform_plot_gridsearch(clfs_names_all, out_dirs)
counter = counter + checkdir(dirresultsdatatot) if dovalhistodata is True: counter = counter + checkdirlist(dirvaldata) counter = counter + checkdir(dirvaldatamerged) if dovalhistomc is True: counter = counter + checkdirlist(dirvalmc) counter = counter + checkdir(dirvalmcmerged) if counter < 0: exit() # check and create directories if doconversionmc is True: checkmakedirlist(dirpklmc) if doconversiondata is True: checkmakedirlist(dirpkldata) if doskimmingmc is True: checkmakedirlist(dirpklskmc) checkmakedir(dirpklevtcounter_allmc) if doskimmingdata is True: checkmakedirlist(dirpklskdata) checkmakedir(dirpklevtcounter_alldata) if domergingmc is True: checkmakedirlist(dirpklmlmc)
def do_entire_analysis(data_config: dict, data_param: dict, data_param_overwrite: dict, # pylint: disable=too-many-locals, too-many-statements, too-many-branches
                       data_model: dict, run_param: dict, clean: bool) -> None:
    """Drive the full analysis chain for the first case in ``data_param``.

    Every stage (download, conversion, skimming, merging, ML study, ML
    application, mass/efficiency analysis, systematics, cleaning) is switched
    on or off by activation flags read from ``data_config``.  Directory
    layout comes from ``data_param``; the function aborts via ``sys.exit()``
    if a required output directory already exists for an activated stage.

    Args:
        data_config: run configuration holding per-stage activation flags.
        data_param: parameters database; only its very first key ("case")
            is used.
        data_param_overwrite: overrides applied to ``data_param`` through
            ``update_config``.
        data_model: ML model database, indexed by the configured ``mltype``.
        run_param: run parameters forwarded to processers/analyzers.
        clean: if True, delete per-period result directories at the end
            (skipped when the analysis itself runs per period).
    """
    # Disable any graphical stuff. No TCanvases opened and shown by default
    gROOT.SetBatch(True)

    logger = get_logger()
    logger.info("Do analysis chain")

    # If we are here we are interested in the very first key in the parameters database
    case = list(data_param.keys())[0]

    # Update database accordingly if needed
    update_config(data_param, data_config, data_param_overwrite)

    # --- Stage activation flags from the run configuration -----------------
    dodownloadalice = data_config["download"]["alice"]["activate"]
    doconversionmc = data_config["conversion"]["mc"]["activate"]
    doconversiondata = data_config["conversion"]["data"]["activate"]
    domergingmc = data_config["merging"]["mc"]["activate"]
    domergingdata = data_config["merging"]["data"]["activate"]
    doskimmingmc = data_config["skimming"]["mc"]["activate"]
    doskimmingdata = data_config["skimming"]["data"]["activate"]
    domergingperiodsmc = data_config["mergingperiods"]["mc"]["activate"]
    domergingperiodsdata = data_config["mergingperiods"]["data"]["activate"]
    # ML-study sub-steps (all gated on doml below)
    doml = data_config["ml_study"]["activate"]
    docorrelation = data_config["ml_study"]['docorrelation']
    dotraining = data_config["ml_study"]['dotraining']
    dotesting = data_config["ml_study"]['dotesting']
    doapplytodatamc = data_config["ml_study"]['doapplytodatamc']
    docrossvalidation = data_config["ml_study"]['docrossvalidation']
    dolearningcurve = data_config["ml_study"]['dolearningcurve']
    doroc = data_config["ml_study"]['doroc']
    doroctraintest = data_config["ml_study"]['doroctraintest']
    doboundary = data_config["ml_study"]['doboundary']
    doimportance = data_config["ml_study"]['doimportance']
    doimportanceshap = data_config["ml_study"]['doimportanceshap']
    dogridsearch = data_config["ml_study"]['dogridsearch']
    dobayesianopt = data_config["ml_study"]['dobayesianopt']
    doefficiencyml = data_config["ml_study"]['doefficiency']
    dosignifopt = data_config["ml_study"]['dosignifopt']
    doscancuts = data_config["ml_study"]["doscancuts"]
    doplotdistr = data_config["ml_study"]["doplotdistr"]
    # ML application (model inference on data/MC)
    doapplydata = data_config["mlapplication"]["data"]["doapply"]
    doapplymc = data_config["mlapplication"]["mc"]["doapply"]
    domergeapplydata = data_config["mlapplication"]["data"]["domergeapply"]
    domergeapplymc = data_config["mlapplication"]["mc"]["domergeapply"]
    docontinueapplydata = data_config["mlapplication"]["data"]["docontinueafterstop"]
    docontinueapplymc = data_config["mlapplication"]["mc"]["docontinueafterstop"]
    # Analysis steps
    dohistomassmc = data_config["analysis"]["mc"]["histomass"]
    dohistomassdata = data_config["analysis"]["data"]["histomass"]
    doefficiency = data_config["analysis"]["mc"]["efficiency"]
    doresponse = data_config["analysis"]["mc"]["response"]
    dofeeddown = data_config["analysis"]["mc"]["feeddown"]
    dounfolding = data_config["analysis"]["mc"]["dounfolding"]
    dojetsystematics = data_config["analysis"]["data"]["dojetsystematics"]
    dofit = data_config["analysis"]["dofit"]
    doeff = data_config["analysis"]["doeff"]
    docross = data_config["analysis"]["docross"]
    doplotsval = data_config["analysis"]["doplotsval"]
    doplots = data_config["analysis"]["doplots"]
    dosyst = data_config["analysis"]["dosyst"]
    # Systematics
    dosystprob = data_config["systematics"]["cutvar"]["activate"]
    do_syst_prob_mass = data_config["systematics"]["cutvar"]["probvariationmass"]
    do_syst_prob_eff = data_config["systematics"]["cutvar"]["probvariationeff"]
    do_syst_prob_fit = data_config["systematics"]["cutvar"]["probvariationfit"]
    do_syst_prob_cross = data_config["systematics"]["cutvar"]["probvariationcross"]
    dosystptshape = data_config["systematics"]["mcptshape"]["activate"]
    doanaperperiod = data_config["analysis"]["doperperiod"]
    typean = data_config["analysis"]["type"]
    dojetstudies = data_config["analysis"]["dojetstudies"]

    # --- Directory layout from the parameters database ---------------------
    dirpklmc = data_param[case]["multi"]["mc"]["pkl"]
    dirpklevtcounter_allmc = data_param[case]["multi"]["mc"]["pkl_evtcounter_all"]
    dirpklskmc = data_param[case]["multi"]["mc"]["pkl_skimmed"]
    dirpklmlmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
    dirpkldata = data_param[case]["multi"]["data"]["pkl"]
    dirpklevtcounter_alldata = data_param[case]["multi"]["data"]["pkl_evtcounter_all"]
    dirpklskdata = data_param[case]["multi"]["data"]["pkl_skimmed"]
    dirpklmldata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotdata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
    dirpklskdecmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_dec"]
    dirpklskdec_mergedmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_decmerged"]
    dirpklskdecdata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_dec"]
    dirpklskdec_mergeddata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_decmerged"]
    dirresultsdata = data_param[case]["analysis"][typean]["data"]["results"]
    dirresultsmc = data_param[case]["analysis"][typean]["mc"]["results"]
    dirresultsdatatot = data_param[case]["analysis"][typean]["data"]["resultsallp"]
    dirresultsmctot = data_param[case]["analysis"][typean]["mc"]["resultsallp"]

    # --- ML configuration ---------------------------------------------------
    binminarray = data_param[case]["ml"]["binmin"]
    binmaxarray = data_param[case]["ml"]["binmax"]
    raahp = data_param[case]["ml"]["opt"]["raahp"]
    mltype = data_param[case]["ml"]["mltype"]
    training_vars = data_param[case]["variables"]["var_training"]

    mlout = data_param[case]["ml"]["mlout"]
    mlplot = data_param[case]["ml"]["mlplot"]

    proc_type = data_param[case]["analysis"][typean]["proc_type"]

    #creating folder if not present
    # First pass: only CHECK for pre-existing directories of activated
    # stages; a negative counter means at least one already exists.
    counter = 0
    if doconversionmc is True:
        counter = counter + checkdirlist(dirpklmc)

    if doconversiondata is True:
        counter = counter + checkdirlist(dirpkldata)

    if doskimmingmc is True:
        # NOTE(review): return value of checkdirlist is discarded here,
        # unlike every other check — looks like a missing
        # "counter = counter +"; confirm before changing.
        checkdirlist(dirpklskmc)
        counter = counter + checkdir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        counter = counter + checkdirlist(dirpklskdata)
        counter = counter + checkdir(dirpklevtcounter_alldata)

    if domergingmc is True:
        counter = counter + checkdirlist(dirpklmlmc)

    if domergingdata is True:
        counter = counter + checkdirlist(dirpklmldata)

    if domergingperiodsmc is True:
        counter = counter + checkdir(dirpklmltotmc)

    if domergingperiodsdata is True:
        counter = counter + checkdir(dirpklmltotdata)

    if doml is True:
        counter = counter + checkdir(mlout)
        counter = counter + checkdir(mlplot)

    # Application outputs are only checked when NOT continuing after a stop
    if docontinueapplymc is False:
        if doapplymc is True:
            counter = counter + checkdirlist(dirpklskdecmc)

        if domergeapplymc is True:
            counter = counter + checkdirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            counter = counter + checkdirlist(dirpklskdecdata)

        if domergeapplydata is True:
            counter = counter + checkdirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        counter = counter + checkdirlist(dirresultsmc)
        counter = counter + checkdir(dirresultsmctot)

    if dohistomassdata is True:
        counter = counter + checkdirlist(dirresultsdata)
        counter = counter + checkdir(dirresultsdatatot)

    # Abort rather than overwrite existing outputs
    if counter < 0:
        sys.exit()
    # check and create directories
    # Second pass: actually CREATE the directories for activated stages.
    if doconversionmc is True:
        checkmakedirlist(dirpklmc)

    if doconversiondata is True:
        checkmakedirlist(dirpkldata)

    if doskimmingmc is True:
        checkmakedirlist(dirpklskmc)
        checkmakedir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        checkmakedirlist(dirpklskdata)
        checkmakedir(dirpklevtcounter_alldata)

    if domergingmc is True:
        checkmakedirlist(dirpklmlmc)

    if domergingdata is True:
        checkmakedirlist(dirpklmldata)

    if domergingperiodsmc is True:
        checkmakedir(dirpklmltotmc)

    if domergingperiodsdata is True:
        checkmakedir(dirpklmltotdata)

    if doml is True:
        checkmakedir(mlout)
        checkmakedir(mlplot)

    if docontinueapplymc is False:
        if doapplymc is True:
            checkmakedirlist(dirpklskdecmc)

        if domergeapplymc is True:
            checkmakedirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            checkmakedirlist(dirpklskdecdata)

        if domergeapplydata is True:
            checkmakedirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        checkmakedirlist(dirresultsmc)
        checkmakedir(dirresultsmctot)

    if dohistomassdata is True:
        checkmakedirlist(dirresultsdata)
        checkmakedir(dirresultsdatatot)

    # Select processer/analyzer implementations based on the process type;
    # the generic Processer/Analyzer are the defaults.
    proc_class = Processer
    ana_class = Analyzer
    syst_class = Systematics
    if proc_type == "Dhadrons":
        print("Using new feature for Dhadrons")
        proc_class = ProcesserDhadrons
        ana_class = AnalyzerDhadrons
    if proc_type == "Dhadrons_mult":
        print("Using new feature for Dhadrons_mult")
        proc_class = ProcesserDhadrons_mult
        ana_class = AnalyzerDhadrons_mult
    if proc_type == "Dhadrons_jet":
        print("Using new feature for Dhadrons_jet")
        proc_class = ProcesserDhadrons_jet
        ana_class = AnalyzerJet

    mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc")
    mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param,\
                                        "data")
    ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod)
    # Has to be done always period-by-period
    syst_mgr = AnalyzerManager(syst_class, data_param[case], case, typean, True, run_param)

    #perform the analysis flow
    # NOTE(review): shell-script path is relative to the CWD — presumably
    # the tool is always launched from a fixed directory; verify.
    if dodownloadalice == 1:
        subprocess.call("../cplusutilities/Download.sh")

    if doconversionmc == 1:
        mymultiprocessmc.multi_unpack_allperiods()

    if doconversiondata == 1:
        mymultiprocessdata.multi_unpack_allperiods()

    if doskimmingmc == 1:
        mymultiprocessmc.multi_skim_allperiods()

    if doskimmingdata == 1:
        mymultiprocessdata.multi_skim_allperiods()

    if domergingmc == 1:
        mymultiprocessmc.multi_mergeml_allperiods()

    if domergingdata == 1:
        mymultiprocessdata.multi_mergeml_allperiods()

    if domergingperiodsmc == 1:
        mymultiprocessmc.multi_mergeml_allinone()

    if domergingperiodsdata == 1:
        mymultiprocessdata.multi_mergeml_allinone()

    # ML study: one Optimiser per (binmin, binmax) interval; raahp and
    # training_vars are indexed in lockstep with the intervals.
    if doml is True:
        index = 0
        for binmin, binmax in zip(binminarray, binmaxarray):
            myopt = Optimiser(data_param[case], case, typean,
                              data_model[mltype], binmin, binmax,
                              raahp[index], training_vars[index])
            if docorrelation is True:
                myopt.do_corr()
            if dotraining is True:
                myopt.do_train()
            if dotesting is True:
                myopt.do_test()
            if doapplytodatamc is True:
                myopt.do_apply()
            if docrossvalidation is True:
                myopt.do_crossval()
            if dolearningcurve is True:
                myopt.do_learningcurve()
            if doroc is True:
                myopt.do_roc()
            if doroctraintest is True:
                myopt.do_roc_train_test()
            if doplotdistr is True:
                myopt.do_plot_model_pred()
            if doimportance is True:
                myopt.do_importance()
            if doimportanceshap is True:
                myopt.do_importance_shap()
            if dogridsearch is True:
                myopt.do_grid()
            if dobayesianopt is True:
                myopt.do_bayesian_opt()
            if doboundary is True:
                myopt.do_boundary()
            if doefficiencyml is True:
                myopt.do_efficiency()
            if dosignifopt is True:
                myopt.do_significance()
            if doscancuts is True:
                myopt.do_scancuts()
            index = index + 1

    # ML application: run inference and merge the decorated outputs
    if doapplydata is True:
        mymultiprocessdata.multi_apply_allperiods()
    if doapplymc is True:
        mymultiprocessmc.multi_apply_allperiods()
    if domergeapplydata is True:
        mymultiprocessdata.multi_mergeapply_allperiods()
    if domergeapplymc is True:
        mymultiprocessmc.multi_mergeapply_allperiods()
    if dohistomassmc is True:
        mymultiprocessmc.multi_histomass()
    if dohistomassdata is True:
        # After-burner in case of a mult analysis to obtain "correctionsweight.root"
        # for merged-period data
        # pylint: disable=fixme
        # FIXME Can only be run here because result directories are constructed when histomass
        # is run. If this step was independent, histomass would always complain that the
        # result directory already exists.
        mymultiprocessdata.multi_histomass()
    if doefficiency is True:
        mymultiprocessmc.multi_efficiency()
    if doresponse is True:
        mymultiprocessmc.multi_response()

    # Collect all desired analysis steps
    analyze_steps = []
    if dofit is True:
        analyze_steps.append("fit")
    if dosyst is True:
        analyze_steps.append("yield_syst")
    if doeff is True:
        analyze_steps.append("efficiency")
    if dojetstudies is True:
        # Jet studies require fit and efficiency; add them if not already requested
        if dofit is False:
            analyze_steps.append("fit")
        if doeff is False:
            analyze_steps.append("efficiency")
        analyze_steps.append("sideband_sub")
    if dofeeddown is True:
        analyze_steps.append("feeddown")
    if dounfolding is True:
        analyze_steps.append("unfolding")
        analyze_steps.append("unfolding_closure")
    if dojetsystematics is True:
        analyze_steps.append("jetsystematics")
    if docross is True:
        analyze_steps.append("makenormyields")
    if doplots is True:
        analyze_steps.append("plotternormyields")
    if doplotsval is True:
        analyze_steps.append("plottervalidation")

    # Now do the analysis
    ana_mgr.analyze(*analyze_steps)

    # Collect and run ML-related systematics steps
    ml_syst_steps = []
    if dosystprob is True:
        if do_syst_prob_mass:
            ml_syst_steps.append("ml_cutvar_mass")
        if do_syst_prob_eff:
            ml_syst_steps.append("ml_cutvar_eff")
        if do_syst_prob_fit:
            ml_syst_steps.append("ml_cutvar_fit")
        if do_syst_prob_cross:
            ml_syst_steps.append("ml_cutvar_cross")
    if dosystptshape is True:
        ml_syst_steps.append("mcptshape")
    syst_mgr.analyze(*ml_syst_steps)

    # Delete per-period results.
    if clean:
        print("Cleaning")
        if doanaperperiod:
            print("Per-period analysis enabled. Skipping.")
        else:
            if not delete_dirlist(dirresultsmc + dirresultsdata):
                print("Error: Failed to complete cleaning.")

    print("Done")
if domergingperiodsdata is True: counter = counter + checkdir(dirpklmltotdata) if v_max_ncand_merge > 0: counter = counter + checkdir(dirpklmltotdatamax) if counter < 0: sys.exit() # check and create directories if checkiffileexist is False: if doconversionmc is True: if dirpklmc[0] == dirpklmc[-1]: checkmakedir(dirpklmc[0]) else: checkmakedirlist(dirpklmc) if doconversiondata is True: checkmakedirlist(dirpkldata) if doskimmingmc is True: if dirpklskmc[0] == dirpklskmc[-1]: checkmakedir(dirpklskmc[0]) else: checkmakedirlist(dirpklskmc) checkmakedir(dirpklevtcounter_allmc) if doskimmingdata is True: checkmakedirlist(dirpklskdata) checkmakedir(dirpklevtcounter_alldata)
counter = counter + checkdir(dirresultsmctot) if doanalysisdata is True: counter = counter + checkdirlist(dirresultsdata) counter = counter + checkdir(dirresultsdatatot) if counter < 0: if doprobscan is True: sys.exit() else: logger.warning( "Directories already exists (see above), but no new prob scan") else: # check and create directories if doanalysismc is True: checkmakedirlist(dirresultsmc) checkmakedir(dirresultsmctot) if doanalysisdata is True: checkmakedirlist(dirresultsdata) checkmakedir(dirresultsdatatot) ana_class = AnalyserITSUpgrade ana_mgr = AnalyserManager(ana_class, data_param[case], case, typean) # Collect all desired analysis steps analyse_steps = [] if dohistomass is True: analyse_steps.append("invmass") if doefficiency is True: analyse_steps.append("efficiency")
if domergeapplymc is True: counter = counter + checkdirlist(dirpklskdec_mergedmc) if doapplydata is True: counter = counter + checkdirlist(dirpklskdecdata) if domergeapplydata is True: counter = counter + checkdirlist(dirpklskdec_mergeddata) if counter < 0: sys.exit() # check and create directories if checkiffileexist is False: if doapplymc is True: checkmakedirlist(dirpklskdecmc) if domergeapplymc is True: checkmakedirlist(dirpklskdec_mergedmc) if doapplydata is True: checkmakedirlist(dirpklskdecdata) if domergeapplydata is True: checkmakedirlist(dirpklskdec_mergeddata) proc_class = Processer mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], run_param, "mc", checkiffileexist, True) mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], run_param, "data", checkiffileexist,