def do_bayesian_opt(self):
        self.logger.info("Do Bayesian optimisation for all classifiers")
        _, names_scikit, _, bayes_opt_scikit = getclf_scikit(self.db_model)
        _, names_xgboost, _, bayes_opt_xgboost = getclf_xgboost(self.db_model)
        _, names_keras, _, bayes_opt_keras = getclf_keras(
            self.db_model, len(self.df_xtrain.columns))
        clfs_all = bayes_opt_scikit + bayes_opt_xgboost + bayes_opt_keras
        clfs_names_all = names_scikit + names_xgboost + names_keras

        clfs_names_all = [
            name for name, clf in zip(clfs_names_all, clfs_all) if clf
        ]
        clfs_all = [clf for clf in clfs_all if clf]

        out_dirs = [os.path.join(self.dirmlplot, "bayesian_opt", name, f"{name}{self.s_suffix}") \
                for name in clfs_names_all]
        if checkdirlist(out_dirs):
            # Only draw results if any can be found
            self.logger.warning("Not overwriting anything, just plotting if possible " \
                    "Please remove corresponding directories if you are certain you want to do " \
                    "grid search again")
            return
        checkmakedirlist(out_dirs)

        # Now, do it
        for opt, out_dir in zip(clfs_all, out_dirs):
            opt.x_train = self.df_xtrain
            opt.y_train = self.df_ytrain

            opt.optimise(ncores=self.p_ncorescross)
            opt.save(out_dir)
            opt.plot(out_dir)
Beispiel #2
0
    def do_bayesian_opt(self):
        if self.step_done("bayesian_opt"):
            return
        self.logger.info("Do Bayesian optimisation for all classifiers")
        _, names_scikit, _, bayes_opt_scikit = getclf_scikit(self.db_model)
        _, names_xgboost, _, bayes_opt_xgboost = getclf_xgboost(self.db_model)
        _, names_keras, _, bayes_opt_keras = getclf_keras(
            self.db_model, len(self.df_xtrain.columns))
        clfs_all = bayes_opt_scikit + bayes_opt_xgboost + bayes_opt_keras
        clfs_names_all = names_scikit + names_xgboost + names_keras

        clfs_names_all = [
            name for name, clf in zip(clfs_names_all, clfs_all) if clf
        ]
        clfs_all = [clf for clf in clfs_all if clf]

        out_dirs = [os.path.join(self.dirmlplot, "bayesian_opt", name, f"{name}{self.s_suffix}") \
                for name in clfs_names_all]
        checkmakedirlist(out_dirs)

        # Now, do it
        for opt, out_dir in zip(clfs_all, out_dirs):
            opt.x_train = self.df_xtrain
            opt.y_train = self.df_ytrain

            opt.optimise(ncores=self.p_ncorescross)
            opt.save(out_dir)
            opt.plot(out_dir)
Beispiel #3
0
    def do_grid(self):
        self.logger.info("Do grid search")
        clfs_scikit, names_scikit, grid_params_scikit = getclf_scikit(
            self.db_model)
        clfs_xgboost, names_xgboost, grid_params_xgboost = getclf_xgboost(
            self.db_model)
        clfs_keras, names_keras, grid_params_keras = getclf_keras(
            self.db_model, len(self.df_xtrain.columns))
        clfs_grid_params_all = grid_params_scikit + grid_params_xgboost + grid_params_keras
        clfs_all = clfs_scikit + clfs_xgboost + clfs_keras
        clfs_names_all = names_scikit + names_xgboost + names_keras

        clfs_all = [
            clf for clf, gps in zip(clfs_all, clfs_grid_params_all) if gps
        ]
        clfs_names_all = [
            name for name, gps in zip(clfs_names_all, clfs_grid_params_all)
            if gps
        ]
        clfs_grid_params_all = [gps for gps in clfs_grid_params_all if gps]



        out_dirs = [os.path.join(self.dirmlplot, "grid_search", name, f"{name}{self.s_suffix}") \
                for name in clfs_names_all]
        if checkdirlist(out_dirs):
            # Only draw results if any can be found
            self.logger.warning("Not overwriting anything, just plotting again what was done " \
                    "before and returning. Please remove corresponding directories " \
                    "if you are certain you want do do grid search again")
            perform_plot_gridsearch(clfs_names_all, out_dirs)
            return
        checkmakedirlist(out_dirs)

        do_gridsearch(clfs_names_all, clfs_all, clfs_grid_params_all,
                      self.df_xtrain, self.df_ytrain, self.p_nkfolds, out_dirs,
                      self.p_ncorescross)
        perform_plot_gridsearch(clfs_names_all, out_dirs)
        counter = counter + checkdir(dirresultsdatatot)

    if dovalhistodata is True:
        counter = counter + checkdirlist(dirvaldata)
        counter = counter + checkdir(dirvaldatamerged)

    if dovalhistomc is True:
        counter = counter + checkdirlist(dirvalmc)
        counter = counter + checkdir(dirvalmcmerged)

    if counter < 0:
        exit()
    # check and create directories

    if doconversionmc is True:
        checkmakedirlist(dirpklmc)

    if doconversiondata is True:
        checkmakedirlist(dirpkldata)

    if doskimmingmc is True:
        checkmakedirlist(dirpklskmc)
        checkmakedir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        checkmakedirlist(dirpklskdata)
        checkmakedir(dirpklevtcounter_alldata)

    if domergingmc is True:
        checkmakedirlist(dirpklmlmc)
Beispiel #5
0
def do_entire_analysis(data_config: dict, data_param: dict, data_param_overwrite: dict, # pylint: disable=too-many-locals, too-many-statements, too-many-branches
                       data_model: dict, run_param: dict, clean: bool):

    # Disable any graphical stuff. No TCanvases opened and shown by default
    gROOT.SetBatch(True)

    logger = get_logger()
    logger.info("Do analysis chain")

    # If we are here we are interested in the very first key in the parameters database
    case = list(data_param.keys())[0]

    # Update database accordingly if needed
    update_config(data_param, data_config, data_param_overwrite)

    dodownloadalice = data_config["download"]["alice"]["activate"]
    doconversionmc = data_config["conversion"]["mc"]["activate"]
    doconversiondata = data_config["conversion"]["data"]["activate"]
    domergingmc = data_config["merging"]["mc"]["activate"]
    domergingdata = data_config["merging"]["data"]["activate"]
    doskimmingmc = data_config["skimming"]["mc"]["activate"]
    doskimmingdata = data_config["skimming"]["data"]["activate"]
    domergingperiodsmc = data_config["mergingperiods"]["mc"]["activate"]
    domergingperiodsdata = data_config["mergingperiods"]["data"]["activate"]
    doml = data_config["ml_study"]["activate"]
    docorrelation = data_config["ml_study"]['docorrelation']
    dotraining = data_config["ml_study"]['dotraining']
    dotesting = data_config["ml_study"]['dotesting']
    doapplytodatamc = data_config["ml_study"]['doapplytodatamc']
    docrossvalidation = data_config["ml_study"]['docrossvalidation']
    dolearningcurve = data_config["ml_study"]['dolearningcurve']
    doroc = data_config["ml_study"]['doroc']
    doroctraintest = data_config["ml_study"]['doroctraintest']
    doboundary = data_config["ml_study"]['doboundary']
    doimportance = data_config["ml_study"]['doimportance']
    doimportanceshap = data_config["ml_study"]['doimportanceshap']
    dogridsearch = data_config["ml_study"]['dogridsearch']
    dobayesianopt = data_config["ml_study"]['dobayesianopt']
    doefficiencyml = data_config["ml_study"]['doefficiency']
    dosignifopt = data_config["ml_study"]['dosignifopt']
    doscancuts = data_config["ml_study"]["doscancuts"]
    doplotdistr = data_config["ml_study"]["doplotdistr"]
    doapplydata = data_config["mlapplication"]["data"]["doapply"]
    doapplymc = data_config["mlapplication"]["mc"]["doapply"]
    domergeapplydata = data_config["mlapplication"]["data"]["domergeapply"]
    domergeapplymc = data_config["mlapplication"]["mc"]["domergeapply"]
    docontinueapplydata = data_config["mlapplication"]["data"]["docontinueafterstop"]
    docontinueapplymc = data_config["mlapplication"]["mc"]["docontinueafterstop"]
    dohistomassmc = data_config["analysis"]["mc"]["histomass"]
    dohistomassdata = data_config["analysis"]["data"]["histomass"]
    doefficiency = data_config["analysis"]["mc"]["efficiency"]
    doresponse = data_config["analysis"]["mc"]["response"]
    dofeeddown = data_config["analysis"]["mc"]["feeddown"]
    dounfolding = data_config["analysis"]["mc"]["dounfolding"]
    dojetsystematics = data_config["analysis"]["data"]["dojetsystematics"]
    dofit = data_config["analysis"]["dofit"]
    doeff = data_config["analysis"]["doeff"]
    docross = data_config["analysis"]["docross"]
    doplotsval = data_config["analysis"]["doplotsval"]
    doplots = data_config["analysis"]["doplots"]
    dosyst = data_config["analysis"]["dosyst"]
    dosystprob = data_config["systematics"]["cutvar"]["activate"]
    do_syst_prob_mass = data_config["systematics"]["cutvar"]["probvariationmass"]
    do_syst_prob_eff = data_config["systematics"]["cutvar"]["probvariationeff"]
    do_syst_prob_fit = data_config["systematics"]["cutvar"]["probvariationfit"]
    do_syst_prob_cross = data_config["systematics"]["cutvar"]["probvariationcross"]
    dosystptshape = data_config["systematics"]["mcptshape"]["activate"]
    doanaperperiod = data_config["analysis"]["doperperiod"]
    typean = data_config["analysis"]["type"]

    dojetstudies = data_config["analysis"]["dojetstudies"]

    dirpklmc = data_param[case]["multi"]["mc"]["pkl"]
    dirpklevtcounter_allmc = data_param[case]["multi"]["mc"]["pkl_evtcounter_all"]
    dirpklskmc = data_param[case]["multi"]["mc"]["pkl_skimmed"]
    dirpklmlmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
    dirpkldata = data_param[case]["multi"]["data"]["pkl"]
    dirpklevtcounter_alldata = data_param[case]["multi"]["data"]["pkl_evtcounter_all"]
    dirpklskdata = data_param[case]["multi"]["data"]["pkl_skimmed"]
    dirpklmldata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotdata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
    dirpklskdecmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_dec"]
    dirpklskdec_mergedmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_decmerged"]
    dirpklskdecdata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_dec"]
    dirpklskdec_mergeddata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_decmerged"]

    dirresultsdata = data_param[case]["analysis"][typean]["data"]["results"]
    dirresultsmc = data_param[case]["analysis"][typean]["mc"]["results"]
    dirresultsdatatot = data_param[case]["analysis"][typean]["data"]["resultsallp"]
    dirresultsmctot = data_param[case]["analysis"][typean]["mc"]["resultsallp"]

    binminarray = data_param[case]["ml"]["binmin"]
    binmaxarray = data_param[case]["ml"]["binmax"]
    raahp = data_param[case]["ml"]["opt"]["raahp"]
    mltype = data_param[case]["ml"]["mltype"]
    training_vars = data_param[case]["variables"]["var_training"]

    mlout = data_param[case]["ml"]["mlout"]
    mlplot = data_param[case]["ml"]["mlplot"]

    proc_type = data_param[case]["analysis"][typean]["proc_type"]

    #creating folder if not present
    counter = 0
    if doconversionmc is True:
        counter = counter + checkdirlist(dirpklmc)

    if doconversiondata is True:
        counter = counter + checkdirlist(dirpkldata)

    if doskimmingmc is True:
        checkdirlist(dirpklskmc)
        counter = counter + checkdir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        counter = counter + checkdirlist(dirpklskdata)
        counter = counter + checkdir(dirpklevtcounter_alldata)

    if domergingmc is True:
        counter = counter + checkdirlist(dirpklmlmc)

    if domergingdata is True:
        counter = counter + checkdirlist(dirpklmldata)

    if domergingperiodsmc is True:
        counter = counter + checkdir(dirpklmltotmc)

    if domergingperiodsdata is True:
        counter = counter + checkdir(dirpklmltotdata)

    if doml is True:
        counter = counter + checkdir(mlout)
        counter = counter + checkdir(mlplot)

    if docontinueapplymc is False:
        if doapplymc is True:
            counter = counter + checkdirlist(dirpklskdecmc)

        if domergeapplymc is True:
            counter = counter + checkdirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            counter = counter + checkdirlist(dirpklskdecdata)

        if domergeapplydata is True:
            counter = counter + checkdirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        counter = counter + checkdirlist(dirresultsmc)
        counter = counter + checkdir(dirresultsmctot)

    if dohistomassdata is True:
        counter = counter + checkdirlist(dirresultsdata)
        counter = counter + checkdir(dirresultsdatatot)

    if counter < 0:
        sys.exit()
    # check and create directories

    if doconversionmc is True:
        checkmakedirlist(dirpklmc)

    if doconversiondata is True:
        checkmakedirlist(dirpkldata)

    if doskimmingmc is True:
        checkmakedirlist(dirpklskmc)
        checkmakedir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        checkmakedirlist(dirpklskdata)
        checkmakedir(dirpklevtcounter_alldata)

    if domergingmc is True:
        checkmakedirlist(dirpklmlmc)

    if domergingdata is True:
        checkmakedirlist(dirpklmldata)

    if domergingperiodsmc is True:
        checkmakedir(dirpklmltotmc)

    if domergingperiodsdata is True:
        checkmakedir(dirpklmltotdata)

    if doml is True:
        checkmakedir(mlout)
        checkmakedir(mlplot)

    if docontinueapplymc is False:
        if doapplymc is True:
            checkmakedirlist(dirpklskdecmc)

        if domergeapplymc is True:
            checkmakedirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            checkmakedirlist(dirpklskdecdata)

        if domergeapplydata is True:
            checkmakedirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        checkmakedirlist(dirresultsmc)
        checkmakedir(dirresultsmctot)

    if dohistomassdata is True:
        checkmakedirlist(dirresultsdata)
        checkmakedir(dirresultsdatatot)

    proc_class = Processer
    ana_class = Analyzer
    syst_class = Systematics
    if proc_type == "Dhadrons":
        print("Using new feature for Dhadrons")
        proc_class = ProcesserDhadrons
        ana_class = AnalyzerDhadrons
    if proc_type == "Dhadrons_mult":
        print("Using new feature for Dhadrons_mult")
        proc_class = ProcesserDhadrons_mult
        ana_class = AnalyzerDhadrons_mult
    if proc_type == "Dhadrons_jet":
        print("Using new feature for Dhadrons_jet")
        proc_class = ProcesserDhadrons_jet
        ana_class = AnalyzerJet

    mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc")
    mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param,\
                                        "data")
    ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod)
    # Has to be done always period-by-period
    syst_mgr = AnalyzerManager(syst_class, data_param[case], case, typean, True, run_param)

    #perform the analysis flow
    if dodownloadalice == 1:
        subprocess.call("../cplusutilities/Download.sh")

    if doconversionmc == 1:
        mymultiprocessmc.multi_unpack_allperiods()

    if doconversiondata == 1:
        mymultiprocessdata.multi_unpack_allperiods()

    if doskimmingmc == 1:
        mymultiprocessmc.multi_skim_allperiods()

    if doskimmingdata == 1:
        mymultiprocessdata.multi_skim_allperiods()

    if domergingmc == 1:
        mymultiprocessmc.multi_mergeml_allperiods()

    if domergingdata == 1:
        mymultiprocessdata.multi_mergeml_allperiods()

    if domergingperiodsmc == 1:
        mymultiprocessmc.multi_mergeml_allinone()

    if domergingperiodsdata == 1:
        mymultiprocessdata.multi_mergeml_allinone()

    if doml is True:
        index = 0
        for binmin, binmax in zip(binminarray, binmaxarray):
            myopt = Optimiser(data_param[case], case, typean,
                              data_model[mltype], binmin, binmax,
                              raahp[index], training_vars[index])
            if docorrelation is True:
                myopt.do_corr()
            if dotraining is True:
                myopt.do_train()
            if dotesting is True:
                myopt.do_test()
            if doapplytodatamc is True:
                myopt.do_apply()
            if docrossvalidation is True:
                myopt.do_crossval()
            if dolearningcurve is True:
                myopt.do_learningcurve()
            if doroc is True:
                myopt.do_roc()
            if doroctraintest is True:
                myopt.do_roc_train_test()
            if doplotdistr is True:
                myopt.do_plot_model_pred()
            if doimportance is True:
                myopt.do_importance()
            if doimportanceshap is True:
                myopt.do_importance_shap()
            if dogridsearch is True:
                myopt.do_grid()
            if dobayesianopt is True:
                myopt.do_bayesian_opt()
            if doboundary is True:
                myopt.do_boundary()
            if doefficiencyml is True:
                myopt.do_efficiency()
            if dosignifopt is True:
                myopt.do_significance()
            if doscancuts is True:
                myopt.do_scancuts()
            index = index + 1

    if doapplydata is True:
        mymultiprocessdata.multi_apply_allperiods()
    if doapplymc is True:
        mymultiprocessmc.multi_apply_allperiods()
    if domergeapplydata is True:
        mymultiprocessdata.multi_mergeapply_allperiods()
    if domergeapplymc is True:
        mymultiprocessmc.multi_mergeapply_allperiods()
    if dohistomassmc is True:
        mymultiprocessmc.multi_histomass()
    if dohistomassdata is True:
        # After-burner in case of a mult analysis to obtain "correctionsweight.root"
        # for merged-period data
        # pylint: disable=fixme
        # FIXME Can only be run here because result directories are constructed when histomass
        #       is run. If this step was independent, histomass would always complain that the
        #       result directory already exists.
        mymultiprocessdata.multi_histomass()
    if doefficiency is True:
        mymultiprocessmc.multi_efficiency()
    if doresponse is True:
        mymultiprocessmc.multi_response()

    # Collect all desired analysis steps
    analyze_steps = []
    if dofit is True:
        analyze_steps.append("fit")
    if dosyst is True:
        analyze_steps.append("yield_syst")
    if doeff is True:
        analyze_steps.append("efficiency")
    if dojetstudies is True:
        if dofit is False:
            analyze_steps.append("fit")
        if doeff is False:
            analyze_steps.append("efficiency")
        analyze_steps.append("sideband_sub")
    if dofeeddown is True:
        analyze_steps.append("feeddown")
    if dounfolding is True:
        analyze_steps.append("unfolding")
        analyze_steps.append("unfolding_closure")
    if dojetsystematics is True:
        analyze_steps.append("jetsystematics")
    if docross is True:
        analyze_steps.append("makenormyields")
    if doplots is True:
        analyze_steps.append("plotternormyields")
    if doplotsval is True:
        analyze_steps.append("plottervalidation")

    # Now do the analysis
    ana_mgr.analyze(*analyze_steps)

    ml_syst_steps = []
    if dosystprob is True:
        if do_syst_prob_mass:
            ml_syst_steps.append("ml_cutvar_mass")
        if do_syst_prob_eff:
            ml_syst_steps.append("ml_cutvar_eff")
        if do_syst_prob_fit:
            ml_syst_steps.append("ml_cutvar_fit")
        if do_syst_prob_cross:
            ml_syst_steps.append("ml_cutvar_cross")
    if dosystptshape is True:
        ml_syst_steps.append("mcptshape")
    syst_mgr.analyze(*ml_syst_steps)

    # Delete per-period results.
    if clean:
        print("Cleaning")
        if doanaperperiod:
            print("Per-period analysis enabled. Skipping.")
        else:
            if not delete_dirlist(dirresultsmc + dirresultsdata):
                print("Error: Failed to complete cleaning.")

    print("Done")
    if domergingperiodsdata is True:
        counter = counter + checkdir(dirpklmltotdata)
        if v_max_ncand_merge > 0:
            counter = counter + checkdir(dirpklmltotdatamax)

    if counter < 0:
        sys.exit()

    # check and create directories
    if checkiffileexist is False:
        if doconversionmc is True:
            if dirpklmc[0] == dirpklmc[-1]:
                checkmakedir(dirpklmc[0])
            else:
                checkmakedirlist(dirpklmc)

        if doconversiondata is True:
            checkmakedirlist(dirpkldata)

    if doskimmingmc is True:
        if dirpklskmc[0] == dirpklskmc[-1]:
            checkmakedir(dirpklskmc[0])
        else:
            checkmakedirlist(dirpklskmc)
        checkmakedir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        checkmakedirlist(dirpklskdata)
        checkmakedir(dirpklevtcounter_alldata)
Beispiel #7
0
        counter = counter + checkdir(dirresultsmctot)

    if doanalysisdata is True:
        counter = counter + checkdirlist(dirresultsdata)
        counter = counter + checkdir(dirresultsdatatot)

    if counter < 0:
        if doprobscan is True:
            sys.exit()
        else:
            logger.warning(
                "Directories already exists (see above), but no new prob scan")
    else:
        # check and create directories
        if doanalysismc is True:
            checkmakedirlist(dirresultsmc)
            checkmakedir(dirresultsmctot)

        if doanalysisdata is True:
            checkmakedirlist(dirresultsdata)
            checkmakedir(dirresultsdatatot)

    ana_class = AnalyserITSUpgrade
    ana_mgr = AnalyserManager(ana_class, data_param[case], case, typean)

    # Collect all desired analysis steps
    analyse_steps = []
    if dohistomass is True:
        analyse_steps.append("invmass")
    if doefficiency is True:
        analyse_steps.append("efficiency")
        if domergeapplymc is True:
            counter = counter + checkdirlist(dirpklskdec_mergedmc)

        if doapplydata is True:
            counter = counter + checkdirlist(dirpklskdecdata)

        if domergeapplydata is True:
            counter = counter + checkdirlist(dirpklskdec_mergeddata)

    if counter < 0:
        sys.exit()

    # check and create directories
    if checkiffileexist is False:
        if doapplymc is True:
            checkmakedirlist(dirpklskdecmc)

        if domergeapplymc is True:
            checkmakedirlist(dirpklskdec_mergedmc)

        if doapplydata is True:
            checkmakedirlist(dirpklskdecdata)

        if domergeapplydata is True:
            checkmakedirlist(dirpklskdec_mergeddata)

    proc_class = Processer
    mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case],
                                      run_param, "mc", checkiffileexist, True)
    mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case],
                                        run_param, "data", checkiffileexist,