Esempio n. 1
0
 def do_test(self):
     """Apply the trained models to the test sample and persist the result.

     The output of ``test`` is pickled to
     ``<dirmlout>/testsample_<s_suffix>_mldecision.pkl`` and also handed to
     ``write_tree`` together with the matching ``.root`` path and
     ``self.n_treetest``.
     """
     df_ml_test = test(self.p_mltype, self.p_classname, self.p_trainedmod,
                       self.df_mltest, self.v_train, self.v_sig)
     df_ml_test_to_df = self.dirmlout+"/testsample_%s_mldecision.pkl" % (self.s_suffix)
     df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix)
     # Close the pickle file deterministically instead of leaving the handle
     # to the garbage collector (data may otherwise stay unflushed).
     # NOTE(review): assumes openfile() returns a file object usable as a
     # context manager, like the built-in open() - confirm.
     with openfile(df_ml_test_to_df, "wb") as pkl_file:
         pickle.dump(df_ml_test, pkl_file, protocol=4)
     write_tree(df_ml_test_to_root, self.n_treetest, df_ml_test)
Esempio n. 2
0
 def do_test(self):
     """Run ``test`` on the ML test sample and store the outcome twice.

     The returned dataframe is saved with ``to_pickle`` and then passed to
     ``write_tree``; both output names live under ``self.dirmlout`` and
     embed ``self.s_suffix``.
     """
     scored = test(self.p_mltype,
                   self.p_classname,
                   self.p_trainedmod,
                   self.df_mltest,
                   self.v_train,
                   self.v_sig)

     # Build both destinations before anything is written.
     pkl_name = "/testsample_%s_mldecision.pkl" % (self.s_suffix)
     pkl_path = self.dirmlout + pkl_name
     root_name = "/testsample_%s_mldecision.root" % (self.s_suffix)
     root_path = self.dirmlout + root_name

     scored.to_pickle(pkl_path)
     write_tree(root_path, self.n_treetest, scored)
Esempio n. 3
0
 def do_apply(self):
     """Run ``apply`` on the data and MC dataframes and write both results.

     Each result is handed to ``write_tree`` with a path of the form
     ``<dirmlout>/<data|mc>_<s_suffix>_mldecision.root`` and the tree name
     ``self.n_treetest``.
     """
     # Data is processed first, then MC; nothing is written until both
     # results and both output paths exist.
     scored_data = apply(self.p_mltype, self.p_classname,
                         self.p_trainedmod, self.df_data, self.v_train)
     scored_mc = apply(self.p_mltype, self.p_classname,
                       self.p_trainedmod, self.df_mc, self.v_train)

     data_name = "/data_%s_mldecision.root" % (self.s_suffix)
     data_path = self.dirmlout + data_name
     mc_name = "/mc_%s_mldecision.root" % (self.s_suffix)
     mc_path = self.dirmlout + mc_name

     for out_path, frame in ((data_path, scored_data), (mc_path, scored_mc)):
         write_tree(out_path, self.n_treetest, frame)
Esempio n. 4
0
    def do_test(self):
        """Apply the trained models to the test sample, reusing cached output.

        ``do_train`` is always invoked first. If ``step_done("test")`` is
        true, the previously pickled result is loaded from
        ``self.f_mltest_applied`` and nothing else happens. Otherwise the
        output of ``test`` is stored in ``self.df_mltest_applied``, pickled
        to ``self.f_mltest_applied`` and passed to ``write_tree`` with the
        ``testsample_<s_suffix>_mldecision.root`` path under ``self.dirmlout``.
        """
        self.do_train()
        if self.step_done("test"):
            # NOTE(review): assumes openfile() returns a file object usable as
            # a context manager, like the built-in open() - confirm.
            # NOTE: pickle must only be used on files this workflow produced.
            with openfile(self.f_mltest_applied, "rb") as pkl_file:
                self.df_mltest_applied = pickle.load(pkl_file)
            return

        self.logger.info("Testing")
        self.df_mltest_applied = test(self.p_mltype, self.p_classname,
                                      self.p_trainedmod, self.df_mltest,
                                      self.v_train, self.v_sig)
        df_ml_test_to_root = self.dirmlout + "/testsample_%s_mldecision.root" % (
            self.s_suffix)
        # Close the pickle file deterministically so the data is flushed
        # before the step is considered complete.
        with openfile(self.f_mltest_applied, "wb") as pkl_file:
            pickle.dump(self.df_mltest_applied, pkl_file, protocol=4)
        write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied)
    classifiers = classifiers_scikit + classifiers_xgboost + classifiers_keras
    names = names_scikit + names_xgboost + names_keras
    print(names)
    if dotraining == 1:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, mlout, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test,
                          var_training, var_signal)
        df_ml_test_to_df = mlout + "/testsample_%s_mldecision.pkl" % (suffix)
        df_ml_test_to_root = mlout + "/testsample_%s_mldecision.root" % (
            suffix)
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, tree_name, df_ml_test)
        #plot_overtraining(names, classifiers, suffix, mlplot, x_train, y_train, x_test, y_test)

    if applytodatamc == 1:
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = mlout + "/data_%s_mldecision.root" % (suffix)
        df_mc_to_root = mlout + "/mc_%s_mldecision.root" % (suffix)
        write_tree(df_data_to_root, tree_name, df_data)
        write_tree(df_mc_to_root, tree_name, df_mc)

    if docrossvalidation == 1:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
Esempio n. 6
0
def doclassification_regression(conf):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """Run the classification/regression workflow selected by the run config.

    The run configuration supplies one flag per stage (sample loading,
    correlation plots, standardisation, PCA, training, testing, application
    to data/MC, cross validation, learning curves, ROC, decision boundaries,
    feature importance, regression plots, grid search, significance
    optimisation). A stage is executed only when its flag equals 1.

    Per-case settings (input files, tree name, variable lists, selections)
    are read from ``get_database_ml_parameters()``. Three directories named
    ``dataframes_<suffix>``, ``plots_<suffix>`` and ``output_<suffix>`` are
    passed to ``checkdir`` before any stage runs.

    Args:
        conf: object providing ``get_run_config()`` and
            ``get_model_config()``.
    """
    logger = get_logger()
    logger.info("Start classification_regression run")

    run_config = conf.get_run_config()
    model_config = conf.get_model_config()

    mltype = run_config['mltype']
    mlsubtype = run_config['mlsubtype']
    case = run_config['case']
    loadsampleoption = run_config['loadsampleoption']
    binmin = run_config['binmin']
    binmax = run_config['binmax']
    rnd_shuffle = run_config['rnd_shuffle']
    nevt_sig = run_config['nevt_sig']
    nevt_bkg = run_config['nevt_bkg']
    test_frac = run_config['test_frac']
    rnd_splt = run_config['rnd_splt']
    docorrelation = run_config['docorrelation']
    dostandard = run_config['dostandard']
    dopca = run_config['dopca']
    dotraining = run_config['dotraining']
    dotesting = run_config['dotesting']
    applytodatamc = run_config['applytodatamc']
    docrossvalidation = run_config['docrossvalidation']
    dolearningcurve = run_config['dolearningcurve']
    doROC = run_config['doROC']
    doboundary = run_config['doboundary']
    doimportance = run_config['doimportance']
    dopltregressionxy = run_config['dopltregressionxy']
    dogridsearch = run_config['dogridsearch']
    dosignifopt = run_config['dosignifopt']
    nkfolds = run_config['nkfolds']
    ncores = run_config['ncores']

    data = get_database_ml_parameters()
    filesig, filebkg = data[case]["sig_bkg_files"]
    filedata, filemc = data[case]["data_mc_files"]
    trename = data[case]["tree_name"]
    var_all = data[case]["var_all"]
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    var_training = data[case]["var_training"]
    var_target = data[case]["var_target"]
    var_corr_x, var_corr_y = data[case]["var_correlation"]
    var_boundaries = data[case]["var_boundaries"]
    var_binning = data[case]['var_binning']
    presel_reco = data[case]["presel_reco"]

    summary_string = f"#sig events: {nevt_sig}\n#bkg events: {nevt_bkg}\nmltype: {mltype}\n" \
                     f"mlsubtype: {mlsubtype}\ncase: {case}"
    logger.debug(summary_string)

    # The suffix tags every directory and output file of this run.
    string_selection = createstringselection(var_binning, binmin, binmax)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"
    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    filesig = os.path.join(DATA_PREFIX, filesig)
    filebkg = os.path.join(DATA_PREFIX, filebkg)
    filedata = os.path.join(DATA_PREFIX, filedata)
    filemc = os.path.join(DATA_PREFIX, filemc)

    # Stays empty unless the training stage runs; testing and application
    # then receive an empty model list.
    trainedmodels = []

    # NOTE: every later stage uses the samples created in this branch
    # (x_train is needed unconditionally to build the Keras models), so the
    # function only works with loadsampleoption == 1.
    if loadsampleoption == 1:
        df_sig = getdataframe(filesig, trename, var_all)
        df_bkg = getdataframe(filebkg, trename, var_all)
        if presel_reco is not None:
            df_sig = df_sig.query(presel_reco)
            df_bkg = df_bkg.query(presel_reco)
        df_sig = filterdataframe_singlevar(df_sig, var_binning, binmin, binmax)
        df_bkg = filterdataframe_singlevar(df_bkg, var_binning, binmin, binmax)
        _, df_ml_test, df_sig_train, df_bkg_train, _, _, \
        x_train, y_train, x_test, y_test = \
            create_mlsamples(df_sig, df_bkg, sel_signal, sel_bkg, rnd_shuffle,
                             var_signal, var_training,
                             nevt_sig, nevt_bkg, test_frac, rnd_splt)

    if docorrelation == 1:
        do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x,
                       var_corr_y, plotdir)

    if dostandard == 1:
        x_train = getdataframe_standardised(x_train)

    if dopca == 1:
        n_pca = 9
        x_train, pca = get_pcadataframe_pca(x_train, n_pca)
        plotvariance_pca(pca, plotdir)

    classifiers_scikit, names_scikit = getclf_scikit(model_config)

    classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)

    # The second argument is the number of training columns.
    classifiers_keras, names_keras = getclf_keras(model_config,
                                                  len(x_train.columns))

    classifiers = classifiers_scikit + classifiers_xgboost + classifiers_keras
    names = names_scikit + names_xgboost + names_keras

    if dotraining == 1:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test,
                          var_training, var_signal)
        df_ml_test_to_df = f"{output}/testsample_{suffix}_mldecision.pkl"
        df_ml_test_to_root = f"{output}/testsample_{suffix}_mldecision.root"
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, trename, df_ml_test)

    if applytodatamc == 1:
        df_data = getdataframe(filedata, trename, var_all)
        df_mc = getdataframe(filemc, trename, var_all)
        if presel_reco is not None:
            df_mc = df_mc.query(presel_reco)
            df_data = df_data.query(presel_reco)
        df_data = filterdataframe_singlevar(df_data, var_binning, binmin,
                                            binmax)
        df_mc = filterdataframe_singlevar(df_mc, var_binning, binmin, binmax)
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = f"{output}/data_{suffix}_mldecision.root"
        df_mc_to_root = f"{output}/mc_{suffix}_mldecision.root"
        write_tree(df_data_to_root, trename, df_data)
        write_tree(df_mc_to_root, trename, df_mc)

    if docrossvalidation == 1:
        # Stays an empty list when mltype matches neither branch below.
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
                names, classifiers, x_train, y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train,
                                             y_train, nkfolds, ncores)
        plot_cross_validation_mse(names, df_scores, suffix, plotdir)

    if dolearningcurve == 1:
        #         confusion(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)
        npoints = 10
        plot_learning_curves(names, classifiers, suffix, plotdir, x_train,
                             y_train, npoints)

    if doROC == 1:
        precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds,
                         plotdir)

    if doboundary == 1:
        # Fresh models restricted to the var_boundaries columns. The factory
        # takes the model configuration, exactly as in the main call above
        # (it was previously handed the mltype value by mistake).
        classifiers_scikit_2var, names_2var = getclf_scikit(model_config)
        classifiers_keras_2var, names_keras_2var = getclf_keras(
            model_config, 2)
        classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
        names_2var = names_2var + names_keras_2var
        x_test_boundary = x_test[var_boundaries]
        trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary,
                                 y_test)
        decisionboundaries(names_2var, trainedmodels_2var, suffix + "2var",
                           x_test_boundary, y_test, plotdir)

    if doimportance == 1:
        # Only the scikit and xgboost models are passed on; Keras is left out.
        importanceplotall(var_training, names_scikit + names_xgboost,
                          classifiers_scikit + classifiers_xgboost, suffix,
                          plotdir)

    if dopltregressionxy == 1:
        plotdistributiontarget(names, df_ml_test, var_target, suffix, plotdir)
        plotscattertarget(names, df_ml_test, var_target, suffix, plotdir)

    if dogridsearch == 1:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        perform_plot_gridsearch(names_cv, dfscore, par_grid_cv,
                                par_grid_cv_keys, var_param, plotdir, suffix,
                                0.1)

    if dosignifopt == 1:
        logger.info("Doing significance optimization")
        # df_ml_test (with predictions), df_data and df_mc only exist in the
        # required form when the three stages below have run.
        if dotraining and dotesting and applytodatamc:
            if (mlsubtype == "HFmeson") and case in ("Dsnew", "Lcnew", "Dzero",
                                                     "Dplus", "Dstar"):
                df_data_opt = df_data.query(sel_bkg)
                df_data_opt = shuffle(df_data_opt, random_state=rnd_shuffle)
                study_signif(case, names, [binmin, binmax], filemc, filedata,
                             df_mc, df_ml_test, df_data_opt, suffix, plotdir)
            else:
                logger.error(
                    "Optimisation is not implemented for this classification problem."
                )
        else:
            logger.error(
                "Training, testing and applytodata flags must be set to 1")