Ejemplo n.º 1
0
    def do_importance(self):
        """Run the "importance" step unless it is already marked as done.

        When ``self.step_done("importance")`` is falsy, this first calls
        ``self.do_train()``, logs a message, and then hands the training
        variables, classifier names, classifiers, suffix and plot directory
        held on the instance to ``importanceplotall``. Otherwise it does
        nothing. Always returns ``None``.
        """
        if not self.step_done("importance"):
            # The importance plots are made after do_train() has been called.
            self.do_train()

            self.logger.info("Do simple importance")
            importanceplotall(
                self.v_train,
                self.p_classname,
                self.p_class,
                self.s_suffix,
                self.dirmlplot,
            )
Ejemplo n.º 2
0
 def do_importance(self):
     """Forward the instance's plotting inputs to ``importanceplotall``.

     The arguments passed are, in order: ``v_train``, ``p_classname``,
     ``p_class``, ``s_suffix`` and ``dirmlplot``. The return value of
     ``importanceplotall`` is discarded.
     """
     plot_inputs = (
         self.v_train,
         self.p_classname,
         self.p_class,
         self.s_suffix,
         self.dirmlplot,
     )
     importanceplotall(*plot_inputs)
Ejemplo n.º 3
0
def _decode_selection(raw):
    """Convert a form-encoded selection string into a query expression.

    Commas are turned into spaces and the HTML entities ``&lt;`` / ``&gt;``
    are turned into ``<`` / ``>``. All other text is kept unchanged.
    """
    return raw.replace(',', ' ').replace('&lt;', '<').replace('&gt;', '>')


def post_form(req):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """Run the ML steps selected in a submitted form and render the results.

    The form fields are read with ``get_form``, the numeric and boolean
    settings are collected in a ``run_config`` dictionary used to build a
    ``Configuration``, signal and background dataframes are loaded and
    filtered, and each step enabled by a ``do*`` flag is executed. The image
    objects returned by the steps (``None`` for steps that did not run) are
    passed to the ``display.html`` template.

    Args:
        req: request object handed to ``get_form`` to look up form fields.

    Returns:
        The result of rendering the ``display.html`` template.
    """

    # Collect configuration in a dictionary for further processing
    run_config = {}

    mltype = "BinaryClassification"
    run_config["mltype"] = mltype
    case = get_form(req, "case")
    run_config["case"] = case
    filesig = get_form(req, "filesig")
    filebkg = get_form(req, "filebkg")
    trename = get_form(req, "tree_name")
    var_all_str = get_form(req, "var_all")
    var_all = var_all_str.split(',')
    var_signal = get_form(req, "var_signal")
    sel_signal = get_form(req, "sel_signal")
    sel_bkg_str = get_form(req, "sel_bkg")
    # The previous character-by-character loop compared single characters
    # against '&gt;', which can never match, so entities were never decoded.
    sel_bkg = _decode_selection(sel_bkg_str)

    var_training_str = get_form(req, "var_training")
    var_training = var_training_str.split(',')
    var_corr_x_str = get_form(req, "var_correlation_x")
    var_corr_y_str = get_form(req, "var_correlation_y")
    var_corr_x = var_corr_x_str.split(',')
    var_corr_y = var_corr_y_str.split(',')
    var_binning = get_form(req, "var_binning")
    var_binning_min = float(get_form(req, 'var_binning_min', var_type=float))
    var_binning_max = float(get_form(req, 'var_binning_max', var_type=float))
    run_config["binmin"] = var_binning_min
    run_config["binmax"] = var_binning_max
    presel_reco_str = get_form(req, "presel_reco")

    # The literal string 'None' means "no preselection".
    if presel_reco_str == 'None':
        presel_reco = None
    else:
        presel_reco = _decode_selection(presel_reco_str)

    activate_scikit = get_form(req, 'activate_scikit', var_type=bool)
    activate_xgboost = get_form(req, 'activate_xgboost', var_type=bool)
    activate_keras = get_form(req, 'activate_keras', var_type=bool)

    docorrelation = get_form(req, 'docorrelation', var_type=bool)
    run_config["docorrelation"] = docorrelation
    dotraining = get_form(req, 'dotraining', var_type=bool)
    run_config["dotraining"] = dotraining
    doROC = get_form(req, 'doROC', var_type=bool)
    run_config["doROC"] = doROC
    dolearningcurve = get_form(req, 'dolearningcurve', var_type=bool)
    run_config["dolearningcurve"] = dolearningcurve
    docrossvalidation = get_form(req, 'docrossvalidation', var_type=bool)
    run_config["docrossvalidation"] = docrossvalidation
    doimportance = get_form(req, 'doimportance', var_type=bool)
    run_config["doimportance"] = doimportance
    dogridsearch = get_form(req, 'dogridsearch', var_type=bool)
    run_config["dogridsearch"] = dogridsearch

    rnd_shuffle = int(get_form(req, 'rnd_shuffle', var_type=int))
    run_config["rnd_shuffle"] = rnd_shuffle
    nevt_sig = int(get_form(req, 'nevt_sig', var_type=int))
    run_config["nevt_sig"] = nevt_sig
    nevt_bkg = int(get_form(req, 'nevt_bkg', var_type=int))
    run_config["nevt_bkg"] = nevt_bkg
    test_frac = float(get_form(req, 'test_frac', var_type=float))
    run_config["test_frac"] = test_frac
    rnd_splt = int(get_form(req, 'rnd_splt', var_type=int))
    run_config["rnd_splt"] = rnd_splt
    nkfolds = int(get_form(req, 'nkfolds', var_type=int))
    run_config["nkfolds"] = nkfolds
    ncores = int(get_form(req, 'ncores', var_type=int))
    run_config["ncores"] = ncores

    data = get_database_ml_parameters()

    # Construct Configuration object from run_config
    conf = Configuration(run_config_input=run_config)
    conf.configure()

    model_config = conf.get_model_config()

    string_selection = createstringselection(var_binning, var_binning_min,
                                             var_binning_max)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"

    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    # Per-backend lists stay empty for backends that are not activated, so
    # the importance step below can concatenate them unconditionally.
    classifiers = []
    classifiers_scikit = []
    classifiers_xgboost = []
    classifiers_keras = []

    names = []
    names_scikit = []
    names_xgboost = []
    names_keras = []

    trainedmodels = []

    df_sig = getdataframe(filesig, trename, var_all)
    df_bkg = getdataframe(filebkg, trename, var_all)
    if presel_reco is not None:
        # NOTE(review): presel_reco comes straight from the submitted form and
        # is evaluated by DataFrame.query — confirm the form is only exposed
        # to trusted users.
        df_sig = df_sig.query(presel_reco)
        df_bkg = df_bkg.query(presel_reco)
    df_sig = filterdataframe_singlevar(df_sig, var_binning, var_binning_min,
                                       var_binning_max)
    df_bkg = filterdataframe_singlevar(df_bkg, var_binning, var_binning_min,
                                       var_binning_max)

    # Output images; each stays None unless the corresponding step runs
    imageIO_vardist: BytesIO = None
    imageIO_scatterplot: BytesIO = None
    imageIO_corr_sig: BytesIO = None
    imageIO_corr_bkg: BytesIO = None
    imageIO_precision_recall: BytesIO = None
    imageIO_ROC: BytesIO = None
    imageIO_plot_learning_curves: BytesIO = None
    img_scoresRME: BytesIO = None
    img_import: BytesIO = None
    img_gridsearch: BytesIO = None

    # pylint: disable=unused-variable
    _, _, df_sig_train, df_bkg_train, _, _, x_train, y_train, x_test, y_test = \
        create_mlsamples(df_sig, df_bkg, sel_signal, data[case], sel_bkg, rnd_shuffle,
                         var_signal, var_training, nevt_sig, nevt_bkg, test_frac, rnd_splt)
    if docorrelation:
        imageIO_vardist, imageIO_scatterplot, imageIO_corr_sig, imageIO_corr_bkg = \
            do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x, var_corr_y, plotdir)

    # Using the activate_* flags is for now a work-around
    if activate_scikit:
        classifiers_scikit, names_scikit = getclf_scikit(model_config)
        classifiers = classifiers + classifiers_scikit
        names = names + names_scikit

    if activate_xgboost:
        classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)
        classifiers = classifiers + classifiers_xgboost
        names = names + names_xgboost

    if activate_keras:
        classifiers_keras, names_keras = getclf_keras(model_config,
                                                      len(x_train.columns))
        classifiers = classifiers + classifiers_keras
        names = names + names_keras

    if dotraining:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if doROC:
        imageIO_precision_recall, imageIO_ROC = \
            precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)

    if docrossvalidation:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
                names, classifiers, x_train, y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train,
                                             y_train, nkfolds, ncores)
        img_scoresRME = plot_cross_validation_mse(names, df_scores, suffix,
                                                  plotdir)

    if doimportance:
        # Only the scikit and xgboost classifiers are passed to the
        # importance plot; the keras ones are left out.
        img_import = importanceplotall(
            var_training, names_scikit + names_xgboost,
            classifiers_scikit + classifiers_xgboost, suffix, plotdir)

    if dolearningcurve:
        npoints = 10
        imageIO_plot_learning_curves = plot_learning_curves(
            names, classifiers, suffix, plotdir, x_train, y_train, npoints)

    if dogridsearch:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        img_gridsearch = perform_plot_gridsearch(names_cv, dfscore,
                                                 par_grid_cv, par_grid_cv_keys,
                                                 var_param, plotdir, suffix,
                                                 0.1)

    return JENV.get_template("display.html").render(
        imageIO_vardist=imageIO_vardist,
        imageIO_scatterplot=imageIO_scatterplot,
        imageIO_corr_sig=imageIO_corr_sig,
        imageIO_corr_bkg=imageIO_corr_bkg,
        imageIO_precision_recall=imageIO_precision_recall,
        imageIO_ROC=imageIO_ROC,
        imageIO_plot_learning_curves=imageIO_plot_learning_curves,
        img_scoresRME=img_scoresRME,
        img_import=img_import,
        img_gridsearch=img_gridsearch)
    # NOTE(review): everything from here on sits after the unconditional
    # `return` of the enclosing function, so it is never executed. It also
    # uses names that the visible function body never assigns (doboundary,
    # var_boundaries, mlplot, dosignifopt, logger) — presumably the tail of
    # a different snippet that was appended here; confirm and remove.
    if doboundary == 1:
        classifiers_scikit_2var, names_2var = getclf_scikit(mltype)
        classifiers_keras_2var, names_keras_2var = getclf_keras(
            data["ml_study"], 2)
        classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
        names_2var = names_2var + names_keras_2var
        # The two-variable models are fitted on the test sample restricted
        # to the var_boundaries columns.
        x_test_boundary = x_test[var_boundaries]
        trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary,
                                 y_test)
        decisionboundaries(names_2var, trainedmodels_2var, suffix + "2var",
                           x_test_boundary, y_test, mlplot)

    if doimportance == 1:
        importanceplotall(var_training, names_scikit + names_xgboost,
                          classifiers_scikit + classifiers_xgboost, suffix,
                          mlplot)

    if dogridsearch == 1:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        perform_plot_gridsearch(names_cv, dfscore, par_grid_cv,
                                par_grid_cv_keys, var_param, mlplot, suffix,
                                0.1)

    if dosignifopt == 1:
        logger.info("Doing significance optimization")
Ejemplo n.º 5
0
def doclassification_regression(conf):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """Run the classification/regression steps enabled in ``conf``.

    The run configuration (step flags, event counts, binning range, random
    seeds) is read from ``conf.get_run_config()`` and the model definitions
    from ``conf.get_model_config()``. File names, variable lists and
    selections for the chosen ``case`` come from
    ``get_database_ml_parameters()``. Each step is executed only when its
    flag equals 1; outputs go to the ``plots_<suffix>`` and
    ``output_<suffix>`` directories.

    Args:
        conf: configuration object providing ``get_run_config()`` and
            ``get_model_config()``.

    Raises:
        ValueError: if ``loadsampleoption`` is not 1. No other way of
            loading the samples is implemented here, and every later step
            needs them.
    """

    logger = get_logger()
    logger.info("Start classification_regression run")

    run_config = conf.get_run_config()
    model_config = conf.get_model_config()

    mltype = run_config['mltype']
    mlsubtype = run_config['mlsubtype']
    case = run_config['case']
    loadsampleoption = run_config['loadsampleoption']
    binmin = run_config['binmin']
    binmax = run_config['binmax']
    rnd_shuffle = run_config['rnd_shuffle']
    nevt_sig = run_config['nevt_sig']
    nevt_bkg = run_config['nevt_bkg']
    test_frac = run_config['test_frac']
    rnd_splt = run_config['rnd_splt']
    docorrelation = run_config['docorrelation']
    dostandard = run_config['dostandard']
    dopca = run_config['dopca']
    dotraining = run_config['dotraining']
    dotesting = run_config['dotesting']
    applytodatamc = run_config['applytodatamc']
    docrossvalidation = run_config['docrossvalidation']
    dolearningcurve = run_config['dolearningcurve']
    doROC = run_config['doROC']
    doboundary = run_config['doboundary']
    doimportance = run_config['doimportance']
    dopltregressionxy = run_config['dopltregressionxy']
    dogridsearch = run_config['dogridsearch']
    dosignifopt = run_config['dosignifopt']
    nkfolds = run_config['nkfolds']
    ncores = run_config['ncores']

    data = get_database_ml_parameters()
    filesig, filebkg = data[case]["sig_bkg_files"]
    filedata, filemc = data[case]["data_mc_files"]
    trename = data[case]["tree_name"]
    var_all = data[case]["var_all"]
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    var_training = data[case]["var_training"]
    var_target = data[case]["var_target"]
    var_corr_x, var_corr_y = data[case]["var_correlation"]
    var_boundaries = data[case]["var_boundaries"]
    var_binning = data[case]['var_binning']
    presel_reco = data[case]["presel_reco"]

    summary_string = f"#sig events: {nevt_sig}\n#bkg events: {nevt_bkg}\nmltype: {mltype}\n" \
                     f"mlsubtype: {mlsubtype}\ncase: {case}"
    logger.debug(summary_string)

    string_selection = createstringselection(var_binning, binmin, binmax)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"
    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    filesig = os.path.join(DATA_PREFIX, filesig)
    filebkg = os.path.join(DATA_PREFIX, filebkg)
    filedata = os.path.join(DATA_PREFIX, filedata)
    filemc = os.path.join(DATA_PREFIX, filemc)

    # Stays empty unless the training step runs.
    trainedmodels = []

    if loadsampleoption != 1:
        # Without the samples, x_train and friends are never bound and the
        # code below would fail with a NameError; fail early and clearly.
        raise ValueError(
            f"Unsupported loadsampleoption {loadsampleoption!r}: "
            "only option 1 is implemented")

    df_sig = getdataframe(filesig, trename, var_all)
    df_bkg = getdataframe(filebkg, trename, var_all)
    if presel_reco is not None:
        df_sig = df_sig.query(presel_reco)
        df_bkg = df_bkg.query(presel_reco)
    df_sig = filterdataframe_singlevar(df_sig, var_binning, binmin, binmax)
    df_bkg = filterdataframe_singlevar(df_bkg, var_binning, binmin, binmax)
    _, df_ml_test, df_sig_train, df_bkg_train, _, _, \
        x_train, y_train, x_test, y_test = \
        create_mlsamples(df_sig, df_bkg, sel_signal, sel_bkg, rnd_shuffle,
                         var_signal, var_training,
                         nevt_sig, nevt_bkg, test_frac, rnd_splt)

    if docorrelation == 1:
        do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x,
                       var_corr_y, plotdir)

    if dostandard == 1:
        x_train = getdataframe_standardised(x_train)

    if dopca == 1:
        n_pca = 9
        x_train, pca = get_pcadataframe_pca(x_train, n_pca)
        plotvariance_pca(pca, plotdir)

    classifiers_scikit, names_scikit = getclf_scikit(model_config)

    classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)

    # The keras models receive the number of columns of the (possibly
    # standardised / PCA-transformed) training frame.
    classifiers_keras, names_keras = getclf_keras(model_config,
                                                  len(x_train.columns))

    classifiers = classifiers_scikit + classifiers_xgboost + classifiers_keras
    names = names_scikit + names_xgboost + names_keras

    if dotraining == 1:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test,
                          var_training, var_signal)
        df_ml_test_to_df = f"{output}/testsample_{suffix}_mldecision.pkl"
        df_ml_test_to_root = f"{output}/testsample_{suffix}_mldecision.root"
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, trename, df_ml_test)

    if applytodatamc == 1:
        df_data = getdataframe(filedata, trename, var_all)
        df_mc = getdataframe(filemc, trename, var_all)
        if presel_reco is not None:
            df_mc = df_mc.query(presel_reco)
            df_data = df_data.query(presel_reco)
        df_data = filterdataframe_singlevar(df_data, var_binning, binmin,
                                            binmax)
        df_mc = filterdataframe_singlevar(df_mc, var_binning, binmin, binmax)
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = f"{output}/data_{suffix}_mldecision.root"
        df_mc_to_root = f"{output}/mc_{suffix}_mldecision.root"
        write_tree(df_data_to_root, trename, df_data)
        write_tree(df_mc_to_root, trename, df_mc)

    if docrossvalidation == 1:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
                names, classifiers, x_train, y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train,
                                             y_train, nkfolds, ncores)
        plot_cross_validation_mse(names, df_scores, suffix, plotdir)

    if dolearningcurve == 1:
        npoints = 10
        plot_learning_curves(names, classifiers, suffix, plotdir, x_train,
                             y_train, npoints)

    if doROC == 1:
        precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds,
                         plotdir)

    if doboundary == 1:
        # getclf_scikit takes the model configuration everywhere else in this
        # function; the previous call here passed mltype instead.
        classifiers_scikit_2var, names_2var = getclf_scikit(model_config)
        classifiers_keras_2var, names_keras_2var = getclf_keras(
            model_config, 2)
        classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
        names_2var = names_2var + names_keras_2var
        # The two-variable models are fitted on the test sample restricted
        # to the var_boundaries columns.
        x_test_boundary = x_test[var_boundaries]
        trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary,
                                 y_test)
        decisionboundaries(names_2var, trainedmodels_2var, suffix + "2var",
                           x_test_boundary, y_test, plotdir)

    if doimportance == 1:
        # Only the scikit and xgboost classifiers are passed to the
        # importance plot; the keras ones are left out.
        importanceplotall(var_training, names_scikit + names_xgboost,
                          classifiers_scikit + classifiers_xgboost, suffix,
                          plotdir)

    if dopltregressionxy == 1:
        plotdistributiontarget(names, df_ml_test, var_target, suffix, plotdir)
        plotscattertarget(names, df_ml_test, var_target, suffix, plotdir)

    if dogridsearch == 1:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        perform_plot_gridsearch(names_cv, dfscore, par_grid_cv,
                                par_grid_cv_keys, var_param, plotdir, suffix,
                                0.1)

    if dosignifopt == 1:
        logger.info("Doing significance optimization")
        # df_data, df_mc and the scored df_ml_test only exist when the
        # training, testing and apply-to-data/MC steps have all run.
        if dotraining and dotesting and applytodatamc:
            if (mlsubtype == "HFmeson") and case in ("Dsnew", "Lcnew", "Dzero",
                                                     "Dplus", "Dstar"):
                df_data_opt = df_data.query(sel_bkg)
                df_data_opt = shuffle(df_data_opt, random_state=rnd_shuffle)
                study_signif(case, names, [binmin, binmax], filemc, filedata,
                             df_mc, df_ml_test, df_data_opt, suffix, plotdir)
            else:
                logger.error(
                    "Optimisation is not implemented for this classification problem."
                )
        else:
            logger.error(
                "Training, testing and applytodata flags must be set to 1")