Example #1
    def load_ML_analysis(self, cent_class, pt_range, ct_range, split=''):

        info_string = f'_{cent_class[0]}{cent_class[1]}_{pt_range[0]}{pt_range[1]}_{ct_range[0]}{ct_range[1]}{split}'

        handlers_path = os.environ[f'HYPERML_MODELS_{self.mode}'] + '/handlers'
        efficiencies_path = os.environ[f'HYPERML_EFFICIENCIES_{self.mode}']

        filename_handler = handlers_path + '/model_handler' + info_string + '.pkl'
        filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + '.npy'

        eff_score_array = np.load(filename_efficiencies)

        model_handler = ModelHandler()
        model_handler.load_model_handler(filename_handler)

        return eff_score_array, model_handler
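
A minimal usage sketch for this loader, mirroring the call in Example #10 below; the object name ml_application and all bin values are illustrative, and the HYPERML_MODELS_*/HYPERML_EFFICIENCIES_* environment variables must point to the directories written during training:

# hypothetical invocation; bin edges and split label are placeholders
eff_score_array, model_handler = ml_application.load_ML_analysis(
    cent_class=[0, 90], pt_range=[2, 10], ct_range=[0, 35], split='_matter')
# as used in Examples #7 and #10, eff_score_array[0] holds the BDT
# efficiencies and eff_score_array[1] the matching score thresholds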
Example #2
                        f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}'
                    )
                    print(
                        f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}'
                    )
                    model_hdl.train_test_model(train_test_data,
                                               return_prediction=True)
                    model_file_name = f'models/{bin_model}_trained'
                    if OPTIMIZE:
                        model_file_name = f'models/{bin_model}_optimized_trained'
                    model_hdl.dump_model_handler(model_file_name)
                elif COMPUTE_SCORES_FROM_EFF and isModelTrained:
                    print('Model trained...')
                    if OPTIMIZE:
                        model_hdl.load_model_handler(
                            f'models/{bin_model}_optimized_trained')
                    else:
                        model_hdl.load_model_handler(
                            f'models/{bin_model}_trained')
                else:
                    continue

                ct_bins_df_index = int(ct_bins[0] / 5 - 1)
                for ct_bins_df in zip(
                        CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][:-1],
                        CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][1:]):
                    bin_df = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins_df[0]}_{ct_bins_df[1]}'

                    # get only centrality selected
                    train_test_data_cent = [
                        pd.DataFrame(), [],
Example #3
def main():  #pylint: disable=too-many-statements, too-many-branches
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    ModelList = inputCfg['ml']['saved_models']
    ModelHandls = []
    for iBin in range(len(PtBins)):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit()
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data file {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((
                    f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                    ' existing files may be overwritten!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)
            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel,
                                              inputCfg['ml']['raw_output'])
            ColumnsToSaveFinal = ColumnsToSave
            if not isinstance(ColumnsToSaveFinal, list):
                print(
                    '\033[91mERROR: column_to_save_list must be defined!\033[0m'
                )
                sys.exit()
            if 'inv_mass' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_cand' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                # copy before dropping 'pt_B' (only in MC) so later pt bins
                # still see the full list from the shared config
                ColumnsToSaveFinal = [col for col in ColumnsToSaveFinal if col != 'pt_B']
            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                DataDfPtSel['ML_output'] = yPred
            else:
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip'
            )
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
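
For reference, a sketch of the configuration this standalone-application script expects, written as the Python dict that yaml.load would return; every key below is read by the code above, but all values are illustrative placeholders:

inputCfg_sketch = {
    'pt_ranges': {'min': [1, 2], 'max': [2, 4]},
    'output': {'out_labels': {'Bkg': 'Bkg', 'Prompt': 'Prompt', 'FD': None}},
    'appl': {'column_to_save_list': ['inv_mass', 'pt_cand']},
    'ml': {'saved_models': ['model_pt1_2.pkl', 'model_pt2_4.pkl'],
           'raw_output': False},
    'standalone_appl': {'inputs': ['data.parquet'],
                        'output_names': ['Data'],
                        'output_dir': '~/appl_output'},
}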
Example #4
def main():  #pylint: disable=too-many-statements
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train",
                        help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply",
                        help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'],
                                inputCfg['input']['treename'])
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'],
                              inputCfg['input']['treename'])

    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(
            inputCfg['data_prep']['filt_bkg_mass'],
            frac=1.,
            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(
            f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(
            os.path.expanduser(inputCfg['output']['dir']),
            f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((
                f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                ' existing files may be overwritten!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(
            iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt, PromptHandler.get_slice(iBin),
            FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print(
                '\033[93mWARNING: Using only a fraction of the MC for the application! Are you sure?\033[0m'
            )

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt,
                                    TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print(
                    '\033[91mERROR: path to model not correctly defined!\033[0m'
                )
                sys.exit()
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl,
                 DataHandler.get_slice(iBin), PromptDfSelForEff, FDDfSelForEff)

        # delete dataframes to release memory
        for data in TrainTestData:
            del data
        del PromptDfSelForEff, FDDfSelForEff
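
Typical invocations of this train/apply script (the file name MLAnalysis.py is an assumption, not from the source):

# python MLAnalysis.py cfgFileNameML.yml          # full chain: train, test and apply
# python MLAnalysis.py cfgFileNameML.yml --train  # training and testing only
# python MLAnalysis.py cfgFileNameML.yml --apply  # apply the models in ml.saved_models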
Example #5
plt.ylabel('ROC AUC')
plt.legend()

plt.savefig('../opt_comp.png', dpi=100, facecolor='white')
plt.close()

##################################################################################

# BEST HYPERPARAMETERS FOR EACH METHOD

names = ['Opt_test_OPTUNA', 'Opt_test_BAYES', 'Opt_test_DEFAULT', 'Opt_test_PbPb']

if False:
    for name in names:
        model_hdl = ModelHandler()
        model_hdl.load_model_handler('../analysis_results/' + name + '/model/model_hdl')

        print(name)
        print(model_hdl.get_model_params())
        print('\n---------------\n')

##################################################################################

# PLOT SUPERIMPOSED ROC
'''
plt.close()
objects = []

for n in names:
    with open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', 'rb') as openfile:
        while True:
Example #6
def main():
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train",
                        help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply",
                        help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading data files: ...', end='\r')
    PromptDf = LoadDfFromRootOrParquet(inputCfg['input']['prompt'])
    FDDf = LoadDfFromRootOrParquet(inputCfg['input']['FD'])
    DataDf = LoadDfFromRootOrParquet(inputCfg['input']['data'])
    print('Loading data files: Done!')

    for iBin, (PtMin, PtMax) in enumerate(
            zip(inputCfg['pt_ranges']['min'], inputCfg['pt_ranges']['max'])):

        print(
            f'\n\033[94mStarting ML analysis --- {PtMin} < pT < {PtMax} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(inputCfg['output']['dir'],
                                   f'pt{PtMin}_{PtMax}')
        if os.path.isdir(OutPutDirPt):
            print(
                'Output directory already exists, existing files may be overwritten!'
            )
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff = data_prep( \
            inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf)

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtMin, PtMax, OutPutDirPt,
                                    TrainTestData)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('ERROR: path to model not correctly defined!')
                sys.exit()
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtMin, PtMax, OutPutDirPt, ModelHandl, DataDfPtSel,
                 PromptDfPtSelForEff, FDDfPtSelForEff)

        # delete dataframes to release memory
        for data in TrainTestData:
            del data
        del DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
Example #7
def get_skimmed_large_data(data_path,
                           cent_classes,
                           pt_bins,
                           ct_bins,
                           training_columns,
                           application_columns,
                           mode,
                           split=''):
    print('\n++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('\nStarting BDT application on large data')

    if mode == 3:
        handlers_path = os.environ['HYPERML_MODELS_3'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_3']
    elif mode == 2:
        handlers_path = os.environ['HYPERML_MODELS_2'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_2']
    else:
        raise ValueError(f'unsupported mode: {mode}')

    executor = ThreadPoolExecutor()
    iterator = uproot.pandas.iterate(data_path,
                                     'DataTable',
                                     executor=executor,
                                     reportfile=True)

    df_applied = pd.DataFrame()

    for current_file, data in iterator:
        rename_df_columns(data)

        print('current file: {}'.format(current_file))
        print('start entry chunk: {}, stop entry chunk: {}'.format(
            data.index[0], data.index[-1]))

        for cclass in cent_classes:
            for ptbin in zip(pt_bins[:-1], pt_bins[1:]):
                for ctbin in zip(ct_bins[:-1], ct_bins[1:]):
                    info_string = '_{}{}_{}{}_{}{}'.format(
                        cclass[0], cclass[1], ptbin[0], ptbin[1], ctbin[0],
                        ctbin[1])

                    filename_handler = handlers_path + '/model_handler' + info_string + split + '.pkl'
                    filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + split + '.npy'

                    model_handler = ModelHandler()
                    model_handler.load_model_handler(filename_handler)

                    eff_score_array = np.load(filename_efficiencies)
                    tsd = eff_score_array[1][-1]

                    data_range = f'{ctbin[0]}<ct<{ctbin[1]} and {ptbin[0]}<pt<{ptbin[1]} and {cclass[0]}<=centrality<{cclass[1]}'

                    df_tmp = data.query(data_range)
                    df_tmp.insert(
                        0, 'score',
                        model_handler.predict(df_tmp[training_columns]))

                    df_tmp = df_tmp.query('score>@tsd')
                    df_tmp = df_tmp.loc[:, application_columns]

                    df_applied = df_applied.append(df_tmp,
                                                   ignore_index=True,
                                                   sort=False)

    print(df_applied.info(memory_usage='deep'))
    return df_applied
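
A hedged call sketch for get_skimmed_large_data; the glob, bin edges and column lists are placeholders, and mode=2 requires the HYPERML_MODELS_2 and HYPERML_EFFICIENCIES_2 environment variables to be set:

df_selected = get_skimmed_large_data(
    data_path='/data/DataTable*.root',         # placeholder input glob
    cent_classes=[[0, 10], [10, 30]],
    pt_bins=[2, 3, 4],
    ct_bins=[0, 2, 4, 6],
    training_columns=TRAINING_COLUMNS,         # features the BDT was trained on
    application_columns=APPLICATION_COLUMNS,   # columns kept in the skimmed output
    mode=2)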
Example #8
    ##########################################################################

    print('\nHypertriton 3-body - pp @ 13 TeV\n')

    if flag_dict['train_model']:
        print('Starting model training & application\n')
        train.train_model(filename_dict, presel_dict, flag_dict, eff_array,
                          train_vars, params, params_range)
        print('Model training & application complete\n')

    #print('BENCHMARKING')
    #utils.benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict, presel_dict, train_vars)

    model_hdl = ModelHandler()
    model_hdl.load_model_handler(filename_dict['analysis_path'] +
                                 '/model/model_hdl')

    print('Model loaded\n')

    eff_array, scores = train.load_eff_scores(filename_dict['analysis_path'] +
                                              'output_data/')

    data = train.load_data_with_scores(filename_dict['analysis_path'] +
                                       'output_data/data_scores.parquet.gzip'
                                       )  #pd dataframe already processed
    print('Data loaded\n')
    #data.query('model_output > -5', inplace = True)         ## PARAM!!!!!
    #print('Query on data applied\n')
    background_ls = train.load_data_with_scores(
        filename_dict['analysis_path'] +
        'output_data/bckg_ls_scores.parquet.gzip')
'''

Example #9
import os
import sys
import argparse

from hipe4ml.model_handler import ModelHandler

parser = argparse.ArgumentParser(description='Arguments to pass')
parser.add_argument('inFilePkl',
                    metavar='text',
                    default='model.pkl',
                    help='input pickle file to be converted')
args = parser.parse_args()

ModelPath = os.path.expanduser(args.inFilePkl)
print(f'Loaded saved model: {ModelPath}')
ModelHandl = ModelHandler()
ModelHandl.load_model_handler(ModelPath)

if '.pickle' in ModelPath:
    outFileName = ModelPath.replace('.pickle', '.model')
elif '.pkl' in ModelPath:
    outFileName = ModelPath.replace('.pkl', '.model')
else:
    print(f'ERROR: invalid input file {ModelPath}, please check it! Exit')
    sys.exit()

ModelHandl.dump_original_model(outFileName, True)
print(f'Saved model: {outFileName}')
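
Typical invocation of this conversion utility (the script name is an assumption):

# python convert_model_handler.py trained_model.pkl
# -> writes trained_model.model next to the input; the True flag passed to
#    dump_original_model presumably requests the XGBoost-native format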
Example #10
    for cclass in CENT_CLASSES:
        for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]):
            for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]):
                # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test
                data = ml_analysis.prepare_dataframe(COLUMNS,
                                                     cent_class=cclass,
                                                     ct_range=ctbin,
                                                     pt_range=ptbin)

                input_model = xgb.XGBClassifier()
                model_handler = ModelHandler(input_model)

                info_string = f'_{cclass[0]}{cclass[1]}_{ptbin[0]}{ptbin[1]}_{ctbin[0]}{ctbin[1]}{split}'
                filename_handler = handlers_path + '/model_handler' + info_string + '.pkl'
                model_handler.load_model_handler(filename_handler)

                y_pred = model_handler.predict(data[2])
                test_set = pd.concat([data[2], data[3]], axis=1, sort=False)
                test_set.insert(0, 'score', y_pred)
                test_set.query('y>0', inplace=True)

                mass_bins = 40 if ctbin[1] < 16 else 36

                eff_score_array, model_handler = ml_application.load_ML_analysis(
                    cclass, ptbin, ctbin, split)

                eff_index = 1
                for eff, tsd in zip(pd.unique(eff_score_array[0][::-1]),
                                    pd.unique(eff_score_array[1][::-1])):
                    #after selection
        pp_string = "_pp"
    else:
        simH = TreeHandler(path_to_data + signal_table_name, "GenTable")
        presel_eff = len(signalH) / len(simH)
    print("Presel Eff: ", presel_eff)

    bdt_eff_arr = np.load(efficiencies_path + "/efficiency_arr.npy")
    score_eff_arr = np.load(efficiencies_path + "/score_efficiency_arr.npy")

    syst_mask = np.logical_and(bdt_eff_arr >= working_point - variation_range,
                               bdt_eff_arr <= working_point + variation_range)
    bdt_eff_syst_arr = bdt_eff_arr[syst_mask]
    score_eff_syst_arr = score_eff_arr[syst_mask]

    model_hdl = ModelHandler()
    model_hdl.load_model_handler(ml_model_path + "/model_hndl.pkl")

    selected_dataH.get_handler_from_large_file(
        path_to_data + data_table_name, "DataTable", model_hdl,
        f"model_output>{score_eff_syst_arr[-1]}")
    selected_lsH.get_handler_from_large_file(
        path_to_data + bkg_table_name, "DataTable", model_hdl,
        f"model_output>{score_eff_syst_arr[-1]}")
    # if pp_mode:
    #         selected_emH = TreeHandler()
    #         selected_emH.get_handler_from_large_file(path_to_data + "DataTable_pp_mixDeu.root", "DataTable", model_hdl, f"model_output>{score_eff_syst_arr[-1]}")

    print("Selected data len: ", len(selected_dataH))
    print("Selected ls len: ", len(selected_lsH))

    if significance_scan: