def load_ML_analysis(self, cent_class, pt_range, ct_range, split=''):
    """Load the efficiency/score array and the model handler of one analysis bin.

    The bin is identified by ``cent_class``, ``pt_range`` and ``ct_range``
    (each read at index 0 and 1) plus the optional ``split`` suffix; these are
    concatenated into the tag shared by both file names.  The base
    directories come from the ``HYPERML_MODELS_<mode>`` and
    ``HYPERML_EFFICIENCIES_<mode>`` environment variables, where ``<mode>``
    is ``self.mode``.

    Returns:
        A ``(eff_score_array, model_handler)`` tuple: the array read from the
        ``Eff_Score<tag>.npy`` file and a ``ModelHandler`` on which
        ``load_model_handler`` was called with the ``model_handler<tag>.pkl``
        path.

    Raises:
        KeyError: if one of the two environment variables is not set.
    """
    bin_tag = (f'_{cent_class[0]}{cent_class[1]}'
               f'_{pt_range[0]}{pt_range[1]}'
               f'_{ct_range[0]}{ct_range[1]}{split}')

    # Resolve both base directories before touching any file.
    models_dir = os.environ[f'HYPERML_MODELS_{self.mode}'] + '/handlers'
    eff_dir = os.environ[f'HYPERML_EFFICIENCIES_{self.mode}']

    handler_file = f'{models_dir}/model_handler{bin_tag}.pkl'
    eff_file = f'{eff_dir}/Eff_Score{bin_tag}.npy'

    scores = np.load(eff_file)

    handler = ModelHandler()
    handler.load_model_handler(handler_file)

    return scores, handler
f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}' ) print( f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}' ) model_hdl.train_test_model(train_test_data, return_prediction=True) model_file_name = str(f'models/{bin_model}_trained') if OPTIMIZE: model_file_name = str( f'models/{bin_model}_optimized_trained') model_hdl.dump_model_handler(model_file_name) elif COMPUTE_SCORES_FROM_EFF and isModelTrained: print('Model trained...') if OPTIMIZED: model_hdl.load_model_handler( f'models/{bin_model}_trained') else: model_hdl.load_model_handler( f'models/{bin_model}_trained') else: continue ct_bins_df_index = int(ct_bins[0] / 5 - 1) for ct_bins_df in zip( CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][:-1], CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][1:]): bin_df = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins_df[0]}_{ct_bins_df[1]}' # get only centrality selected train_test_data_cent = [ pd.DataFrame(), [],
def main():  #pylint: disable=too-many-statements, too-many-branches
    """Apply saved ML models to the standalone input files listed in the config.

    Reads the YAML configuration named on the command line, loads one saved
    model handler per pT bin (``ml.saved_models``), then, for every
    ``standalone_appl.inputs`` / ``standalone_appl.output_names`` pair, slices
    the input by ``pt_cand`` and writes one parquet file per pT bin containing
    the configured columns plus the model output column(s).

    Exits with a non-zero status if a model path is not a string or if
    ``appl.column_to_save_list`` is not a list.
    """
    # read config file
    # NOTE(review): 'cfgFileName' is a required positional (no nargs='?'),
    # so the default below looks unused -- confirm before relying on it.
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    ModelList = inputCfg['ml']['saved_models']

    # one model handler per pT bin, in bin order
    ModelHandls = []
    for iBin, _ in enumerate(PtBins):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit(1)  # non-zero: this is a failure, not a clean exit
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data files {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((
                    f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                    ' overwrites possibly ongoing!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)

            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel,
                                              inputCfg['ml']['raw_output'])

            if not isinstance(ColumnsToSave, list):
                print(
                    '\033[91mERROR: column_to_save_list must be defined!\033[0m'
                )
                sys.exit(1)
            # Work on a copy: removing 'pt_B' below must not alter the
            # configured list, otherwise every later bin/input file would
            # lose the column even when it is available.
            ColumnsToSaveFinal = list(ColumnsToSave)
            if 'inv_mass' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_cand' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                ColumnsToSaveFinal.remove('pt_B')  # only in MC

            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                # fewer than three classes: a single output column
                DataDfPtSel['ML_output'] = yPred
            else:
                # otherwise one output column per configured label
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip'
            )
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
def main():  #pylint: disable=too-many-statements
    """Run the per-pT-bin ML workflow: data preparation, training and application.

    Reads the YAML configuration named on the command line, loads the prompt,
    (optional) feed-down and data trees, slices them by ``pt_cand`` and, for
    each pT bin, calls ``data_prep``, then ``train_test`` (skipped with
    ``--apply``, in which case the model is loaded from ``ml.saved_models``)
    and ``appl`` (skipped with ``--train``).

    Exits with a non-zero status if ``--apply`` is given and the configured
    model path for a bin is not a string.
    """
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train", help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply", help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'],
                                inputCfg['input']['treename'])
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'],
                              inputCfg['input']['treename'])

    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(
            inputCfg['data_prep']['filt_bkg_mass'],
            frac=1.,
            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        # no background filter configured: background and data share a handler
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    if BkgHandler is not DataHandler:
        # Slice the background only when it is a distinct handler; the shared
        # one was already sliced just above.
        # NOTE(review): a second call on the same handler is at best
        # redundant -- presumably unsafe with the third argument set to True;
        # confirm against the TreeHandler documentation.
        BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(
            f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(
            os.path.expanduser(inputCfg['output']['dir']),
            f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((
                f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                ' overwrites possibly ongoing!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(
            iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt,
            PromptHandler.get_slice(iBin), FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print(
                '\033[93mWARNING: Using only a fraction of the MC for the application! Are you sure?\033[0m'
            )

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt,
                                    TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print(
                    '\033[91mERROR: path to model not correctly defined!\033[0m'
                )
                sys.exit(1)  # non-zero: this is a failure, not a clean exit
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl,
                 DataHandler.get_slice(iBin), PromptDfSelForEff, FDDfSelForEff)

        # Drop this bin's references so the dataframes can be collected
        # before the next bin is prepared.  (Deleting a loop variable, as in
        # `for data in TrainTestData: del data`, releases nothing.)
        del TrainTestData, PromptDfSelForEff, FDDfSelForEff
plt.ylabel('ROC AUC') plt.legend() plt.savefig('../opt_comp.png', dpi = 100, facecolor = 'white') plt.close() ################################################################################## # BEST HYPERPARAMETERS FOR EACH METHOD names = ['Opt_test_OPTUNA', 'Opt_test_BAYES', 'Opt_test_DEFAULT', 'Opt_test_PbPb'] if False: for name in names: model_hdl = ModelHandler() model_hdl.load_model_handler('../analysis_results/' + name + '/model/model_hdl') print(name) print(model_hdl.get_model_params()) print('\n---------------\n') ################################################################################## # PLOT SUPERIMPOSED ROC ''' plt.close() objects = [] for n in names: with (open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', "rb")) as openfile: while True:
def main():
    """Run the per-pT-bin ML workflow on dataframes loaded from ROOT or parquet.

    Reads the YAML configuration named on the command line, loads the prompt,
    feed-down and data dataframes and, for each (min, max) pair of
    ``pt_ranges``, calls ``data_prep``, then ``train_test`` (skipped with
    ``--apply``, in which case the model is loaded from ``ml.saved_models``)
    and ``appl`` (skipped with ``--train``).

    Exits with a non-zero status if ``--apply`` is given and the configured
    model path for a bin is not a string.
    """
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train", help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply", help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading data files: ...', end='\r')
    PromptDf = LoadDfFromRootOrParquet(inputCfg['input']['prompt'])
    FDDf = LoadDfFromRootOrParquet(inputCfg['input']['FD'])
    DataDf = LoadDfFromRootOrParquet(inputCfg['input']['data'])
    print('Loading data files: Done!')

    for iBin, (PtMin, PtMax) in enumerate(
            zip(inputCfg['pt_ranges']['min'], inputCfg['pt_ranges']['max'])):

        print(
            f'\n\033[94mStarting ML analysis --- {PtMin} < pT < {PtMax} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(inputCfg['output']['dir'],
                                   f'pt{PtMin}_{PtMax}')
        if os.path.isdir(OutPutDirPt):
            print(
                'Output directory already exists, overwrites possibly ongoing!'
            )
        else:
            # makedirs also creates a missing parent output directory, where
            # os.mkdir would raise FileNotFoundError
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff = data_prep(
            inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf)

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtMin, PtMax, OutPutDirPt,
                                    TrainTestData)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('ERROR: path to model not correctly defined!')
                sys.exit(1)  # non-zero: this is a failure, not a clean exit
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtMin, PtMax, OutPutDirPt, ModelHandl, DataDfPtSel,
                 PromptDfPtSelForEff, FDDfPtSelForEff)

        # Drop this bin's references so the dataframes can be collected
        # before the next bin is prepared.  (Deleting a loop variable, as in
        # `for data in TrainTestData: del data`, releases nothing.)
        del TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
def get_skimmed_large_data(data_path, cent_classes, pt_bins, ct_bins,
                           training_columns, application_columns, mode,
                           split=''):
    """Apply the per-bin models to a large data sample read in chunks and keep
    only the candidates passing each bin's score threshold.

    For every chunk yielded by ``uproot.pandas.iterate`` over the
    ``DataTable`` tree, and for every (centrality class, pt bin, ct bin)
    combination, the chunk is restricted to the bin, scored with that bin's
    model handler, cut at ``score > tsd`` and reduced to
    ``application_columns``.  ``tsd`` is the last entry of row 1 of the bin's
    ``Eff_Score`` array.

    Args:
        data_path: path(s) passed to ``uproot.pandas.iterate``.
        cent_classes: iterable of centrality classes, each read at [0] and [1].
        pt_bins: pt bin edges; consecutive pairs define the bins.
        ct_bins: ct bin edges; consecutive pairs define the bins.
        training_columns: columns handed to the model for prediction.
        application_columns: columns kept in the returned dataframe.
        mode: 2 or 3; selects the ``HYPERML_MODELS_<mode>`` and
            ``HYPERML_EFFICIENCIES_<mode>`` environment variables.
        split: optional suffix appended to the handler/efficiency file names.

    Returns:
        A single dataframe with all selected candidates (empty if no chunk
        was read).

    Raises:
        ValueError: if ``mode`` is neither 2 nor 3.
        KeyError: if a required environment variable is not set.
    """
    print('\n++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('\nStarting BDT appplication on large data')

    if mode == 3:
        handlers_path = os.environ['HYPERML_MODELS_3'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_3']
    elif mode == 2:
        handlers_path = os.environ['HYPERML_MODELS_2'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_2']
    else:
        # fail early and clearly instead of hitting an unbound path later
        raise ValueError(f'unsupported mode {mode!r}: expected 2 or 3')

    # (model handler, score threshold) per bin, loaded on first use so that
    # the files are not re-read for every chunk
    loaded_bins = {}
    # per-bin selections, concatenated once at the end
    selected = []

    # The iterator is lazy and uses the executor while looping, so the whole
    # loop lives inside the `with` that finally shuts the pool down.
    with ThreadPoolExecutor() as executor:
        iterator = uproot.pandas.iterate(data_path, 'DataTable',
                                         executor=executor, reportfile=True)

        for current_file, data in iterator:
            rename_df_columns(data)

            print('current file: {}'.format(current_file))
            print('start entry chunk: {}, stop entry chunk: {}'.format(
                data.index[0], data.index[-1]))

            for cclass in cent_classes:
                for ptbin in zip(pt_bins[:-1], pt_bins[1:]):
                    for ctbin in zip(ct_bins[:-1], ct_bins[1:]):
                        info_string = '_{}{}_{}{}_{}{}'.format(
                            cclass[0], cclass[1], ptbin[0], ptbin[1],
                            ctbin[0], ctbin[1])

                        if info_string not in loaded_bins:
                            filename_handler = handlers_path + '/model_handler' + info_string + split + '.pkl'
                            filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + split + '.npy'

                            model_handler = ModelHandler()
                            model_handler.load_model_handler(filename_handler)

                            eff_score_array = np.load(filename_efficiencies)
                            loaded_bins[info_string] = (model_handler,
                                                        eff_score_array[1][-1])

                        # `tsd` must stay a local of this function: the
                        # 'score>@tsd' query below resolves it by name.
                        model_handler, tsd = loaded_bins[info_string]

                        data_range = f'{ctbin[0]}<ct<{ctbin[1]} and {ptbin[0]}<pt<{ptbin[1]} and {cclass[0]}<=centrality<{cclass[1]}'

                        df_tmp = data.query(data_range)
                        df_tmp.insert(
                            0, 'score',
                            model_handler.predict(df_tmp[training_columns]))

                        df_tmp = df_tmp.query('score>@tsd')
                        df_tmp = df_tmp.loc[:, application_columns]

                        selected.append(df_tmp)

    if selected:
        df_applied = pd.concat(selected, ignore_index=True, sort=False)
    else:
        df_applied = pd.DataFrame()

    # info() prints its report itself and returns None, so it is not wrapped
    # in print()
    df_applied.info(memory_usage='deep')
    return df_applied
########################################################################## print('\nHypertriton 3-body - pp @ 13 TeV\n') if flag_dict['train_model']: print('Starting model training & application\n') train.train_model(filename_dict, presel_dict, flag_dict, eff_array, train_vars, params, params_range) print('Model training & application complete\n') #print('BENCHMARKING') #utils.benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict, presel_dict, train_vars) model_hdl = ModelHandler() model_hdl.load_model_handler(filename_dict['analysis_path'] + '/model/model_hdl') print('Model loaded\n') eff_array, scores = train.load_eff_scores(filename_dict['analysis_path'] + 'output_data/') data = train.load_data_with_scores(filename_dict['analysis_path'] + 'output_data/data_scores.parquet.gzip' ) #pd dataframe already processed print('Data loaded\n') #data.query('model_output > -5', inplace = True) ## PARAM!!!!! #print('Query on data applied\n') background_ls = train.load_data_with_scores( filename_dict['analysis_path'] + 'output_data/bckg_ls_scores.parquet.gzip')
''' import os import sys import argparse from hipe4ml.model_handler import ModelHandler parser = argparse.ArgumentParser(description='Arguments to pass') parser.add_argument('inFilePkl', metavar='text', default='model.pkl', help='input pickle file to be converted') args = parser.parse_args() ModelPath = os.path.expanduser(args.inFilePkl) print(f'Loaded saved model: {ModelPath}') ModelHandl = ModelHandler() ModelHandl.load_model_handler(ModelPath) if '.pickle' in ModelPath: outFileName = ModelPath.replace('.pickle', '.model') elif '.pkl' in ModelPath: outFileName = ModelPath.replace('.pkl', '.model') else: print(f'ERROR: invalid input file {ModelHandl}, please check it! Exit') sys.exit() ModelHandl.dump_original_model(outFileName, True) print(f'Saved model: {outFileName}')
for cclass in CENT_CLASSES: for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]): for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]): # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test data = ml_analysis.prepare_dataframe(COLUMNS, cent_class=cclass, ct_range=ctbin, pt_range=ptbin) input_model = xgb.XGBClassifier() model_handler = ModelHandler(input_model) info_string = f'_{cclass[0]}{cclass[1]}_{ptbin[0]}{ptbin[1]}_{ctbin[0]}{ctbin[1]}{split}' filename_handler = handlers_path + '/model_handler' + info_string + '.pkl' model_handler.load_model_handler(filename_handler) y_pred = model_handler.predict(data[2]) test_set = pd.concat([data[2], data[3]], axis=1, sort=False) test_set.insert(0, 'score', y_pred) test_set.query('y>0', inplace=True) mass_bins = 40 if ctbin[1] < 16 else 36 eff_score_array, model_handler = ml_application.load_ML_analysis( cclass, ptbin, ctbin, split) eff_index = 1 for eff, tsd in zip(pd.unique(eff_score_array[0][::-1]), pd.unique(eff_score_array[1][::-1])): #after selection
pp_string = "_pp" else: simH = TreeHandler(path_to_data + signal_table_name, "GenTable") presel_eff = len(signalH) / len(simH) print("Presel Eff: ", presel_eff) bdt_eff_arr = np.load(efficiencies_path + "/efficiency_arr.npy") score_eff_arr = np.load(efficiencies_path + "/score_efficiency_arr.npy") syst_mask = np.logical_and(bdt_eff_arr >= working_point - variation_range, bdt_eff_arr <= working_point + variation_range) bdt_eff_syst_arr = bdt_eff_arr[syst_mask] score_eff_syst_arr = score_eff_arr[syst_mask] model_hdl = ModelHandler() model_hdl.load_model_handler(ml_model_path + "/model_hndl.pkl") selected_dataH.get_handler_from_large_file( path_to_data + data_table_name, "DataTable", model_hdl, f"model_output>{score_eff_syst_arr[-1]}") selected_lsH.get_handler_from_large_file( path_to_data + bkg_table_name, "DataTable", model_hdl, f"model_output>{score_eff_syst_arr[-1]}") # if pp_mode: # selected_emH = TreeHandler() # selected_emH.get_handler_from_large_file(path_to_data + "DataTable_pp_mixDeu.root", "DataTable", model_hdl, f"model_output>{score_eff_syst_arr[-1]}") print("Selected data len: ", len(selected_dataH)) print("Selected ls len: ", len(selected_lsH)) if significance_scan: