def main():
    """Plot the configured variable distributions for every pT range.

    The YAML configuration named on the command line provides the parquet
    input files, the pT ranges, the columns to draw, the legend labels and
    the output directory. One PDF per pT range is written to the output
    directory.
    """
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    # nargs='?' makes the positional optional: argparse ignores ``default``
    # on a required positional, so without it the fallback name was never used
    parser.add_argument('cfgFileName', metavar='text', nargs='?',
                        default='cfgFileNameCheck.yml',
                        help='config file name for check')
    args = parser.parse_args()

    print('Loading check configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        # NOTE(review): FullLoader is kept for compatibility with existing
        # configs; prefer yaml.safe_load if the file may come from an
        # untrusted source.
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading check configuration: Done!')

    print('Loading data files: ...', end='\r')
    DfList = [pd.read_parquet(filePath)
              for filePath in inputCfg['input']['files']]
    print('Loading data files: Done!')

    # these settings do not depend on the pT range
    VarsToDraw = inputCfg['plotting_columns']
    LegLabels = inputCfg['output']['leg_labels']
    OutPutDir = inputCfg['output']['dir']

    for PtMin, PtMax in zip(inputCfg['pt_ranges']['min'],
                            inputCfg['pt_ranges']['max']):
        print(f'Plot variable distributions --- {PtMin} < pT < {PtMax} GeV/c')
        DfListPt = [df.query(f'{PtMin} < pt_cand < {PtMax}') for df in DfList]
        plot_utils.plot_distr(DfListPt, VarsToDraw, (12, 7), 100, True, LegLabels)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                            hspace=0.55, wspace=0.55)
        plt.savefig(f'{OutPutDir}/DistrComp_pT_{PtMin}_{PtMax}.pdf')
        plt.close('all')
        # release the per-range selections before the next iteration
        del DfListPt
    del DfList
def plot_distr_comparison(hdl1, hdl2, name, filename_dict, label_1='df1', label_2='df2', col_names=None, nbins=100):
    """Plot the variable distributions of two handlers on top of each other.

    One combined figure with all shared columns is saved first, followed by
    one figure per shared column. All images are written below
    ``filename_dict['analysis_path'] + 'images/var_distribution/'``.

    Args:
        hdl1: first handler; must provide ``get_data_frame()``
        hdl2: second handler; must provide ``get_data_frame()``
        name (string): prefix of the output file names. The combined figure
            uses ``name`` without its last character.
        filename_dict (dictionary): must contain the key 'analysis_path'
        label_1 (string, optional): legend label of the first handler
        label_2 (string, optional): legend label of the second handler
        col_names (list, optional): columns to plot. None for all columns of
            the first data frame. Columns missing from the second data frame
            are skipped.
        nbins (int, optional): number of bins, used for both the combined and
            the per-column histograms
    """
    print('Plotting comparison of variable distributions')
    df1 = hdl1.get_data_frame()
    df2 = hdl2.get_data_frame()

    # identity test: '== None' is evaluated element-wise on array-like input
    if col_names is None:
        col_names = list(df1.columns)

    # only columns present in the second data frame can be compared
    df2_columns = set(df2.columns)
    column = [col for col in col_names if col in df2_columns]

    out_prefix = filename_dict['analysis_path'] + 'images/var_distribution/'

    plt.close()
    # honour nbins here as well (it used to be hard-coded to 100)
    plot_utils.plot_distr([hdl1, hdl2], alpha=0.5, bins=nbins,
                          labels=[label_1, label_2], figsize=(20, 20),
                          density=True, column=column)
    plt.savefig(out_prefix + name[:-1] + '.png')
    plt.close()

    for col in column:
        plt.figure()
        df1[col].hist(alpha=0.5, bins=nbins, label=label_1, density=True)
        df2[col].hist(alpha=0.5, bins=nbins, label=label_2, density=True)
        plt.legend()
        plt.savefig(out_prefix + name + str(col) + '.png', facecolor='white')
        plt.close()

    print('Done\n')
def plot_distributions(tree_hdl, filename_dict, name, vars=None):
    """Draw the variable distributions of a tree handler and save them as PNG.

    Args:
        tree_hdl (hipe4ml.tree_handler): the tree with the data
        filename_dict (dictionary): dictionary of the filenames; the image is
            written below its 'analysis_path' entry
        name (string): name of the plot (file name without extension)
        vars (list, optional): the variables to plot. None for all variables.
            Defaults to None.
    """
    # start from a clean current figure
    plt.close()
    plot_utils.plot_distr(tree_hdl, column=vars, figsize=(20, 20))

    image_dir = filename_dict['analysis_path'] + 'images/var_distribution/'
    image_path = image_dir + name + '.png'
    plt.savefig(image_path, dpi=500, facecolor='white')
    plt.close()
def _split_train_test(TotDf, LabelsArray, test_f, seed_split):
    '''
    helper of data_prep: split candidates and labels in training and test samples

    For test_f < 1 the split is delegated to train_test_split. Otherwise no
    training sample is built and the test sample is a copy of the full input.

    Returns the tuple (TrainSet, yTrain, TestSet, yTest)
    '''
    if test_f < 1:
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDf, LabelsArray, test_size=test_f, random_state=seed_split)
    else:
        TrainSet = pd.DataFrame()
        TestSet = TotDf.copy()
        yTrain = pd.Series()
        yTest = LabelsArray.copy()
    return TrainSet, yTrain, TestSet, yTest


def data_prep(inputCfg, iBin, PtBin, OutPutDirPt, PromptDf, FDDf, BkgDf): #pylint: disable=too-many-statements, too-many-branches
    '''
    function for data preparation

    Builds the labelled sample (0 = bkg, 1 = prompt, 2 = FD), splits it in
    training and test sets and saves the distribution and correlation-matrix
    plots of the input data frames in OutPutDirPt.

    Parameters
    - inputCfg: configuration dictionary; the entries 'data_prep', 'plots' and 'output' are read
    - iBin: index of the pT bin, used to pick the entry of data_prep/bkg_mult
    - PtBin: pT limits of the bin; PtBin[0] and PtBin[1] enter messages and file names
    - OutPutDirPt: directory where the pdf files are saved
    - PromptDf, FDDf, BkgDf: pandas data frames of the three classes (FDDf can be empty)

    Returns
    - TrainTestData: list [TrainSet, yTrain, TestSet, yTest]
    - PromptDfSelForEff: prompt candidates to be used for the efficiency
    - FDDfSelForEff: FD candidates to be used for the efficiency (empty if FDDf is empty)

    The process is terminated with exit status 1 if data_prep/dataset_opt is
    neither 'equal' nor 'max_signal'.
    '''
    nPrompt = len(PromptDf)
    nFD = len(FDDf)
    nBkg = len(BkgDf)
    if FDDf.empty:
        out = f'\n Signal: {nPrompt}\n Bkg: {nBkg}'
    else:
        out = f'\n Prompt: {nPrompt}\n FD: {nFD}\n Bkg: {nBkg}'
    print(f'Number of available candidates in {PtBin[0]} < pT < {PtBin[1]} GeV/c:{out}')

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':
        if FDDf.empty:
            nCandToKeep = min([nPrompt, nBkg])
            out = 'signal'
            out2 = 'signal'
        else:
            nCandToKeep = min([nPrompt, nFD, nBkg])
            out = 'prompt, FD'
            out2 = 'prompt'

        print((f'Keep same number of {out} and background (minimum) for training and '
               f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}')

        if nPrompt > nCandToKeep:
            # trailing space added: the two literals used to be glued as "(N)will"
            print((f'Remaining {out2} candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                   'efficiency together with test set'))

        TotDf = pd.concat([BkgDf.iloc[:nCandToKeep],
                           PromptDf.iloc[:nCandToKeep],
                           FDDf.iloc[:nCandToKeep]], sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep)
        else:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep + [2] * nCandToKeep)

        TrainSet, yTrain, TestSet, yTest = _split_train_test(
            TotDf, LabelsArray, test_f, seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]

        # candidates not used for the ML are added to the test set for the efficiency
        PromptDfSelForEff = pd.concat([PromptDf.iloc[nCandToKeep:],
                                       TestSet[pd.Series(yTest).array == 1]], sort=False)
        if FDDf.empty:
            FDDfSelForEff = pd.DataFrame()
        else:
            FDDfSelForEff = pd.concat([FDDf.iloc[nCandToKeep:],
                                       TestSet[pd.Series(yTest).array == 2]], sort=False)
        del TotDf

    elif dataset_opt == 'max_signal':
        nCandBkg = round(inputCfg['data_prep']['bkg_mult'][iBin] * (nPrompt + nFD))
        out = 'signal' if FDDf.empty else 'prompt and FD'
        print((f'Keep all {out} and use {nCandBkg} bkg candidates for training and '
               f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('\033[93mWARNING: using all bkg available, not good!\033[0m')
        print(f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}')

        TotDf = pd.concat([BkgDf.iloc[:nCandBkg], PromptDf, FDDf], sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt)
        else:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt + [2] * nFD)

        TrainSet, yTrain, TestSet, yTest = _split_train_test(
            TotDf, LabelsArray, test_f, seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]

        # all signal entered the split, so only the test set is left for the efficiency
        PromptDfSelForEff = TestSet[pd.Series(yTest).array == 1]
        FDDfSelForEff = pd.DataFrame() if FDDf.empty else TestSet[pd.Series(yTest).array == 2]
        del TotDf

    else:
        print(f'\033[91mERROR: {dataset_opt} is not a valid option!\033[0m')
        # non-zero status: a bare sys.exit() would report success to the caller
        sys.exit(1)

    # plots
    VarsToDraw = inputCfg['plots']['plotting_columns']
    LegLabels = [inputCfg['output']['leg_labels']['Bkg'],
                 inputCfg['output']['leg_labels']['Prompt']]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [inputCfg['output']['out_labels']['Bkg'],
                    inputCfg['output']['out_labels']['Prompt']]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ListDf = [BkgDf, PromptDf] if FDDf.empty else [BkgDf, PromptDf, FDDf]
    #_____________________________________________
    plot_utils.plot_distr(ListDf, VarsToDraw, 100, LegLabels, figsize=(12, 7),
                          alpha=0.3, log=True, grid=False, density=True)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(ListDf, VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        # make the figure current so that subplots_adjust acts on it
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf')

    return TrainTestData, PromptDfSelForEff, FDDfSelForEff
# Fragment of a larger script: the prompt / non-prompt handlers, the df_*_ct
# data frames, PLOT_DIR and TRAINING_COLUMNS_LIST are defined outside this excerpt.
background_tree_handler = TreeHandler()

# hand the three data frames over to their handlers
prompt_tree_handler.set_data_frame(df_prompt_ct)
non_prompt_tree_handler.set_data_frame(df_non_prompt_ct)
background_tree_handler.set_data_frame(df_background_ct)
# the local names are not used any more below
del df_prompt_ct, df_non_prompt_ct, df_background_ct

# os.mkdir creates a single level only: PLOT_DIR itself must already exist
if not os.path.isdir(f'{PLOT_DIR}/features'):
    os.mkdir(f'{PLOT_DIR}/features')

# label order matches the order of the handlers passed to plot_distr
leg_labels = ['background', 'non-prompt', 'prompt']
plot_distr = plot_utils.plot_distr([
    background_tree_handler, non_prompt_tree_handler, prompt_tree_handler
], TRAINING_COLUMNS_LIST, bins=40, labels=leg_labels, log=True, density=True, figsize=(12, 12), alpha=0.5, grid=False)
plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.50, wspace=0.50)
# NOTE(review): tight_layout() is called right after subplots_adjust() and
# presumably recomputes the margins set above - confirm which one is wanted
plt.tight_layout()
plt.savefig(f'{PLOT_DIR}/features/FeaturePlots.pdf')

# correlation matrix of the background sample only
bkg_corr = plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['Background'])
def test_plot_distr():
    """
    Check that the feature distribution plot returns a numpy array
    """
    distr_plot = plot_utils.plot_distr([SIG_DF, BKG_DF], SIG_DF.columns)
    assert isinstance(distr_plot, np.ndarray)
def data_prep(inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf): #pylint: disable=too-many-statements
    '''
    function for data preparation

    Selects the candidates with PtMin < pt_cand < PtMax, builds the labelled
    sample (0 = bkg, 1 = prompt, 2 = FD), splits it in training and test sets
    and saves the distribution and correlation-matrix plots in OutPutDirPt.
    The background is obtained from the data by applying the query stored in
    data_prep/filt_bkg_mass.

    Parameters
    - inputCfg: configuration dictionary; the entries 'data_prep', 'ml' and 'output' are read
    - iBin: index of the pT bin, used to pick the entry of ml/bkg_mult
    - PtMin, PtMax: limits of the pT bin
    - OutPutDirPt: directory where the pdf files are saved
    - DataDf, PromptDf, FDDf: pandas data frames with a pt_cand column

    Returns
    - TrainTestData: list [TrainSet, yTrain, TestSet, yTest]
    - DataDfPtSel: data candidates in the pT bin
    - PromptDfPtSelForEff: prompt candidates to be used for the efficiency
    - FDDfPtSelForEff: FD candidates to be used for the efficiency

    The process is terminated with exit status 1 if data_prep/dataset_opt is
    neither 'equal' nor 'max_signal'.
    '''
    ptSel = f'{PtMin} < pt_cand < {PtMax}'
    DataDfPtSel = DataDf.query(ptSel)
    BkgDfPtSel = DataDfPtSel.query(inputCfg['data_prep']['filt_bkg_mass'])
    PromptDfPtSel = PromptDf.query(ptSel)
    FDDfPtSel = FDDf.query(ptSel)

    nPrompt = len(PromptDfPtSel)
    nFD = len(FDDfPtSel)
    nBkg = len(BkgDfPtSel)
    print((f'Number of available candidates in {PtMin} < pT < {PtMax} GeV/c:\n Prompt: {nPrompt}'
           f'\n FD: {nFD}\n Bkg: {nBkg}'))

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':
        nCandToKeep = min([nPrompt, nFD, nBkg])
        print(('Keep same number of prompt, FD, and background (minimum) for training and '
               f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}')

        if nPrompt > nCandToKeep:
            # trailing space added: the two literals used to be glued as "(N)will"
            print((f'Remaining prompt candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                   'efficiency together with test set'))

        TotDfPtSel = pd.concat([BkgDfPtSel.iloc[:nCandToKeep],
                                PromptDfPtSel.iloc[:nCandToKeep],
                                FDDfPtSel.iloc[:nCandToKeep]], sort=True)
        LabelsArray = [0] * nCandToKeep + [1] * nCandToKeep + [2] * nCandToKeep
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]

        # candidates not used for the ML are added to the test set for the efficiency
        CandTypeFlags = pd.Series(yTest)
        PromptDfPtSelForEff = pd.concat([PromptDfPtSel.iloc[nCandToKeep:],
                                         TestSet[CandTypeFlags.values == 1]], sort=False)
        FDDfPtSelForEff = pd.concat([FDDfPtSel.iloc[nCandToKeep:],
                                     TestSet[CandTypeFlags.values == 2]], sort=False)
        del TotDfPtSel

    elif dataset_opt == 'max_signal':
        nCandBkg = round(inputCfg['ml']['bkg_mult'][iBin] * (nPrompt + nFD))
        print((f'Keep all prompt and FD and use {nCandBkg} bkg candidates for training and '
               f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('WARNING: using all bkg available, not good!')
        print(f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}')

        TotDfPtSel = pd.concat([BkgDfPtSel.iloc[:nCandBkg], PromptDfPtSel, FDDfPtSel],
                               sort=True)
        LabelsArray = [0] * nCandBkg + [1] * nPrompt + [2] * nFD
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]

        # all signal entered the split, so only the test set is left for the efficiency
        CandTypeFlags = pd.Series(yTest)
        PromptDfPtSelForEff = TestSet[CandTypeFlags.values == 1]
        FDDfPtSelForEff = TestSet[CandTypeFlags.values == 2]
        del TotDfPtSel

    else:
        print(f'ERROR: {dataset_opt} is not a valid option!')
        # non-zero status: a bare sys.exit() would report success to the caller
        sys.exit(1)

    # plots
    VarsToDraw = inputCfg['ml']['plotting_columns']
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    ListDfPtSel = [BkgDfPtSel, PromptDfPtSel, FDDfPtSel]
    #_____________________________________________
    plot_utils.plot_distr(ListDfPtSel, VarsToDraw, (12, 7), 100, True, LegLabels, 0.3)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtMin}_{PtMax}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(ListDfPtSel, VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        # make the figure current so that subplots_adjust acts on it
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtMin}_{PtMax}.pdf')

    return TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
def main():
    """Plot the pre-filtered variable distributions for every pT range.

    The YAML configuration named on the command line provides the input
    files (loaded with LoadDfFromRootOrParquet), one pre-filtering query per
    input, the pT ranges, the columns to draw, the axis limits and labels,
    the legend labels, the colors and the output directory. One PDF per pT
    range is written to the output directory.
    """
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    # nargs='?' makes the positional optional: argparse ignores ``default``
    # on a required positional, so without it the fallback name was never used
    parser.add_argument('cfgFileName', metavar='text', nargs='?',
                        default='cfgFileNameCheck.yml',
                        help='config file name for check')
    args = parser.parse_args()

    print('Loading check configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        # NOTE(review): FullLoader is kept for compatibility with existing
        # configs; prefer yaml.safe_load if the file may come from an
        # untrusted source.
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading check configuration: Done!')

    print('Loading data files: ...', end='\r')
    inDirName = inputCfg['input']['dirname']
    inTreeName = inputCfg['input']['treename']
    DfList = [LoadDfFromRootOrParquet(filePath, inDirName, inTreeName)
              for filePath in inputCfg['input']['files']]
    print('Loading data files: Done!')

    print('Appling simple pre-filtering: ...', end='\r')
    # queries are paired with the inputs by position
    DfListSel = [df.query(query) for df, query in zip(DfList, inputCfg['queries'])]
    print('Pre-filtering: Done!')
    del DfList

    VarsToDraw = inputCfg['plotting_columns']
    LegLabels = inputCfg['output']['leg_labels']
    Colors = inputCfg['output']['colors']
    OutPutDir = inputCfg['output']['dir']

    for PtMin, PtMax, LimMin, LimMax in zip(inputCfg['pt_ranges']['min'],
                                            inputCfg['pt_ranges']['max'],
                                            inputCfg['plot_lim_min'],
                                            inputCfg['plot_lim_max']):
        print(f'Plot variable distributions --- {PtMin} < pT < {PtMax} GeV/c')
        DfListPt = [df.query(f'{PtMin} < pt_cand < {PtMax}') for df in DfListSel]

        DistrPlot = plot_utils.plot_distr(DfListPt, VarsToDraw, 1000, LegLabels,
                                          figsize=(6, 6), density=True,
                                          histtype='stepfilled', grid=False,
                                          log=True, colors=Colors, alpha=0.3)
        plt.subplots_adjust(left=0.1, bottom=0.05, right=0.95, top=0.95, hspace=0.4)

        # wrap a single returned object so that the loop below can iterate over it
        if not isinstance(DistrPlot, np.ndarray):
            DistrPlot = np.array([DistrPlot])
        print(len(DistrPlot), len(LimMin), len(LimMax), len(inputCfg['xaxes_label']))

        for ax, minVar, maxVar, xLabel in zip(DistrPlot, LimMin, LimMax,
                                              inputCfg['xaxes_label']):
            ax.set_xlim(minVar, maxVar)
            ax.set_xlabel(xLabel, fontsize=10, ha='right', position=(1, 20))
            ax.set_ylabel('Counts (arb. units)', fontsize=10, ha='right', position=(20, 1))
            # NOTE(review): plt.legend acts on the current axes, not on ``ax``;
            # confirm whether ax.legend(...) was intended
            plt.legend(frameon=False, fontsize=10, loc='best')
            ax.set_title('')
            # Example of additional text on the panel (disabled):
            # textstr = r'pp, $\sqrt{s}$ = 5.02 TeV'
            # textstr2 = r'$3 < p_{\mathrm{T}} < 4~\mathrm{GeV}/c$'
            # ax.text(0.56, 0.75, textstr, transform=ax.transAxes,
            #         fontsize=15, verticalalignment='top')
            # ax.text(0.56, 0.69, textstr2, transform=ax.transAxes,
            #         fontsize=15, verticalalignment='top')

        plt.tight_layout()
        plt.savefig(f'{OutPutDir}/NsigzoomDistrComp_pT_{PtMin}_{PtMax}.pdf')
        plt.close('all')
        # release the per-range selections before the next iteration
        del DfListPt
    del DfListSel
# Fragment of a larger routine: selToApply, cutVars, isMC, args and the
# dfPrompt / dfFD / dfAll data frames are defined outside this excerpt.
LegLabels = ['before selection', 'after selection']
varsToRemove = ['pt_B'] # HARD CODED
# one selection string per pT interval, paired by position with the pT limits
for (cuts, ptMin, ptMax) in zip(selToApply, cutVars['Pt']['min'], cutVars['Pt']['max']):
    print(f'Projecting distributions for {ptMin:.1f} < pT < {ptMax:.1f} GeV/c')
    if isMC:
        # first entry: pT window only; second entry: full selection applied
        # to the float-cast data frame
        dfPromptList = [dfPrompt.query(f'{ptMin} < pt_cand < {ptMax}'), dfPrompt.astype(float).query(cuts)]
        dfFDList = [dfFD.query(f'{ptMin} < pt_cand < {ptMax}'), dfFD.astype(float).query(cuts)]
        # draw every column except the ones listed in varsToRemove
        varsToDraw = list(dfPromptList[0].columns)
        for varToRemove in varsToRemove:
            if varToRemove in varsToDraw:
                varsToDraw.remove(varToRemove)
        plot_utils.plot_distr(dfPromptList, varsToDraw, 100, LegLabels, figsize=(12, 7), density=True)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
        plt.savefig(f'{args.outputDir}/PromptDistrCompBeforeAfterSel_pT_{ptMin}_{ptMax}.pdf')
        plt.close('all')
        del dfPromptList
        # same list of variables is used for the FD sample
        plot_utils.plot_distr(dfFDList, varsToDraw, 100, LegLabels, figsize=(12, 7), density=True)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
        plt.savefig(f'{args.outputDir}/FDDistrCompBeforeAfterSel_pT_{ptMin}_{ptMax}.pdf')
        plt.close('all')
        del dfFDList
    else:
        # same before/after pair, built from the single dfAll data frame
        dfAllList = [dfAll.query(f'{ptMin} < pt_cand < {ptMax}'), dfAll.astype(float).query(cuts)]
        varsToDraw = list(dfAllList[0].columns)
# Fragment of a larger script: MODEL, DATA, SIG_DF, BKG_DF, TEST_SET and
# Y_TEST are defined outside this excerpt.
# Ranges handed to the Bayesian optimisation, one (min, max) pair per parameter
HYP_RANGES = {
    # # defines the maximum depth of a single tree (regularization)
    'max_depth': (5, 15),
    # 'learning_rate': (0.01, 0.3),  # learning rate
    'n_estimators': (5, 10),  # number of boosting trees
}
MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc')

# train and test the model with the updated hyperparameters
MODEL.train_test_model(DATA)
# DATA[2] is passed to predict and DATA[3] is used as reference below;
# presumably the test set and its labels - confirm against where DATA is built
Y_PRED = MODEL.predict(DATA[2])

# Calculate the BDT efficiency as a function of the BDT score
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(
    DATA[3], Y_PRED, n_points=10)

# --------------------------------------------
# PLOTTING
# --------------------------------------------
FEATURES_DISTRIBUTIONS_PLOT = plot_utils.plot_distr(
    [SIG_DF, BKG_DF], SIG_DF.columns)
CORRELATION_MATRIX_PLOT = plot_utils.plot_corr([SIG_DF, BKG_DF], SIG_DF.columns)
BDT_OUTPUT_PLOT = plot_utils.plot_output_train_test(MODEL, DATA)
ROC_CURVE_PLOT = plot_utils.plot_roc(DATA[3], Y_PRED)
PRECISION_RECALL_PLOT = plot_utils.plot_precision_recall(DATA[3], Y_PRED)
BDT_EFFICIENCY_PLOT = plot_utils.plot_bdt_eff(THRESHOLD, EFFICIENCY)
FEATURES_IMPORTANCE = plot_utils.plot_feature_imp(TEST_SET, Y_TEST, MODEL)
# display all the figures created above
plt.show()
# ---------------------------------------------
size=int(0.8 * signal_tree_handler.get_n_cand()), rndm_state=RANDOM_STATE) del background_tree_handler_full # features plot leg_labels = ['background', 'signal'] # second condition needed because of issue with Qt libraries if MAKE_FEATURES_PLOTS and not MAKE_PRESELECTION_EFFICIENCY: if not os.path.isdir(f'{PLOT_DIR}/features'): os.mkdir(f'{PLOT_DIR}/features') plot_utils.plot_distr( [background_tree_handler, signal_tree_handler], TRAINING_COLUMNS_LIST, bins=50, labels=leg_labels, log=True, density=True, figsize=(12, 7), alpha=0.3, grid=False) plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55) plt.savefig(f'{PLOT_DIR}/features/FeaturePlots_{bin}') plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['background']) plt.savefig( f'{PLOT_DIR}/features/BackgroundCorrelationMatrix_{bin}'
def do_hipe4mlplot(self):
    """Save the hipe4ml control plots of the current pT bin as PDF files.

    The files are written to self.dirmlplot: feature distributions,
    correlation matrices, ML output distributions, ROC curves (test only
    and train/test), precision-recall curves and feature importances.
    """
    self.logger.info("Plotting hipe4ml model")

    class_names = ["Background", "Prompt signal"]
    file_tags = ["Bkg", "SigPrompt"]
    # common tail of every output file name
    pt_tail = f'pT_{self.p_binmin}_{self.p_binmax}.pdf'
    test_labels_idx = 3

    # --- feature distributions
    plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                          self.v_train, 100, class_names)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                        hspace=0.55, wspace=0.55)
    plt.savefig(f'{self.dirmlplot}/DistributionsAll_{pt_tail}')
    plt.close('all')

    # --- correlation matrices, one figure per class
    corr_figs = plot_utils.plot_corr([self.bkghandler, self.signalhandler],
                                     self.v_train, class_names)
    for corr_fig, tag in zip(corr_figs, file_tags):
        # make the figure current so that subplots_adjust acts on it
        plt.figure(corr_fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        corr_fig.savefig(f'{self.dirmlplot}/CorrMatrix{tag}_{pt_tail}')

    # --- model output for train and test samples
    plt.rcParams["figure.figsize"] = (10, 7)
    output_fig = plot_utils.plot_output_train_test(
        self.p_hipe4ml_model, self.traintestdata, 80,
        self.raw_output_hipe4ml, class_names,
        self.train_test_log_hipe4ml, density=True)
    output_fig.savefig(f'{self.dirmlplot}/MLOutputDistr_{pt_tail}')

    # --- ROC curve
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_fig = plot_utils.plot_roc(
        self.traintestdata[test_labels_idx], self.ypredtest_hipe4ml, None,
        class_names, self.average_method_hipe4ml, self.roc_method_hipe4ml)
    roc_fig.savefig(f'{self.dirmlplot}/ROCCurveAll_{pt_tail}')

    # --- ROC curve, train vs test
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_tt_fig = plot_utils.plot_roc_train_test(
        self.traintestdata[test_labels_idx], self.ypredtest_hipe4ml,
        self.traintestdata[1], self.ypredtrain_hipe4ml, None,
        class_names, self.average_method_hipe4ml, self.roc_method_hipe4ml)
    roc_tt_fig.savefig(f'{self.dirmlplot}/ROCCurveTrainTest_{pt_tail}')

    # --- precision-recall curve
    prec_rec_fig = plot_utils.plot_precision_recall(
        self.traintestdata[test_labels_idx], self.ypredtest_hipe4ml,
        class_names)
    prec_rec_fig.savefig(f'{self.dirmlplot}/PrecisionRecallAll_{pt_tail}')

    # --- feature importance, one file per returned figure
    plt.rcParams["figure.figsize"] = (12, 7)
    importance_figs = plot_utils.plot_feature_imp(
        self.traintestdata[2][self.v_train],
        self.traintestdata[test_labels_idx],
        self.p_hipe4ml_model, class_names)
    for idx, importance_fig in enumerate(importance_figs):
        importance_fig.savefig(
            f'{self.dirmlplot}/FeatureImportanceOpt{idx}_{pt_tail}')
] else: training_columns = [ 'TPCnSigmaHe3', 'ct', 'V0CosPA', 'ProngsDCA', 'He3ProngPvDCA', 'PiProngPvDCA', 'He3ProngPvDCAXY', 'PiProngPvDCAXY', 'NpidClustersHe3', 'TPCnSigmaPi' ] if not os.path.exists(results_ml_path): os.makedirs(results_ml_path) distr = pu.plot_distr([bkgH, signalH], training_columns, bins=63, labels=['Signal', "Background"], colors=["blue", "red"], log=True, density=True, figsize=(18, 13), alpha=0.3, grid=False) plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55) plt.savefig(results_ml_path + "/features_distributions.png", bbox_inches='tight') corr = pu.plot_corr([signalH, bkgH], training_columns + ["m"], ['Signal', "Background"]) corr[0].savefig(results_ml_path + "/correlations.png", bbox_inches='tight')