def CompareModelwithandwithoutratios(DataSet):
    #### Train model
    paramList = {
        'subsample': 1,
        'reg_gamma': 0.4,
        'reg_alpha': 0.1,
        'n_estimators': 200,
        'min_split_loss': 2,
        'min_child_weight': 5,
        'max_depth': 5,
        'learning_rate': 0.1
    }
    DataSet = DataCuts(DataSet)
    XGBModel = TreeModel(DataSet, ApplyDataCut=False, paramList=paramList)
    XGBModel.XGBoostTrain()
    AMSScore = dict()
    AMSScore['All_features'] = XGBModel.AMSScore(DataSet)

    ### No HT
    DataSet2 = DataSet.drop(['HT', 'ST'], axis=1)
    XGBModel = TreeModel(DataSet2, ApplyDataCut=False, paramList=paramList)
    XGBModel.XGBoostTrain()
    AMSScore['NO_HT'] = XGBModel.AMSScore(DataSet2)

    ### No ratios
    DataSet2 = DataSet.drop([
        'DER_PT_leading_lepton_ratio_PT_leading_jet',
        'DER_PT_leading_lept_ratio_HT',
        'DER_ST_ratio_PT_Leading_jet',
        'DER_ST_ratio_HT',
        'DER_PT_subleading_lepton_ratio_PT_leading_jet',
        'DER_PT_subleading_lepton_ratio_HT'
    ], axis=1)
    XGBModel = TreeModel(DataSet2, ApplyDataCut=False, paramList=paramList)
    XGBModel.XGBoostTrain()
    #XGBModel.XGBoostTrain(UseF1Score=True)
    AMSScore['NO_ratio'] = XGBModel.AMSScore(DataSet2)
    return AMSScore
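
# Hedged usage sketch for CompareModelwithandwithoutratios. It only needs a
# combined signal+background DataFrame; the CSV paths and the 400/96 mass
# point below mirror paths used elsewhere in this project and are illustrative
# rather than values fixed by this function.
if __name__ == '__main__':
    Background = pd.read_csv(r'I:\CSV\Background_Events\EventData.csv')
    Signal = pd.read_csv(r'I:\CSV\Events_PPtoSmuonSmuon_Smuon_Mass_400_Neatralino_96\EventData.csv')
    DataSet = pd.concat([Background, Signal]).drop('EventID', axis=1)
    Scores = CompareModelwithandwithoutratios(DataSet)
    print(Scores)  # AMS with all features, without HT/ST, and without the ratio features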
    def MultiThreadTest(self, SMuon_Neutralino):
        SMuon, Neutralino = SMuon_Neutralino
        SignalEvents = pd.read_csv(
            r'I:\CSV\Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}\EventData.csv'
            .format(SMuon, Neutralino))
        SignalEvents.drop(['EventID'], axis=1, inplace=True)
        DataSet = pd.concat([self.BackGroundDataTest, SignalEvents])
        DataSet = DataSet.sample(frac=1)  # reassign so the shuffle actually takes effect
        DataSet = DataCuts(DataSet)
        RenameDataBaseColumns(DataSet)
        F1Score = self.XGBModel.ModelPredictions(DataSet, Metric='f1')
        AUCScores = self.XGBModel.ModelPredictions(DataSet, Metric='auc')
        SigWeight = DataSet.Events_weight[DataSet.Label == 1].sum()
        self.Results['Smuon_Mass_{}_Neatralino_{}'.format(SMuon, Neutralino)] = {
            'AMS Score': self.XGBModel.AMSScore(DataSet),
            'F1 Score': F1Score,
            'auc Score': AUCScores,
            'Signal Weight': SigWeight
        }
    def __init__(self, SMuonForModel, NeutralinoForModel, UseF1Score=False):
        BackGroundData = pd.read_csv(r'I:\CSV\Background_Events\EventData.csv')
        BackGroundData.drop('EventID', axis=1, inplace=True)
        self.BackGroundDataTest = pd.read_csv(
            r'I:\CSV\Background_Events_test\EventData.csv')
        self.BackGroundDataTest.drop('EventID', axis=1, inplace=True)
        SignalEvents = pd.read_csv(
            r'I:\CSV\Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}\EventData.csv'
            .format(SMuonForModel, NeutralinoForModel))
        SignalEvents.drop(['EventID'], axis=1, inplace=True)
        DataSet = pd.concat([BackGroundData, SignalEvents])
        DataSet = DataSet.sample(frac=1)  # reassign so the shuffle actually takes effect
        DataSet = DataCuts(DataSet)
        RenameDataBaseColumns(DataSet)
        JSONParameters = RetrieveDictionary(
            r'I:\CSV\HyperparameterDictionary.json')
        paramList = JSONParameters['Smuon_Mass_{}_Neatralino_{}'.format(
            SMuonForModel, NeutralinoForModel)]
        self.Results = dict()
        self.TrainModel(DataSet, paramList, UseF1Score)
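
# Hedged usage sketch: the class that owns __init__ and MultiThreadTest is not
# named in this excerpt, so "SignalScan" below is a hypothetical placeholder.
# The sketch trains one model at the 400/96 point and evaluates it on the other
# mass points that appear elsewhere in this project.
if __name__ == '__main__':
    Scan = SignalScan(SMuonForModel=400, NeutralinoForModel=96)  # hypothetical class name
    for MassPoint in [(200, 96), (200, 195), (400, 96), (400, 195)]:
        Scan.MultiThreadTest(MassPoint)
    print(Scan.Results)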
def SHAPValuesTest(Feature='All'):
    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'], axis=1, inplace=True)
    if Feature == 'All':
        TestColumns = TestDataSet1.columns
    else:
        if type(Feature) == str:
            TestColumns = [Feature]
        elif type(Feature) == list:
            TestColumns = Feature
        else:
            print('Feature needs to be of type string or list.')
            return

    for Column in TestColumns:
        Columns = TestDataSet1.columns
        Columns = Columns.drop(['PRI_nleps', 'PRI_jets', 'Events_weight', 'Label'] + [Column])
        TestDataSet = TestDataSet1.drop(Columns, axis=1)
        paramList = {
            'subsample': 1,
            'reg_gamma': 0.4,
            'reg_alpha': 0.1,
            'n_estimators': 200,
            'min_split_loss': 2,
            'min_child_weight': 5,
            'max_depth': 5,
            'learning_rate': 0.1,
            'base_score': 0.9
        }
        XGBModel = TreeModel(TestDataSet, paramList, SubSampleDataSet=False, ApplyDataCut=False)
        XGBModel.XGBoostTrain()
        XGBModel.SHAPValuePlots()

        AddedColumns = [Column]
        while len(Columns) > 0:
            AddedColumns.append(Columns[0])
            Columns = Columns.drop(Columns[0])
            TestDataSet = TestDataSet1.drop(Columns, axis=1)
            XGBModel = TreeModel(TestDataSet, paramList, SubSampleDataSet=False, ApplyDataCut=False)
            XGBModel.XGBoostTrain()
            XGBModel.SHAPValuePlots()
def HyperParameters(Smuon_Mass, Neutralino_Mass, SignalEventCSV, BackgroundCSV,
                    NoofTests, Noof_jobs):
    HyperParameterResults = dict()
    BackGroundData = pd.read_csv(os.path.join(BackgroundCSV, 'EventData.csv'))
    BackGroundData.drop('EventID', axis=1, inplace=True)
    SignalEvents = pd.read_csv(
        os.path.join(
            SignalEventCSV,
            'Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}/EventData.csv'.format(
                Smuon_Mass, Neutralino_Mass)))
    SignalEvents.drop(['EventID'], axis=1, inplace=True)
    DataSet = pd.concat([BackGroundData, SignalEvents])
    DataSet = DataSet.sample(frac=1)  # reassign so the shuffle actually takes effect
    DataSet = DataCuts(DataSet)
    XGBModel = TreeModel(DataSet, ApplyDataCut=False)
    XGBModel.HyperParameterTuning(NoofTests, Noof_jobs)
    return XGBModel.HyperParameters
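
# Hedged usage sketch for HyperParameters: the directories reuse the I:\CSV
# layout seen in this project, while the mass point, number of search
# iterations, and job count are illustrative placeholders.
if __name__ == '__main__':
    BestParams = HyperParameters(Smuon_Mass=400,
                                 Neutralino_Mass=96,
                                 SignalEventCSV=r'I:\CSV',
                                 BackgroundCSV=r'I:\CSV\Background_Events',
                                 NoofTests=50,
                                 Noof_jobs=4)
    print(BestParams)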
def TestOneFeature():
    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'], axis=1, inplace=True)
    Columns = TestDataSet1.columns
    Columns = Columns.drop(['PRI_nleps', 'PRI_jets', 'Events_weight', 'Label', 'DER_ST_ratio_HT'])
    TestDataSet = TestDataSet1.drop(Columns, axis=1)
    PCAPlots = PCAPlotter(TestDataSet, 'Label')
    PCAPlots.PCAAnalysis()
    print(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2']['DER_ST_ratio_HT'])
    ST_HT_ratio_Percentage = [PCAPlots.FeaturePCAValues['Leptons 2 Jets 2']['DER_ST_ratio_HT']]

    AddedColumns = ['DER_ST_ratio_HT']
    while len(Columns) > 0:
        AddedColumns.append(Columns[0])
        Columns = Columns.drop(Columns[0])
        TestDataSet = TestDataSet1.drop(Columns, axis=1)
        PCAPlots = PCAPlotter(TestDataSet, 'Label')
        PCAPlots.PCAAnalysis()
        print(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2']['DER_ST_ratio_HT'])
        ST_HT_ratio_Percentage.append(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2']['DER_ST_ratio_HT'])

    # Bar chart of the PCA contribution of DER_ST_ratio_HT as features are added.
    ax2 = plt.gca()
    X = np.arange(len(ST_HT_ratio_Percentage))
    width = 0.25
    List = [ST_HT_ratio_Percentage[i][0] for i in range(len(ST_HT_ratio_Percentage))]
    ax2.bar(X - width / 2, List, width, color='b', label='PCA1')
    List = [ST_HT_ratio_Percentage[i][1] for i in range(len(ST_HT_ratio_Percentage))]
    ax2.bar(X + width / 2, List, width, color='r', label='PCA2')
    ax2.set_ylabel('Percentage of PCA score')
    ax2.set_title('Percentage that each feature makes up of the PCA value')
    ax2.set_xlabel('Feature added in iteration')
    ax2.set_xticks(X)
    ax2.set_xticklabels(AddedColumns, rotation='vertical')
    ax2.legend()
def Pipeline(DataSet, paramList=None, Plot_titles=None):
    DataSet = DataCuts(DataSet)
    Key = {
        'PRI_nleps': r'$N_{\ell}$',
        'PRI_jets': r'$N_{jets}$',
        'PRI_leading_jet_pt': r'$jet_{PT}^{(1)}$',
        'PRI_subleading_jet_pt': r'$jet_{PT}^{(2)}$',
        'PRI_leading_jet_eta': r'$jet_{\eta}^{(1)}$',
        'PRI_subleading_jet_eta': r'$jet_{\eta}^{(2)}$',
        'PRI_lep_leading_pt': r'$\ell_{PT}^{(1)}$',
        'PRI_lep_subleading_pt': r'$\ell_{PT}^{(2)}$',
        'PRI_lep_leading_eta': r'$\ell_{\eta}^{(1)}$',
        'PRI_lep_subleading_eta': r'$\ell_{\eta}^{(2)}$',
        'PRI_lep_leading_phi': r'$\ell_{\phi}^{(1)}$',
        'PRI_lep_subleading_phi': r'$\ell_{\phi}^{(2)}$',
        'DER_P_T_ratio_lep_pair': r'$\frac{\ell_{PT}^{(1)}}{\ell_{PT}^{(2)}}$',
        'DER_Diff_Eta_lep_pair': r'$abs(\ell_{\eta}^{(1)} - \ell_{\eta}^{(2)})$',
        'DER_Diff_Phi_lep_pair': r'$abs(\ell_{\phi}^{(1)} - \ell_{\phi}^{(2)})$',
        'DER_sum_P_T': r'$\sum(PT)$',
        'PRI_Missing_pt': r'MissingPT',
        'DER_PT_leading_lepton_ratio_PT_leading_jet': r'$\frac{\ell_{PT}^{(1)}}{jet_{PT}^{(1)}}$',
        'DER_PT_leading_lept_ratio_HT': r'$\frac{\ell_{PT}^{(1)}}{HT}$',
        'DER_ST_ratio_PT_Leading_jet': r'$\frac{ST}{jet_{PT}^{(1)}}$',
        'DER_ST_ratio_HT': r'$\frac{ST}{HT}$',
        'DER_PT_subleading_lepton_ratio_PT_leading_jet': r'$\frac{\ell_{PT}^{(2)}}{jet_{PT}^{(1)}}$',
        'DER_PT_subleading_lepton_ratio_HT': r'$\frac{\ell_{PT}^{(2)}}{HT}$'
    }
    try:
        DataSet.drop(['EventID'], axis=1, inplace=True)
    except KeyError:
        pass
    PCAPlots = PCAPlotter(DataSet, 'Label', Key)
    PCAPlots.PCAAnalysis()
    DataSet.rename(columns=Key, inplace=True)
    if paramList is None:
        XGBModel = TreeModel(DataSet, ApplyDataCut=False)
        XGBModel.HyperParameterTuning()
    else:
        XGBModel = TreeModel(DataSet, ApplyDataCut=False, paramList=paramList)
    XGBModel.XGBoostTrain()
    MeanSHAPValues = XGBModel.SHAPValuePlots(Plot_titles)
    MeanPermValues = XGBModel.FeaturePermutation(usePredict_poba=False, Plot_Title=Plot_titles)

    #PCAMag = {}
    #for items in PCAPlots.FeaturePCAValues['Leptons 2 Jets 2']:
    #    PCAMag[items] = np.sqrt(sum(abs(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2'][items])))
    #PCAMag.pop('PRI_nleps')
    #PCAMag.pop('PRI_jets')
    #PCAMag = dict(sorted(PCAMag.items(), key=lambda item: item[1]))
    #
    #DropColumns = list(PCAMag.keys())[:8]
    #print(DropColumns)
    #
    #DataSet.drop(DropColumns,axis=1,inplace=True)
    #DataSet.drop('DER_PT_subleading_lepton_ratio_PT_leading_jet',axis=1,inplace = True)
    #PCAPlots = PCAPlotter(DataSet,'Label')
    #PCAPlots.PCAAnalysis()
    #if paramList == None:
    #    XGBModel = TreeModel(DataSet,SubSampleDataSet=False,ApplyDataCut=False)
    #    XGBModel.HyperParameterTuning()
    #else:
    #    XGBModel = TreeModel(DataSet,SubSampleDataSet=False,ApplyDataCut=False, paramList=paramList)
    #
    #XGBModel.XGBoostTrain()
    #XGBModel.SHAPValuePlots(Plot_titles)

    return MeanSHAPValues, MeanPermValues
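
# Hedged usage sketch for Pipeline: the signal/background CSVs reuse paths that
# appear elsewhere in this project, and passing paramList=None triggers
# hyperparameter tuning (a dictionary like the one in
# CompareModelwithandwithoutratios skips straight to training). Plot_titles is
# an illustrative label.
if __name__ == '__main__':
    Signal = pd.read_csv(r'I:\Results For Particle Physics\PCA TestsFolder\Signal\Events_PPtoSmuonSmuon_Smuon_Mass_400_Neatralino_96\EventData.csv')
    BackGround = pd.read_csv(r'I:\Results For Particle Physics\PCA TestsFolder\Background\Events_PPtoTopTopBar\EventData.csv')
    # Assumes both CSVs carry the 'Label' column that Pipeline expects.
    MeanSHAP, MeanPerm = Pipeline(pd.concat([BackGround, Signal]),
                                  paramList=None,
                                  Plot_titles='Smuon 400, Neutralino 96')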
def TestColumns(Feature='All', ShowPCAPlots=True):
    """
    Returns the percentage contribution that the selected feature(s) make to
    the PCA values. Each provided feature is checked against all the other
    columns, adding one column at a time.

    Parameters
    ----------
    Feature : str or list, optional
        The default is 'All', which sequentially tests every feature in the
        dataset. A single feature name or a list of feature names can also be
        passed.
    ShowPCAPlots : bool, optional
        Whether to display the PCA plots for each iteration. The default is True.

    Returns
    -------
    None.
    """
    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'], axis=1, inplace=True)
    if Feature == 'All':
        TestColumns = TestDataSet1.columns
    else:
        if type(Feature) == str:
            TestColumns = [Feature]
        elif type(Feature) == list:
            TestColumns = Feature
        else:
            print('Feature needs to be of type string or list.')
            return

    for Column in TestColumns:
        Columns = TestDataSet1.columns
        Columns = Columns.drop(['PRI_nleps', 'PRI_jets', 'Events_weight', 'Label'] + [Column])
        TestDataSet = TestDataSet1.drop(Columns, axis=1)
        PCAPlots = PCAPlotter(TestDataSet, 'Label')
        PCAPlots.PCAAnalysis(ShowPlots=ShowPCAPlots)
        print(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2'][Column])
        Column_Percentage = [PCAPlots.FeaturePCAValues['Leptons 2 Jets 2'][Column]]

        AddedColumns = [Column]
        while len(Columns) > 0:
            AddedColumns.append(Columns[0])
            Columns = Columns.drop(Columns[0])
            TestDataSet = TestDataSet1.drop(Columns, axis=1)
            PCAPlots = PCAPlotter(TestDataSet, 'Label')
            PCAPlots.PCAAnalysis(ShowPlots=ShowPCAPlots)
            print(PCAPlots.FeaturePCAPercentage['Leptons 2 Jets 2'][Column])
            Column_Percentage.append(PCAPlots.FeaturePCAPercentage['Leptons 2 Jets 2'][Column])

        # Bar chart of the PCA contribution of the starting feature as columns are added.
        X = np.arange(len(Column_Percentage))
        Barplot = plt.figure()
        ax = Barplot.add_axes([0, 0, 1, 1])
        width = 0.25
        List = [Column_Percentage[i][0] for i in range(len(Column_Percentage))]
        ax.bar(X - width / 2, List, width, color='b', label='PCA1')
        List = [Column_Percentage[i][1] for i in range(len(Column_Percentage))]
        ax.bar(X + width / 2, List, width, color='r', label='PCA2')
        ax.set_ylabel('Percentage of PCA score')
        ax.set_title('Percentage that each feature makes up of the PCA value starting with the {} feature'.format(Column))
        ax.set_xlabel('Number of features included in iteration')
        ax.set_xticks(X)
        ax.set_xticklabels(AddedColumns, rotation='vertical')
        ax.legend()
        Barplot.savefig('Percentage Plot.png')
if __name__ == "__main__":
    #TestColumns(Feature = 'DER_PT_subleading_ratio_HT',ShowPCAPlots = True)
    #TestColumns(Feature = 'DER_ST_ratio_HT', ShowPCAPlots = False)
    #TestColumns(Feature = 'DER_sum_P_T',ShowPCAPlots = False)
    #TestColumns(Feature = 'PRI_Missing_pt',ShowPCAPlots = False)
    #TestColumns('All')
    #TestTreeModelWeights(Feature = 'DER_PT_subleading_ratio_HT')
    #TestTreeModelWeights(Feature = 'DER_ST_ratio_HT')
    #TestTreeModelWeights(Feature = 'DER_sum_P_T')
    #TestTreeModelWeights(Feature = 'PRI_Missing_pt')
    #SHAPValuesTest(Feature = 'DER_PT_subleading_ratio_HT')

    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'], axis=1, inplace=True)
    PCAPlots = PCAPlotter(TestDataSet1, 'Label')
    PCAPlots.PCAAnalysis(MinNoofJets=1, MaxNoofJets=1, MinNoofLeptons=1, MaxNoofLeptons=1)
    PCAPlots.PCAAnalysis(MinNoofJets=1, MaxNoofJets=2, MinNoofLeptons=1, MaxNoofLeptons=1)
    PCAPlots.PCAAnalysis(MinNoofJets=1, MaxNoofJets=1, MinNoofLeptons=1, MaxNoofLeptons=2)
    PCAPlots.PCAAnalysis(MinNoofJets=1, MaxNoofJets=2, MinNoofLeptons=1, MaxNoofLeptons=2)
def FeaturePlots():
    # Produce feature plots for each of the four signal mass points.
    for Path in [
            r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_200_Neutralino_96\EventData.csv',
            r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_200_Neutralino_195\EventData.csv',
            r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv',
            r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_195\EventData.csv'
    ]:
        TestDataSet1 = pd.read_csv(Path)
        TestDataSet1 = DataCuts(TestDataSet1)
        TestDataSet1 = RemoveFeatures(TestDataSet1)
        TestDataSet1.drop(['EventID', 'Events_weight'], axis=1, inplace=True)
        Feature_Plots_PCA.FeaturePlots(TestDataSet1, 'Label')
    return ax


Signal = pd.read_csv(
    r'I:\Results For Particle Physics\PCA TestsFolder\Signal\Events_PPtoSmuonSmuon_Smuon_Mass_400_Neatralino_96\EventData.csv'
)
BackGround = pd.read_csv(
    r'I:\Results For Particle Physics\PCA TestsFolder\Background\Events_PPtoTopTopBar\EventData.csv'
)
BackGround['Label'] = 'TTBar'  # bracket assignment guarantees a column is created
Signal['Label'] = 'Signal'
DataSet = pd.concat([BackGround, Signal])
DataSet = DataCuts(DataSet)
AllFeature = RemoveFeaturesNotinPaper(DataSet)
#AllFeature = AllFeature.sample(n = 10000)
FeaturePlots(AllFeature, 'Label')
PairPlots = Displotter(AllFeature, 'Label')
PairPlots.PairPlotAnalysis()
sns.displot(AllFeature, x='HT', hue='Label', kind='kde')
PCAPlots = PCAPlotter(AllFeature, 'Label')
PCAPlots.PCAAnalysis()