Python DataCutsの例、XGBoosterModel.DataCuts Pythonの例

コード例 #1

0

ファイルを表示

def CompareModelwithandwithoutratios(DataSet):
    """Train the tree model on three feature sets and collect their AMS scores.

    The data cuts are applied once, then a model is trained and scored on
    (1) every column, (2) the data without 'HT' and 'ST', and (3) the data
    without the six 'DER_*ratio*' columns.

    Parameters
    ----------
    DataSet : DataFrame-like
        Event data passed to ``DataCuts``; the result must support
        ``drop(..., axis=1)``.

    Returns
    -------
    dict
        Keys 'All_features', 'NO_HT' and 'NO_ratio', each mapped to the value
        returned by ``TreeModel.AMSScore`` for the corresponding data.
    """
    hyper_params = {
        'subsample': 1,
        'reg_gamma': 0.4,
        'reg_alpha': 0.1,
        'n_estimators': 200,
        'min_split_loss': 2,
        'min_child_weight': 5,
        'max_depth': 5,
        'learning_rate': 0.1
    }

    ratio_features = [
        'DER_PT_leading_lepton_ratio_PT_leading_jet',
        'DER_PT_leading_lept_ratio_HT', 'DER_ST_ratio_PT_Leading_jet',
        'DER_ST_ratio_HT', 'DER_PT_subleading_lepton_ratio_PT_leading_jet',
        'DER_PT_subleading_lepton_ratio_HT'
    ]

    def _train_and_score(frame):
        # Fit a fresh model on `frame` and score it on the same data.
        model = TreeModel(frame, ApplyDataCut=False, paramList=hyper_params)
        model.XGBoostTrain()
        return model.AMSScore(frame)

    # The cuts are applied here, so every model is built with ApplyDataCut=False.
    cut_data = DataCuts(DataSet)

    scores = {}
    scores['All_features'] = _train_and_score(cut_data)
    scores['NO_HT'] = _train_and_score(cut_data.drop(['HT', 'ST'], axis=1))
    scores['NO_ratio'] = _train_and_score(cut_data.drop(ratio_features, axis=1))
    return scores

コード例 #2

0

ファイルを表示

    def MultiThreadTest(self, SMuon_Neutralino):
        """Score the trained model on one signal sample and store the metrics.

        Parameters
        ----------
        SMuon_Neutralino : tuple
            Pair ``(SMuon, Neutralino)``; both values are formatted into the
            signal folder name and into the key used in ``self.Results``.

        Returns
        -------
        None
            The metrics are written to
            ``self.Results['Smuon_Mass_{}_Neatralino_{}']``.
        """
        SMuon, Neutralino = SMuon_Neutralino

        # Raw string: the Windows path contains backslashes that must not be
        # read as escape sequences ('\C' and '\E' are invalid escapes).
        SignalEvents = pd.read_csv(
            r'I:\CSV\Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}\EventData.csv'
            .format(SMuon, Neutralino))
        SignalEvents.drop(['EventID'], axis=1, inplace=True)

        DataSet = pd.concat([self.BackGroundDataTest, SignalEvents])
        # sample() returns a new frame; keep it so the rows are really shuffled.
        DataSet = DataSet.sample(frac=1)

        DataSet = DataCuts(DataSet)

        RenameDataBaseColumns(DataSet)

        F1Score = self.XGBModel.ModelPredictions(DataSet, Metric='f1')
        AUCScores = self.XGBModel.ModelPredictions(DataSet, Metric='auc')
        # Summed event weight of the rows labelled as signal (Label == 1).
        SigWeight = DataSet.Events_weight[DataSet.Label == 1].sum()
        self.Results['Smuon_Mass_{}_Neatralino_{}'.format(
            SMuon, Neutralino)] = {
                'AMS Score': self.XGBModel.AMSScore(DataSet),
                'F1 Score': F1Score,
                'auc Score': AUCScores,
                'Signal Weight': SigWeight
            }

コード例 #3

0

ファイルを表示

    def __init__(self, SMuonForModel, NeutralinoForModel, UseF1Score=False):
        """Load background and signal events and train the model on them.

        Parameters
        ----------
        SMuonForModel, NeutralinoForModel
            Values formatted into the signal folder name and into the key
            looked up in the hyperparameter JSON dictionary.
        UseF1Score : bool, optional
            Forwarded unchanged to ``self.TrainModel``. The default is False.
        """
        BackGroundData = pd.read_csv(r'I:\CSV\Background_Events\EventData.csv')
        BackGroundData.drop('EventID', axis=1, inplace=True)
        # Separate background sample kept on the instance for later use.
        self.BackGroundDataTest = pd.read_csv(
            r'I:\CSV\Background_Events_test\EventData.csv')
        self.BackGroundDataTest.drop('EventID', axis=1, inplace=True)

        # Raw string, like the other paths here: '\C' and '\E' are invalid
        # escape sequences in a normal string literal.
        SignalEvents = pd.read_csv(
            r'I:\CSV\Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}\EventData.csv'
            .format(SMuonForModel, NeutralinoForModel))
        SignalEvents.drop(['EventID'], axis=1, inplace=True)

        DataSet = pd.concat([BackGroundData, SignalEvents])
        # sample() returns a new frame; keep it so the rows are really shuffled.
        DataSet = DataSet.sample(frac=1)

        DataSet = DataCuts(DataSet)

        RenameDataBaseColumns(DataSet)

        JSONParameters = RetrieveDictionary(
            r'I:\CSV\HyperparameterDictionary.json')
        paramList = JSONParameters['Smuon_Mass_{}_Neatralino_{}'.format(
            SMuonForModel, NeutralinoForModel)]

        self.Results = dict()

        self.TrainModel(DataSet, paramList, UseF1Score)

コード例 #4

0

ファイルを表示

def SHAPValuesTest(Feature = 'All'):
    """Train models on a growing set of features and plot their SHAP values.

    For each tested feature a model is first trained on that feature plus the
    always-kept columns ('PRI_nleps', 'PRI_jets', 'Events_weight', 'Label').
    The remaining columns are then added back one at a time, and a new model
    is trained and plotted after each addition.

    Parameters
    ----------
    Feature : str or list, optional
        A single feature name, a list of feature names, or 'All' (the default)
        to test every column of the data set in turn.

    Raises
    ------
    TypeError
        If ``Feature`` is neither a string nor a list.

    Returns
    -------
    None.
    """
    # Fail before any I/O; the original only printed a message here and then
    # crashed with a NameError on the undefined feature list.
    if not isinstance(Feature, (str, list)):
        raise TypeError('Feature needs to be of type string or list.')

    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'],axis=1,inplace=True)

    if isinstance(Feature, list):
        TestColumns = Feature
    elif Feature == 'All':
        TestColumns = TestDataSet1.columns
    else:
        TestColumns = [Feature]

    for Column in TestColumns:
        # Columns still excluded from the model; shrinks by one per iteration.
        Columns = TestDataSet1.columns
        Columns = Columns.drop(['PRI_nleps','PRI_jets','Events_weight', 'Label'] + [Column])
        TestDataSet = TestDataSet1.drop(Columns,axis=1)
        paramList ={'subsample': 1, 'reg_gamma': 0.4, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_split_loss': 2, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.1, 'base_score': 0.9}
        XGBModel = TreeModel(TestDataSet,paramList,SubSampleDataSet=False,ApplyDataCut=False)
        XGBModel.XGBoostTrain()
        XGBModel.SHAPValuePlots()
        while len(Columns) > 0:
            # Re-admit the next excluded column, then retrain and replot.
            Columns = Columns.drop(Columns[0])
            TestDataSet = TestDataSet1.drop(Columns,axis=1)
            XGBModel = TreeModel(TestDataSet,paramList,SubSampleDataSet=False,ApplyDataCut=False)
            XGBModel.XGBoostTrain()
            XGBModel.SHAPValuePlots()

コード例 #5

0

ファイルを表示

ファイル: HyperParameterTuning.py プロジェクト: GerhardHarmsen/Physics-Machine-Learning-project

def HyperParameters(Smuon_Mass, Neutralino_Mass, SignalEventCSV, BackgroundCSV,
                    NoofTests, Noof_jobs):
    """Tune the tree model's hyperparameters for one signal sample.

    Parameters
    ----------
    Smuon_Mass, Neutralino_Mass
        Values formatted into the signal folder name
        'Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}'.
    SignalEventCSV : str
        Directory that contains the signal folders.
    BackgroundCSV : str
        Directory that contains the background 'EventData.csv'.
    NoofTests, Noof_jobs
        Forwarded unchanged to ``TreeModel.HyperParameterTuning``.

    Returns
    -------
    The ``HyperParameters`` attribute of the tuned ``TreeModel``.
    """
    BackGroundData = pd.read_csv(os.path.join(BackgroundCSV, 'EventData.csv'))
    BackGroundData.drop('EventID', axis=1, inplace=True)

    SignalEvents = pd.read_csv(
        os.path.join(
            SignalEventCSV,
            'Events_PPtoSmuonSmuon_Smuon_Mass_{}_Neatralino_{}/EventData.csv'.
            format(Smuon_Mass, Neutralino_Mass)))
    SignalEvents.drop(['EventID'], axis=1, inplace=True)

    DataSet = pd.concat([BackGroundData, SignalEvents])
    # sample() returns a new frame; keep it so the rows are really shuffled.
    DataSet = DataSet.sample(frac=1)

    DataSet = DataCuts(DataSet)

    # The cuts were applied above, so the model must not apply them again.
    XGBModel = TreeModel(DataSet, ApplyDataCut=False)

    XGBModel.HyperParameterTuning(NoofTests, Noof_jobs)

    return XGBModel.HyperParameters

コード例 #6

0

ファイルを表示

def TestOneFeature():
    """Plot the PCA values of 'DER_ST_ratio_HT' as features are added back.

    A PCA is first run on 'DER_ST_ratio_HT' plus the always-kept columns
    ('PRI_nleps', 'PRI_jets', 'Events_weight', 'Label'). The other columns are
    then re-added one at a time and the PCA is repeated after each addition.
    The first two entries of every recorded value are drawn as paired bars on
    the current matplotlib axes.

    Returns
    -------
    None.
    """
    full_data = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    full_data = DataCuts(full_data)
    full_data.drop(['EventID'],axis=1,inplace=True)

    def _pca_value(excluded):
        # Run the PCA without the `excluded` columns; print and return the
        # value stored for the tracked feature.
        plotter = PCAPlotter(full_data.drop(excluded,axis=1),'Label')
        plotter.PCAAnalysis()
        value = plotter.FeaturePCAValues['Leptons 2 Jets 2']['DER_ST_ratio_HT']
        print(value)
        return value

    # Columns still excluded from the PCA; shrinks by one per iteration.
    excluded = full_data.columns.drop(['PRI_nleps','PRI_jets','Events_weight', 'Label', 'DER_ST_ratio_HT'])
    recorded = [_pca_value(excluded)]
    tick_labels = ['DER_ST_ratio_HT']
    while len(excluded) > 0:
        tick_labels.append(excluded[0])
        excluded = excluded.drop(excluded[0])
        recorded.append(_pca_value(excluded))

    axes = plt.gca()

    positions = np.arange(len(recorded))
    bar_width = 0.25
    first_component = [entry[0] for entry in recorded]
    axes.bar(positions - bar_width/2,first_component, bar_width, color = 'b',label='PCA1')
    second_component = [entry[1] for entry in recorded]
    axes.bar(positions + bar_width/2,second_component, bar_width, color = 'r',label='PCA2')

    axes.set_ylabel('Percentage of PCA score')
    axes.set_title('Percentage that each feature makes up of the PCA value')
    axes.set_xlabel('Feature added in iteration')
    axes.set_xticks(positions)
    axes.set_xticklabels(tick_labels, rotation = 'vertical')
    axes.legend()

コード例 #7

0

ファイルを表示

def Pipeline(DataSet, paramList=None, Plot_titles=None):
    """Run the PCA, train the tree model and return its feature-importance values.

    Parameters
    ----------
    DataSet : DataFrame
        Event data; ``DataCuts`` is applied to it first. An 'EventID' column
        is dropped if present.
    paramList : dict, optional
        Hyperparameters for ``TreeModel``. When None (the default) the model's
        ``HyperParameterTuning`` is run instead.
    Plot_titles : optional
        Forwarded to ``SHAPValuePlots`` and to ``FeaturePermutation`` (as
        ``Plot_Title``).

    Returns
    -------
    tuple
        ``(MeanSHAPValues, MeanPermValues)``: the values returned by
        ``TreeModel.SHAPValuePlots`` and ``TreeModel.FeaturePermutation``.
    """
    DataSet = DataCuts(DataSet)

    # Maps raw column names to the LaTeX labels used on the plots.
    Key = {
        'PRI_nleps': r'$N_{\ell}$',
        'PRI_jets': r'$N_{jets}$',
        'PRI_leading_jet_pt': r'$jet_{PT}^{(1)}$',
        'PRI_subleading_jet_pt': r'$jet_{PT}^{(2)}$',
        'PRI_leading_jet_eta': r'$jet_{\eta}^{(1)}$',
        'PRI_subleading_jet_eta': r'$jet_{\eta}^{(2)}$',
        'PRI_lep_leading_pt': r'$\ell_{PT}^{(1)}$',
        'PRI_lep_subleading_pt': r'$\ell_{PT}^{(2)}$',
        'PRI_lep_leading_eta': r'$\ell_{\eta}^{(1)}$',
        'PRI_lep_subleading_eta': r'$\ell_{\eta}^{(2)}$',
        'PRI_lep_leading_phi': r'$\ell_{\phi}^{(1)}$',
        'PRI_lep_subleading_phi': r'$\ell_{\phi}^{(2)}$',
        'DER_P_T_ratio_lep_pair': r'$\frac{\ell_{PT}^{(1)}}{\ell_{PT}^{(2)}}$',
        'DER_Diff_Eta_lep_pair':
        r'$abs(\ell_{\eta}^{(1)} - \ell_{\eta}^{(2)})$',
        'DER_Diff_Phi_lep_pair':
        r'$abs(\ell_{\phi}^{(1)} - \ell_{\phi}^{(2)})$',
        'DER_sum_P_T': r'$\sum(PT)$',
        'PRI_Missing_pt': r'MissingPT',
        'DER_PT_leading_lepton_ratio_PT_leading_jet':
        r'$\frac{\ell_{PT}^{(1)}}{jet_{PT}^{(1)}}$',
        'DER_PT_leading_lept_ratio_HT': r'$\frac{\ell_{PT}^{(1)}}{HT}$',
        'DER_ST_ratio_PT_Leading_jet': r'$\frac{ST}{jet_{PT}^{(1)}}$',
        'DER_ST_ratio_HT': r'$\frac{ST}{HT}$',
        'DER_PT_subleading_lepton_ratio_PT_leading_jet':
        r'$\frac{\ell_{PT}^{(2)}}{jet_{PT}^{(1)}}$',
        'DER_PT_subleading_lepton_ratio_HT': r'$\frac{\ell_{PT}^{(2)}}{HT}$'
    }

    # 'EventID' may already have been removed by the caller. Ignore only that
    # case instead of swallowing every exception with a bare except.
    DataSet.drop(['EventID'], axis=1, inplace=True, errors='ignore')

    PCAPlots = PCAPlotter(DataSet, 'Label', Key)
    PCAPlots.PCAAnalysis()

    # The PCA above ran on the raw names; the model sees the LaTeX names.
    DataSet.rename(columns=Key, inplace=True)

    if paramList is None:
        XGBModel = TreeModel(DataSet, ApplyDataCut=False)
        XGBModel.HyperParameterTuning()
    else:
        XGBModel = TreeModel(DataSet, ApplyDataCut=False, paramList=paramList)

    XGBModel.XGBoostTrain()
    MeanSHAPValues = XGBModel.SHAPValuePlots(Plot_titles)

    MeanPermValues = XGBModel.FeaturePermutation(usePredict_poba=False,
                                                 Plot_Title=Plot_titles)

    return MeanSHAPValues, MeanPermValues

コード例 #8

0

ファイルを表示

def TestColumns(Feature = 'All', ShowPCAPlots = True):
    """
    Plot how much the selected features contribute to the PCA values as the
    other columns are added back one at a time.

    For each tested feature a PCA is first run on that feature plus the
    always-kept columns ('PRI_nleps', 'PRI_jets', 'Events_weight', 'Label').
    The remaining columns are then re-added one by one, the PCA is repeated
    after each addition, and the results are saved as a bar plot.

    Parameters
    ----------
    Feature : String or list, optional
        The default is 'All' which will sequentially test all the features in
        the database. You can pass a list of features that you want to test or
        a single feature.
    ShowPCAPlots : bool, optional
        Forwarded to ``PCAPlotter.PCAAnalysis`` as ``ShowPlots``. The default
        is True.

    Raises
    ------
    TypeError
        If ``Feature`` is neither a string nor a list.

    Returns
    -------
    None.

    """
    # Fail before any I/O; the original only printed a message here and then
    # crashed with a NameError on the undefined feature list.
    if not isinstance(Feature, (str, list)):
        raise TypeError('Feature needs to be of type string or list.')

    TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
    TestDataSet1 = DataCuts(TestDataSet1)
    TestDataSet1.drop(['EventID'],axis=1,inplace=True)

    # Named differently from the function to avoid shadowing it locally.
    if isinstance(Feature, list):
        FeaturesToTest = Feature
    elif Feature == 'All':
        FeaturesToTest = TestDataSet1.columns
    else:
        FeaturesToTest = [Feature]

    for Column in FeaturesToTest:
        # Columns still excluded from the PCA; shrinks by one per iteration.
        Columns = TestDataSet1.columns
        Columns = Columns.drop(['PRI_nleps','PRI_jets','Events_weight', 'Label'] + [Column])
        TestDataSet = TestDataSet1.drop(Columns,axis=1)
        PCAPlots = PCAPlotter(TestDataSet,'Label')
        PCAPlots.PCAAnalysis( ShowPlots = ShowPCAPlots)
        # NOTE(review): this first entry reads FeaturePCAValues while every
        # later entry reads FeaturePCAPercentage - confirm which is intended.
        print(PCAPlots.FeaturePCAValues['Leptons 2 Jets 2'][Column])
        Column_Percentage = [PCAPlots.FeaturePCAValues['Leptons 2 Jets 2'][Column]]
        AddedColumns = [Column]
        while len(Columns) > 0:
            AddedColumns.append(Columns[0])
            Columns = Columns.drop(Columns[0])
            TestDataSet = TestDataSet1.drop(Columns,axis=1)
            PCAPlots = PCAPlotter(TestDataSet,'Label')
            PCAPlots.PCAAnalysis(ShowPlots = ShowPCAPlots)
            print(PCAPlots.FeaturePCAPercentage['Leptons 2 Jets 2'][Column])
            Column_Percentage.append(PCAPlots.FeaturePCAPercentage['Leptons 2 Jets 2'][Column])

        X = np.arange(len(Column_Percentage))
        Barplot = plt.figure()
        ax = Barplot.add_axes([0,0,1,1])
        width = 0.25
        # Entries 0 and 1 of each recorded value are plotted as PCA1 and PCA2.
        List = [Entry[0] for Entry in Column_Percentage]
        ax.bar(X - width/2,List, width, color = 'b',label='PCA1')
        List = [Entry[1] for Entry in Column_Percentage]
        ax.bar(X + width/2,List, width, color = 'r',label='PCA2')

        ax.set_ylabel('Percentage of PCA score')
        ax.set_title('Percentage that each feature makes up of the PCA value starting with the {} feature'.format(Column))
        ax.set_xlabel('Number of feature included in iteration')
        ax.set_xticks(X)
        ax.set_xticklabels(AddedColumns,  rotation = 'vertical')
        ax.legend()
        # NOTE(review): the file name is fixed, so when several features are
        # tested each plot overwrites the previous one.
        Barplot.savefig('Percentage Plot.png')

コード例 #9

0

ファイルを表示

        AddedColumns = [Column]
        while len(Columns) > 0:
            AddedColumns.append(Columns[0])
            Columns = Columns.drop(Columns[0])
            TestDataSet = TestDataSet1.drop(Columns,axis=1)
            XGBModel = TreeModel(TestDataSet,paramList,SubSampleDataSet=False,ApplyDataCut=False)
            XGBModel.XGBoostTrain()
            XGBModel.SHAPValuePlots()

# Script entry point. The original guard `if "__main__":` tested a non-empty
# string, which is always true, so this block also ran on import.
if __name__ == "__main__":
   #TestColumns(Feature = 'DER_PT_subleading_ratio_HT',ShowPCAPlots = True)
   #TestColumns(Feature = 'DER_ST_ratio_HT', ShowPCAPlots = False)
   #TestColumns(Feature = 'DER_sum_P_T',ShowPCAPlots = False)
   #TestColumns(Feature = 'PRI_Missing_pt',ShowPCAPlots = False)
   #TestColumns('All')
   #TestTreeModelWeights(Feature = 'DER_PT_subleading_ratio_HT')
   #TestTreeModelWeights(Feature = 'DER_ST_ratio_HT')
   #TestTreeModelWeights(Feature = 'DER_sum_P_T')
   #TestTreeModelWeights(Feature = 'PRI_Missing_pt')
   #SHAPValuesTest(Feature = 'DER_PT_subleading_ratio_HT')
   TestDataSet1 = pd.read_csv(r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\Smuon_400_Neutralino_96\EventData.csv')
   TestDataSet1 = DataCuts(TestDataSet1)
   TestDataSet1.drop(['EventID'],axis=1,inplace=True)
   PCAPlots = PCAPlotter(TestDataSet1,'Label')
   # Run the PCA for each combination of jet-count and lepton-count limits.
   PCAPlots.PCAAnalysis( MinNoofJets= 1, MaxNoofJets=1, MinNoofLeptons = 1, MaxNoofLeptons = 1)
   PCAPlots.PCAAnalysis( MinNoofJets= 1, MaxNoofJets=2, MinNoofLeptons = 1, MaxNoofLeptons = 1)
   PCAPlots.PCAAnalysis( MinNoofJets= 1, MaxNoofJets=1, MinNoofLeptons = 1, MaxNoofLeptons = 2)
   PCAPlots.PCAAnalysis( MinNoofJets= 1, MaxNoofJets=2, MinNoofLeptons = 1, MaxNoofLeptons = 2)

コード例 #10

0

ファイルを表示

def FeaturePlots():
    """Draw the feature plots for four signal samples.

    Each sample's 'EventData.csv' is read, passed through ``DataCuts`` and
    ``RemoveFeatures``, stripped of 'EventID' and 'Events_weight', and handed
    to ``Feature_Plots_PCA.FeaturePlots`` with 'Label' as the second argument.

    Returns
    -------
    None.
    """
    csv_template = r'I:\Results For Particle Physics\00Gerhard-2020-10-14\DockerOutput_Gerhard\Changing signals\{}\EventData.csv'
    sample_folders = (
        'Smuon_200_Neutralino_96',
        'Smuon_200_Neutralino_195',
        'Smuon_400_Neutralino_96',
        'Smuon_400_Neutralino_195',
    )

    # Samples are processed in the order listed above.
    for folder in sample_folders:
        events = pd.read_csv(csv_template.format(folder))
        events = RemoveFeatures(DataCuts(events))
        events.drop(['EventID', 'Events_weight'], axis=1, inplace=True)
        Feature_Plots_PCA.FeaturePlots(events, 'Label')

コード例 #11

0

ファイルを表示

ファイル: PCAFeaturePlotter.py プロジェクト: GerhardHarmsen/Physics-Machine-Learning-project

        return ax


# Script section: load one signal and one background sample, label them, and
# produce the feature, pair, density and PCA plots.
Signal = pd.read_csv(
    r'I:\Results For Particle Physics\PCA TestsFolder\Signal\Events_PPtoSmuonSmuon_Smuon_Mass_400_Neatralino_96\EventData.csv'
)
BackGround = pd.read_csv(
    r'I:\Results For Particle Physics\PCA TestsFolder\Background\Events_PPtoTopTopBar\EventData.csv'
)

# Item assignment always writes the 'Label' column. Attribute assignment
# (`frame.Label = ...`) would only set a plain Python attribute if the column
# did not already exist.
BackGround['Label'] = 'TTBar'
Signal['Label'] = 'Signal'

DataSet = pd.concat([BackGround, Signal])

DataSet = DataCuts(DataSet)

AllFeature = RemoveFeaturesNotinPaper(DataSet)

# Optional subsample for faster plotting:
#AllFeature = AllFeature.sample(n = 10000)

FeaturePlots(AllFeature, 'Label')

PairPlots = Displotter(AllFeature, 'Label')
PairPlots.PairPlotAnalysis()

sns.displot(AllFeature, x='HT', hue='Label', kind='kde')

PCAPlots = PCAPlotter(AllFeature, 'Label')
PCAPlots.PCAAnalysis()