Example No. 1
                                    bins=40,
                                    labels=leg_labels,
                                    log=True,
                                    density=True,
                                    figsize=(12, 12),
                                    alpha=0.5,
                                    grid=False)
 plt.subplots_adjust(left=0.06,
                     bottom=0.06,
                     right=0.99,
                     top=0.96,
                     hspace=0.50,
                     wspace=0.50)
 plt.tight_layout()
 plt.savefig(f'{PLOT_DIR}/features/FeaturePlots.pdf')
 bkg_corr = plot_utils.plot_corr([background_tree_handler],
                                 TRAINING_COLUMNS_LIST, ['Background'])
 bkg_corr.set_size_inches(6, 6)
 plt.subplots_adjust(left=0.1,
                     bottom=0.06,
                     right=0.99,
                     top=0.96,
                     hspace=0.55,
                     wspace=0.55)
 plt.tight_layout()
 plt.savefig(f'{PLOT_DIR}/features/BackgroundCorrelationMatrix.pdf')
 np_corr = plot_utils.plot_corr([non_prompt_tree_handler],
                                TRAINING_COLUMNS_LIST, ['Non-prompt'])
 np_corr.set_size_inches(6, 6)
 plt.tight_layout()
 plt.savefig(f'{PLOT_DIR}/features/NonPromptCorrelationMatrix.pdf')
 p_corr = plot_utils.plot_corr([prompt_tree_handler],
                               TRAINING_COLUMNS_LIST, ['Prompt'])  # arguments assumed, following the bkg/non-prompt calls above
Example No. 2
def data_prep(inputCfg, iBin, PtBin, OutPutDirPt, PromptDf, FDDf, BkgDf):  #pylint: disable=too-many-statements, too-many-branches
    '''
    function for data preparation
    '''
    nPrompt = len(PromptDf)
    nFD = len(FDDf)
    nBkg = len(BkgDf)
    if FDDf.empty:
        out = f'\n     Signal: {nPrompt}\n     Bkg: {nBkg}'
    else:
        out = f'\n     Prompt: {nPrompt}\n     FD: {nFD}\n     Bkg: {nBkg}'
    print(
        f'Number of available candidates in {PtBin[0]} < pT < {PtBin[1]} GeV/c:{out}'
    )

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':
        if FDDf.empty:
            nCandToKeep = min([nPrompt, nBkg])
            out = 'signal'
            out2 = 'signal'
        else:
            nCandToKeep = min([nPrompt, nFD, nBkg])
            out = 'prompt, FD'
            out2 = 'prompt'
        print((
            f'Keep same number of {out} and background (minimum) for training and '
            f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(
            f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}'
        )

        if nPrompt > nCandToKeep:
            print((f'Remaining {out2} candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((
                f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                'efficiency together with test set'))

        TotDf = pd.concat([
            BkgDf.iloc[:nCandToKeep], PromptDf.iloc[:nCandToKeep],
            FDDf.iloc[:nCandToKeep]
        ],
                          sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep)
        else:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep +
                                   [2] * nCandToKeep)
        if test_f < 1:
            TrainSet, TestSet, yTrain, yTest = train_test_split(
                TotDf, LabelsArray, test_size=test_f, random_state=seed_split)
        else:
            TrainSet = pd.DataFrame()
            TestSet = TotDf.copy()
            yTrain = pd.Series()
            yTest = LabelsArray.copy()

        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        PromptDfSelForEff = pd.concat([
            PromptDf.iloc[nCandToKeep:], TestSet[pd.Series(yTest).array == 1]
        ],
                                      sort=False)
        if FDDf.empty:
            FDDfSelForEff = pd.DataFrame()
        else:
            FDDfSelForEff = pd.concat([
                FDDf.iloc[nCandToKeep:], TestSet[pd.Series(yTest).array == 2]
            ],
                                      sort=False)
        del TotDf

    elif dataset_opt == 'max_signal':
        nCandBkg = round(inputCfg['data_prep']['bkg_mult'][iBin] *
                         (nPrompt + nFD))
        out = 'signal' if FDDf.empty else 'prompt and FD'
        print((
            f'Keep all {out} and use {nCandBkg} bkg candidates for training and '
            f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('\033[93mWARNING: using all bkg available, not good!\033[0m')
        print(
            f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}'
        )

        TotDf = pd.concat([BkgDf.iloc[:nCandBkg], PromptDf, FDDf], sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt)
        else:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt + [2] * nFD)
        if test_f < 1:
            TrainSet, TestSet, yTrain, yTest = train_test_split(
                TotDf, LabelsArray, test_size=test_f, random_state=seed_split)
        else:
            TrainSet = pd.DataFrame()
            TestSet = TotDf.copy()
            yTrain = pd.Series()
            yTest = LabelsArray.copy()

        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        PromptDfSelForEff = TestSet[pd.Series(yTest).array == 1]
        FDDfSelForEff = pd.DataFrame() if FDDf.empty else TestSet[pd.Series(
            yTest).array == 2]
        del TotDf

    else:
        print(f'\033[91mERROR: {dataset_opt} is not a valid option!\033[0m')
        sys.exit()

    # plots
    VarsToDraw = inputCfg['plots']['plotting_columns']
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ListDf = [BkgDf, PromptDf] if FDDf.empty else [BkgDf, PromptDf, FDDf]
    #_____________________________________________
    plot_utils.plot_distr(ListDf,
                          VarsToDraw,
                          100,
                          LegLabels,
                          figsize=(12, 7),
                          alpha=0.3,
                          log=True,
                          grid=False,
                          density=True)
    plt.subplots_adjust(left=0.06,
                        bottom=0.06,
                        right=0.99,
                        top=0.96,
                        hspace=0.55,
                        wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(ListDf, VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(
            f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf')

    return TrainTestData, PromptDfSelForEff, FDDfSelForEff
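For reference, here is a minimal sketch of the configuration dictionary this data_prep variant expects (in the analysis repositories it is typically loaded from a YAML file). Only the keys the function actually reads are listed, and every value is illustrative; the feature names in plotting_columns are hypothetical.

# Minimal, illustrative inputCfg for the data_prep function above.
# Only keys actually read by the function are listed; all values are placeholders.
input_cfg_example = {
    'data_prep': {
        'dataset_opt': 'equal',      # or 'max_signal'
        'seed_split': 42,            # random_state passed to train_test_split
        'test_fraction': 0.2,        # fraction of candidates kept for the test set
        'bkg_mult': [1.0, 1.0],      # per-pT-bin bkg multiplier, used only with 'max_signal'
    },
    'plots': {
        'plotting_columns': ['pt_cand', 'd_len', 'cos_p'],   # hypothetical feature names
    },
    'output': {
        'leg_labels': {'Bkg': 'Background', 'Prompt': 'Prompt', 'FD': 'Feed-down'},  # 'FD' may be None
        'out_labels': {'Bkg': 'Bkg', 'Prompt': 'Prompt', 'FD': 'FD'},
    },
}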
Example No. 3
def data_prep(inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf,
              FDDf):  #pylint: disable=too-many-statements
    '''
    function for data preparation
    '''
    DataDfPtSel = DataDf.query(f'{PtMin} < pt_cand < {PtMax}')
    BkgDfPtSel = DataDfPtSel.query(inputCfg['data_prep']['filt_bkg_mass'])
    PromptDfPtSel = PromptDf.query(f'{PtMin} < pt_cand < {PtMax}')
    FDDfPtSel = FDDf.query(f'{PtMin} < pt_cand < {PtMax}')

    nPrompt = len(PromptDfPtSel)
    nFD = len(FDDfPtSel)
    nBkg = len(BkgDfPtSel)
    print((
        f'Number of available candidates in {PtMin} < pT < {PtMax} GeV/c:\n     Prompt: {nPrompt}'
        f'\n     FD: {nFD}\n     Bkg: {nBkg}'))

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':

        nCandToKeep = min([nPrompt, nFD, nBkg])
        print((
            'Keep same number of prompt, FD, and background (minimum) for training and '
            f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(
            f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}'
        )

        if nPrompt > nCandToKeep:
            print((f'Remaining prompt candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((
                f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                'efficiency together with test set'))

        TotDfPtSel = pd.concat([
            BkgDfPtSel.iloc[:nCandToKeep], PromptDfPtSel.iloc[:nCandToKeep],
            FDDfPtSel.iloc[:nCandToKeep]
        ],
                               sort=True)
        LabelsArray = [0] * nCandToKeep + [1] * nCandToKeep + [2] * nCandToKeep
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        CandTypeFlags = pd.Series(yTest)
        PromptDfPtSelForEff = pd.concat([
            PromptDfPtSel.iloc[nCandToKeep:],
            TestSet[CandTypeFlags.values == 1]
        ],
                                        sort=False)
        FDDfPtSelForEff = pd.concat(
            [FDDfPtSel.iloc[nCandToKeep:], TestSet[CandTypeFlags.values == 2]],
            sort=False)
        del TotDfPtSel

    elif dataset_opt == 'max_signal':

        nCandBkg = round(inputCfg['ml']['bkg_mult'][iBin] * (nPrompt + nFD))
        print((
            f'Keep all prompt and FD and use {nCandBkg} bkg candidates for training and '
            f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('WARNING: using all bkg available, not good!')
        print(
            f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}'
        )

        TotDfPtSel = pd.concat(
            [BkgDfPtSel.iloc[:nCandBkg], PromptDfPtSel, FDDfPtSel], sort=True)
        LabelsArray = [0] * nCandBkg + [1] * nPrompt + [2] * nFD
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        CandTypeFlags = pd.Series(yTest)
        PromptDfPtSelForEff = TestSet[CandTypeFlags.values == 1]
        FDDfPtSelForEff = TestSet[CandTypeFlags.values == 2]
        del TotDfPtSel

    else:
        print(f'ERROR: {dataset_opt} is not a valid option!')
        sys.exit()

    # plots
    VarsToDraw = inputCfg['ml']['plotting_columns']
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    #_____________________________________________
    plot_utils.plot_distr([BkgDfPtSel, PromptDfPtSel, FDDfPtSel], VarsToDraw,
                          (12, 7), 100, True, LegLabels, 0.3)
    plt.subplots_adjust(left=0.06,
                        bottom=0.06,
                        right=0.99,
                        top=0.96,
                        hspace=0.55,
                        wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtMin}_{PtMax}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(
        [BkgDfPtSel, PromptDfPtSel, FDDfPtSel], VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtMin}_{PtMax}.pdf')

    del BkgDfPtSel, PromptDfPtSel, FDDfPtSel
    return TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
Example No. 4
def test_plot_corr():
    """
    Test the correlation matrix plot
    """
    assert isinstance(plot_utils.plot_corr(
        [SIG_DF, BKG_DF], SIG_DF.columns), list)
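The assertion above relies on plot_corr returning a list with one matplotlib figure per input sample. The following self-contained sketch, with toy dataframes and invented file names, iterates over that list the same way the examples above do.

# Sketch: plot_corr returns a list of figures, one per input dataframe.
# The dataframes, column names, and output file names below are invented for illustration.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hipe4ml import plot_utils

rng = np.random.default_rng(42)
sig_df = pd.DataFrame(rng.normal(size=(500, 3)), columns=['f0', 'f1', 'f2'])
bkg_df = pd.DataFrame(rng.normal(size=(500, 3)), columns=['f0', 'f1', 'f2'])

corr_figs = plot_utils.plot_corr([sig_df, bkg_df], ['f0', 'f1', 'f2'],
                                 ['Signal', 'Background'])
for fig, label in zip(corr_figs, ['Signal', 'Background']):
    fig.savefig(f'CorrMatrix{label}.pdf')   # one file per class, as in the examples above
plt.close('all')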
Example No. 5
HYP_RANGES = {
    # # defines the maximum depth of a single tree (regularization)
    'max_depth': (5, 15),
    # 'learning_rate': (0.01, 0.3),  # learning rate
    'n_estimators': (5, 10),  # number of boosting trees
}
MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc')

# train and test the model with the updated hyperparameters
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])

# Calculate the BDT efficiency as a function of the BDT score
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(
    DATA[3], Y_PRED, n_points=10)
# --------------------------------------------


# PLOTTING
# --------------------------------------------
FEATURES_DISTRIBUTIONS_PLOT = plot_utils.plot_distr(
    [SIG_DF, BKG_DF], SIG_DF.columns)
CORRELATION_MATRIX_PLOT = plot_utils.plot_corr([SIG_DF, BKG_DF], SIG_DF.columns)
BDT_OUTPUT_PLOT = plot_utils.plot_output_train_test(MODEL, DATA)
ROC_CURVE_PLOT = plot_utils.plot_roc(DATA[3], Y_PRED)
PRECISION_RECALL_PLOT = plot_utils.plot_precision_recall(DATA[3], Y_PRED)
BDT_EFFICIENCY_PLOT = plot_utils.plot_bdt_eff(THRESHOLD, EFFICIENCY)
FEATURES_IMPORTANCE = plot_utils.plot_feature_imp(TEST_SET, Y_TEST, MODEL)
plt.show()
# ---------------------------------------------
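bdt_efficiency_array gives matched arrays of BDT efficiency versus score threshold. The sketch below, with dummy arrays standing in for EFFICIENCY and THRESHOLD, shows one way to pick the score cut closest to a target efficiency; the numbers are purely illustrative.

# Sketch: choose the score cut whose efficiency is closest to a target value.
# efficiency/threshold are dummy arrays standing in for EFFICIENCY and THRESHOLD above.
import numpy as np

efficiency = np.array([0.95, 0.90, 0.80, 0.70, 0.60])
threshold = np.array([0.10, 0.30, 0.50, 0.70, 0.90])

target_eff = 0.80
i_best = int(np.argmin(np.abs(efficiency - target_eff)))
score_cut = threshold[i_best]
print(f'BDT score cut {score_cut:.2f} gives efficiency {efficiency[i_best]:.2f}')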
Example No. 6
        # define tree handlers
        signal_tree_handler = TreeHandler()
        background_tree_handler = TreeHandler()
        signal_tree_handler.set_data_frame(df_signal_ct)
        background_tree_handler.set_data_frame(df_background_ct)
        del df_signal_ct, df_background_ct

        if not os.path.isdir(f'{PLOT_DIR}/features'):
            os.mkdir(f'{PLOT_DIR}/features')

        leg_labels = ['background', 'signal']
        plot_distr = plot_utils.plot_distr(
            [background_tree_handler, signal_tree_handler],
            TRAINING_COLUMNS_LIST, bins=40, labels=leg_labels, log=True, density=True, figsize=(10, 12),
            alpha=0.5, grid=False)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.50, wspace=0.50)
        plt.tight_layout()
        plt.savefig(f'{PLOT_DIR}/features/FeaturePlots.pdf')
        bkg_corr = plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['Background'])
        bkg_corr.set_size_inches(6,6)
        plt.subplots_adjust(left=0.1, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
        plt.tight_layout()
        plt.savefig(f'{PLOT_DIR}/features/BackgroundCorrelationMatrix.pdf')
        sig_corr = plot_utils.plot_corr([signal_tree_handler], TRAINING_COLUMNS_LIST, ['Signal'])
        sig_corr.set_size_inches(6,6)
        plt.tight_layout()
        plt.savefig(f'{PLOT_DIR}/features/SignalCorrelationMatrix.pdf')
        plt.close('all')

        ###########################################################
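The handlers above wrap pre-built pandas DataFrames via set_data_frame; a TreeHandler can also be filled directly from a file. A minimal sketch follows, where the file name, tree name, and selection string are hypothetical.

# Sketch, assuming a ROOT file 'AnalysisResults.root' with a TTree named 'SignalTree'
# (both names hypothetical); TreeHandler loads it into a pandas DataFrame internally.
from hipe4ml.tree_handler import TreeHandler

signal_tree_handler = TreeHandler('AnalysisResults.root', 'SignalTree')
signal_tree_handler.apply_preselections('pt > 2')    # optional pandas-style query
df_signal = signal_tree_handler.get_data_frame()     # access the underlying DataFrame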
Example No. 7
                        TRAINING_COLUMNS_LIST,
                        bins=50,
                        labels=leg_labels,
                        log=True,
                        density=True,
                        figsize=(12, 7),
                        alpha=0.3,
                        grid=False)
                    plt.subplots_adjust(left=0.06,
                                        bottom=0.06,
                                        right=0.99,
                                        top=0.96,
                                        hspace=0.55,
                                        wspace=0.55)
                    plt.savefig(f'{PLOT_DIR}/features/FeaturePlots_{bin}')
                    plot_utils.plot_corr([background_tree_handler],
                                         TRAINING_COLUMNS_LIST, ['background'])
                    plt.savefig(
                        f'{PLOT_DIR}/features/BackgroundCorrelationMatrix_{bin}'
                    )
                    plot_utils.plot_corr([signal_tree_handler],
                                         TRAINING_COLUMNS_LIST, ['signal'])
                    plt.savefig(
                        f'{PLOT_DIR}/features/SignalCorrelationMatrix_{bin}')
                    plt.close('all')

                # split data into training and test set
                train_test_data = train_test_generator(
                    [signal_tree_handler, background_tree_handler], [1, 0],
                    test_size=0.5,
                    random_state=RANDOM_STATE)
Example No. 8
    def do_hipe4mlplot(self):
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06,
                            bottom=0.06,
                            right=0.99,
                            top=0.96,
                            hspace=0.55,
                            wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model,
            self.traintestdata,
            80,
            self.raw_output_hipe4ml,
            leglabels,
            self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i in range(0, len(featuresimportancefig)):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            featuresimportancefig[i].savefig(figname)
Example No. 9
                          labels=['Signal', "Background"],
                          colors=["blue", "red"],
                          log=True,
                          density=True,
                          figsize=(18, 13),
                          alpha=0.3,
                          grid=False)
    plt.subplots_adjust(left=0.06,
                        bottom=0.06,
                        right=0.99,
                        top=0.96,
                        hspace=0.55,
                        wspace=0.55)
    plt.savefig(results_ml_path + "/features_distributions.png",
                bbox_inches='tight')
    corr = pu.plot_corr([signalH, bkgH], training_columns + ["m"],
                        ['Signal', "Background"])
    corr[0].savefig(results_ml_path + "/correlations.png", bbox_inches='tight')

    print("---------------------------------------------")
    print("Data loaded. Training and testing ....")

    params_range = {
        "max_depth": (8, 18),
        "learning_rate": (0.07, 0.15),
        "n_estimators": (150, 250),
        "gamma": (0.3, 0.5),
        "min_child_weight": (3, 8),
        "subsample": (0.5, 1),
        "colsample_bytree": (0.3, 1),
    }
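A ranges dictionary like params_range is what gets handed to the Bayesian optimisation shown in the earlier example (MODEL.optimize_params_bayes). Below is a self-contained sketch with a toy dataset, an assumed XGBoost classifier, and a reduced ranges dict; feature names and values are invented, and in the snippet above one would pass params_range itself.

# Self-contained sketch of feeding a hyper-parameter ranges dict to ModelHandler,
# mirroring the optimize_params_bayes / train_test_model calls in the earlier example.
# The toy dataset, feature names, and reduced ranges are illustrative only.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from hipe4ml.model_handler import ModelHandler

rng = np.random.default_rng(0)
features = ['f0', 'f1', 'f2']
X = pd.DataFrame(rng.normal(size=(1000, 3)), columns=features)
y = (X['f0'] + 0.5 * rng.normal(size=1000) > 0).astype(int)   # toy binary labels
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
data = [x_train, y_train, x_test, y_test]   # hipe4ml's [train, y_train, test, y_test] layout

small_ranges = {'max_depth': (2, 6), 'n_estimators': (50, 150)}   # reduced stand-in for params_range
model_hdl = ModelHandler(xgb.XGBClassifier(), features)
model_hdl.optimize_params_bayes(data, small_ranges, 'roc_auc')
model_hdl.train_test_model(data)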