def preparesample(self):
    self.logger.info("Prepare Sample for hipe4ml")

    self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
    nsigcand = self.signalhandler.get_n_cand()
    self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
    self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                  size=nsigcand * self.v_bkgoversigfrac)
    self.traintestdata = train_test_generator(
        [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
        test_size=self.test_frac,
        random_state=self.rnd_splt)
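
For reference, train_test_generator (from hipe4ml.analysis_utils) returns a four-element list [train_set, y_train, test_set, y_test]; the index-based access in the examples below relies on this layout. A minimal sketch, with hypothetical file names:

# Sketch of the train/test layout assumed throughout these examples:
# train_test_generator returns [train_df, y_train, test_df, y_test]
from hipe4ml.tree_handler import TreeHandler
from hipe4ml.analysis_utils import train_test_generator

sig_hdlr = TreeHandler('signal.root', 'treeMLDplus')      # hypothetical input files
bkg_hdlr = TreeHandler('background.root', 'treeMLDplus')
train_df, y_train, test_df, y_test = train_test_generator(
    [sig_hdlr, bkg_hdlr], [1, 0], test_size=0.2, random_state=42)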
Example #2
        train_test_data = [
            pd.DataFrame(),
            pd.DataFrame(),
            pd.DataFrame(),
            pd.DataFrame()
        ]
        if CREATE_TRAIN_TEST and (COMPUTE_SCORES_FROM_EFF or TRAIN):
            df_signal_ct = df_signal.query(
                f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3 and isReconstructed and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3'
            )
            df_background_ct = df_background.query(
                f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3 and ( mass < 1.1 or mass > 1.13 ) and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3'
            )

            # define tree handlers
            signal_tree_handler = TreeHandler()
            background_tree_handler = TreeHandler()
            signal_tree_handler.set_data_frame(df_signal_ct)
            background_tree_handler.set_data_frame(df_background_ct)
            del df_signal_ct, df_background_ct

            # split data into training and test set
            train_test_data = train_test_generator(
                [signal_tree_handler, background_tree_handler], [1, 0],
                test_size=0.5,
                random_state=RANDOM_STATE)
            train_test_data[0]['y_true'] = train_test_data[1]
            train_test_data[2]['y_true'] = train_test_data[3]
            train_test_data[0].to_parquet(
                f'df/train_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip',
                compression='gzip')
Example #3
def main():  #pylint: disable=too-many-statements, too-many-branches
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    ModelList = inputCfg['ml']['saved_models']
    ModelHandls = []
    for iBin in range(len(PtBins)):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit()
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data file {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((
                    f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                    ' its content may be overwritten!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)
            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel,
                                              inputCfg['ml']['raw_output'])
            if not isinstance(ColumnsToSave, list):
                print(
                    '\033[91mERROR: column_to_save_list must be defined!\033[0m'
                )
                sys.exit()
            # copy so that removing columns does not mutate the shared config list
            ColumnsToSaveFinal = ColumnsToSave.copy()
            if 'inv_mass' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_cand' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                ColumnsToSaveFinal.remove('pt_B')  # only in MC
            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                DataDfPtSel['ML_output'] = yPred
            else:
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip'
            )
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
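
The application step in main() above boils down to a few calls. A minimal sketch, with hypothetical paths and a two-class model:

# Sketch of the standalone application pattern used above
from hipe4ml.tree_handler import TreeHandler
from hipe4ml.model_handler import ModelHandler

model_hdl = ModelHandler()
model_hdl.load_model_handler('ModelHandler_pT_2_4.pickle')  # hypothetical path
df = TreeHandler('AnalysisResults.root', 'treeMLDplus').get_data_frame()
# raw_output=False yields probabilities; with three or more classes the
# result has one column per class, as the branching above handles
scores = model_hdl.predict(df, False)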
Example #4
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent

    # initialize TreeHandler test
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open reference objects
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)

    terminate_tree_handler_test_workspace(test_dir)

    # test that data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data Dataframe from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt Dataframe from parquet file differs from the root file one!'

    # test loading from multiple files
    merged_df = pd.concat(
        [data_hdlr.get_data_frame(),
         prompt_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(
        merged_df), 'loading of multiple root files not working!'
    merged_pq_df = pd.concat(
        [data_pq_hdlr.get_data_frame(),
         prompt_pq_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(
        merged_pq_df), 'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()

    # get the original variable list
    info_dict['data_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle dataframes
    new_hndl = data_hdlr.shuffle_data_frame(size=10,
                                            random_state=5,
                                            inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place dataframe differs from the non-in-place one after shuffling'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place dataframe differs from the non-in-place one after the preselections'

    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()

    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply dummy eval() on the underlying data frame
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place dataframe differs from the non-in-place one after eval'

    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    info_dict['data_new_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both data and prompt data frames with respect to pT
    bins = [[0, 2], [2, 10], [10, 25]]

    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)

    # store projection variable and binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()

    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get info from a single data slice
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)

    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test info_dict reproduction

    assert info_dict == reference_dict, 'dictionary with the data info differs from the reference!'

    # test sliced data frames reproduction
    assert data_slice_df.equals(
        reference_data_slice_df
    ), 'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(
        reference_prompt_slice_df
    ), 'prompt sliced DataFrame differs from the reference!'
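
As the asserts above verify, the TreeHandler filtering methods come in two flavours: with inplace=False they return a new handler and leave the original untouched, while the default modifies the handler itself. A minimal sketch:

# Sketch of the inplace/copy semantics exercised by the test above
filtered = data_hdlr.apply_preselections('pt_cand > 2', inplace=False)  # new handler
data_hdlr.apply_preselections('pt_cand > 2')  # modifies data_hdlr in place
assert data_hdlr.get_data_frame().equals(filtered.get_data_frame())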
Example #5
def main():  #pylint: disable=too-many-statements
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train",
                        help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply",
                        help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'],
                                inputCfg['input']['treename'])
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'],
                              inputCfg['input']['treename'])

    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(
            inputCfg['data_prep']['filt_bkg_mass'],
            frac=1.,
            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(
            f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(
            os.path.expanduser(inputCfg['output']['dir']),
            f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((
                f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                ' its content may be overwritten!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(
            iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt, PromptHandler.get_slice(iBin),
            FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print(
                '\033[93mWARNING: Using only a fraction of the MC for the application! Are you sure?\033[0m'
            )

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt,
                                    TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print(
                    '\033[91mERROR: path to model not correctly defined!\033[0m'
                )
                sys.exit()
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl,
                 DataHandler.get_slice(iBin), PromptDfSelForEff, FDDfSelForEff)

        # delete dataframes to release memory (del on the loop variable alone
        # would not drop the list's own references)
        del TrainTestData, PromptDfSelForEff, FDDfSelForEff
Example #6
        # PLOT FEATURES DISTRIBUTIONS AND CORRELATIONS
        ######################################################

        df_background = uproot.open(
            os.path.expandvars(BKG_PATH))['LambdaTree'].arrays(library="pd")
        df_prompt_ct = df_signal.query(
            f'pt > 0 and pt < 3.5 and flag==1')  # pt cut?
        df_non_prompt_ct = df_signal.query(
            f'pt > 0 and pt < 3.5 and flag==2')  # pt cut?
        df_background_ct = df_background.query(
            f'pt > 0 and pt < 3.5')  # pt cut?
        #print(df_prompt_ct.keys())
        #print(df_background_ct.keys())

        # define tree handlers
        prompt_tree_handler = TreeHandler()
        non_prompt_tree_handler = TreeHandler()
        background_tree_handler = TreeHandler()
        prompt_tree_handler.set_data_frame(df_prompt_ct)
        non_prompt_tree_handler.set_data_frame(df_non_prompt_ct)
        background_tree_handler.set_data_frame(df_background_ct)
        del df_prompt_ct, df_non_prompt_ct, df_background_ct

        if not os.path.isdir(f'{PLOT_DIR}/features'):
            os.mkdir(f'{PLOT_DIR}/features')

        leg_labels = ['background', 'non-prompt', 'prompt']
        plot_distr = plot_utils.plot_distr([
            background_tree_handler, non_prompt_tree_handler,
            prompt_tree_handler
        ],
Example #7
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):

    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1

    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []

    for i in range(N_run):
        start = time.time()

        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1)
        model_hdl.train_test_model(train_test_data)

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc.append(roc_auc_score(train_test_data[3], y_pred_test))

        times.append(time.time() - start)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    roc = []  # reset so the Optuna scores are not averaged with the Bayes ones
    for i in range(N_run):
        model_hdl.optimize_params_optuna(train_test_data,
                                         params_range,
                                         'roc_auc',
                                         timeout=np.mean(times),
                                         njobs=-1)
        model_hdl.train_test_model(train_test_data)

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')
Example #8
    # make dataframe directory
    if not os.path.isdir('df'):
        os.mkdir('df')

    score_eff_arrays_dict = dict()

    for ct_bins in CT_BINS:

        train_test_data = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
        if CREATE_TRAIN_TEST and (COMPUTE_SCORES_FROM_EFF or TRAIN):
            df_prompt_ct = df_signal.query(f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 and isReconstructed and (flag==1) and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 and eta < 0.8 and eta > -0.8')
            df_nonprompt_ct = df_signal.query(f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 and isReconstructed and (flag==2 or flag==4) and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 and eta < 0.8 and eta > -0.8')
            df_background_ct = df_background.query(f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 and eta < 0.8 and eta > -0.8')

            # define tree handlers
            prompt_tree_handler = TreeHandler()
            nonprompt_tree_handler = TreeHandler()
            background_tree_handler = TreeHandler()
            prompt_tree_handler.set_data_frame(df_prompt_ct)
            nonprompt_tree_handler.set_data_frame(df_nonprompt_ct)
            background_tree_handler.set_data_frame(df_background_ct)
            del df_prompt_ct, df_nonprompt_ct, df_background_ct

            # split data into training and test set
            train_test_data = train_test_generator(
                [background_tree_handler, nonprompt_tree_handler, prompt_tree_handler],
                [0, 1, 2], test_size=0.2, random_state=RANDOM_STATE)
            train_test_data[0]['y_true'] = train_test_data[1]
            train_test_data[2]['y_true'] = train_test_data[3]
            train_test_data[0].to_parquet(f'df/train_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip', compression='gzip')
            train_test_data[2].to_parquet(f'df/test_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip', compression='gzip')
            # continue
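
Because the labels returned by train_test_generator are attached to the saved dataframes as a 'y_true' column, reading a set back for a later session is straightforward. A sketch, with a hypothetical ct bin:

# Sketch: reading back the labelled training set written above
import pandas as pd

df_train = pd.read_parquet('df/train_data_0_2.parquet.gzip')  # hypothetical ct bin 0-2
y_train = df_train.pop('y_true')  # labels were stored in the 'y_true' column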
Example #9
#%%
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import uproot
import os
from hipe4ml.tree_handler import TreeHandler

#%%
data = TreeHandler(
    os.path.abspath(os.getcwd()) + '/data/SignalTable_pp13TeV_mtexp.root',
    "SignalTable").get_data_frame()

#%%
training_variables = [
    "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi", "tpc_nsig_de",
    "tpc_nsig_pr", "tpc_nsig_pi", "dca_de_pr", "dca_de_pi", "dca_pr_pi",
    "dca_de_sv", "dca_pr_sv", "dca_pi_sv", "chi2"
]
print(list(data.columns))
data.head(10)
# %%
#sns.pairplot(data.query('gReconstructed > 0').sample(1000), hue = 'gReconstructed', plot_kws={'alpha': 0.1}, corner = True)
#data.query('gReconstructed > 0').hist(bins = 50, figsize = (20,20));

# %%
rec = data.query('gReconstructed > 0').copy()
rec_rej_acc = rec.query('rej_accept > 0').copy()

print(len(rec) / len(data))
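
A variable list such as training_variables above is typically handed to a ModelHandler together with an XGBoost classifier, as the other examples do. A minimal sketch under that assumption:

# Sketch: wiring the training_variables list into a ModelHandler
import xgboost as xgb
from hipe4ml.model_handler import ModelHandler

model_hdl = ModelHandler(xgb.XGBClassifier(), training_variables)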
Example #10
                del df_generated_cent

        ######################################################################

    # second condition needed because of issue with Qt libraries
    if MAKE_FEATURES_PLOTS and not MAKE_PRESELECTION_EFFICIENCY and not TRAIN:
        ######################################################
        # PLOT FEATURES DISTRIBUTIONS AND CORRELATIONS
        ######################################################

        df_background = uproot.open(os.path.expandvars(BKG_PATH))['LambdaTree'].arrays(library="pd")
        df_signal_ct = df_signal.query(f'pt > 0.5 and pt < 3 and ((flag & 1) == 1) and isReconstructed') # pt cut?
        df_background_ct = df_background.query(f'pt > 0.5 and pt < 3 and ( mass < 1.105 or mass > 1.13 )') # pt cut?

        # define tree handlers
        signal_tree_handler = TreeHandler()
        background_tree_handler = TreeHandler()
        signal_tree_handler.set_data_frame(df_signal_ct)
        background_tree_handler.set_data_frame(df_background_ct)
        del df_signal_ct, df_background_ct

        if not os.path.isdir(f'{PLOT_DIR}/features'):
            os.mkdir(f'{PLOT_DIR}/features')

        leg_labels = ['background', 'signal']
        plot_distr = plot_utils.plot_distr(
            [background_tree_handler, signal_tree_handler],
            TRAINING_COLUMNS_LIST, bins=40, labels=leg_labels, log=True, density=True, figsize=(10, 12),
            alpha=0.5, grid=False)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.50, wspace=0.50)
        plt.tight_layout()
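
The snippet stops after the layout calls; to persist the canvas one would typically add something like the following (the file name is a hypothetical choice):

# Sketch: saving the feature-distribution plots drawn above
plt.savefig(f'{PLOT_DIR}/features/distributions.pdf')
plt.close('all')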
Example #11
                bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}'

                ##############################################################
                # TRAINING AND TEST SET PREPARATION
                ##############################################################
                df_signal_cent_ct = df_signal.query(
                    f'ArmenterosAlpha {split_ineq_sign} 0 and centrality > {cent_bins[0]} and centrality < {cent_bins[1]} and ct > {ct_bins[0]} and ct < {ct_bins[1]}'
                )
                df_background_cent_ct = df_background.query(
                    f'ArmenterosAlpha {split_ineq_sign} 0 and centrality > {cent_bins[0]} and centrality < {cent_bins[1]} and ct > {ct_bins[0]} and ct < {ct_bins[1]}'
                )
                #df_signal_cent_ct = df_signal_cent_ct[TRAINING_COLUMNS_LIST]
                #df_background_cent_ct = df_background_cent_ct[TRAINING_COLUMNS_LIST]

                # define tree handlers
                signal_tree_handler = TreeHandler()
                background_tree_handler_full = TreeHandler()
                signal_tree_handler.set_data_frame(df_signal_cent_ct)
                background_tree_handler_full.set_data_frame(
                    df_background_cent_ct)
                del df_signal_cent_ct
                del df_background_cent_ct

                # downscale background
                background_tree_handler = background_tree_handler_full.get_subset(
                    size=int(0.8 * signal_tree_handler.get_n_cand()),
                    rndm_state=RANDOM_STATE)
                del background_tree_handler_full

                # features plot
                leg_labels = ['background', 'signal']
Example #12
REFERENCE_DIR = BASE_DIR.joinpath('references')

# define paths for files
DATA_FILE_PATH = DATA_DIR.joinpath('Bkg_Dpluspp7TeV_pT_1_50.root')
PROMPT_FILE_PATH = DATA_DIR.joinpath('Prompt_Dpluspp7TeV_pT_1_50.root')

# define dictionary for storing reference for the tests
INFO_DICT = {}

# preliminary check
if not REFERENCE_DIR.is_dir():
    sys.exit(
        "No 'references' dir was found, so no reference data were produced!")

# instantiate tree handler objects
DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus')
PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus')

# store number of candidates in the original data sample
INFO_DICT['n_data'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand()

# store original variable list
INFO_DICT['data_var_list'] = PROMPT_HDLR.get_var_names()
INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names()

# apply preselections
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

DATA_HDLR.apply_preselections(PRESEL_DATA)
Example #13
class Optimiserhipe4mltree:
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel,
                 hyper_pars):

        self.logger = get_logger()

        # directory
        #self.do_mlprefilter = datap.get("doml_asprefilter", None)
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        #if self.do_mlprefilter is True:
        #    self.dirmodel = self.dirmodel + "/prefilter"
        #    self.dirmlplot = self.dirmlplot + "/prefilter"
        #if self.do_mlprefilter is False:
        #    self.dirmodel = self.dirmodel + "/analysis"
        #    self.dirmlplot = self.dirmlplot + "/analysis"

        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels",
                                                      None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=nsigcand *
                                                      self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
            test_size=self.test_frac,
            random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted']
                and self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        outfilehyppars = open(hypparsfile, 'wt')
        sys.stdout = outfilehyppars
        self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                   self.bayesoptconfig_hipe4ml,
                                                   metric, self.nfold_hipe4ml,
                                                   self.init_points,
                                                   self.n_iter_hipe4ml,
                                                   self.njobs_hipe4ml)
        outfilehyppars.close()
        sys.stdout = sys.__stdout__
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06,
                            bottom=0.06,
                            right=0.99,
                            top=0.96,
                            hspace=0.55,
                            wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model,
            self.traintestdata,
            80,
            self.raw_output_hipe4ml,
            leglabels,
            self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            fig.savefig(figname)
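
A minimal usage sketch of the class above; data_param and hyper_pars stand for the parsed configuration dictionaries, and the selection string is a hypothetical example:

# Sketch: driving Optimiserhipe4mltree end to end
# (the constructor already calls preparesample() and load_hipe4mlmodel())
opt = Optimiserhipe4mltree(data_param, 2, 4, None,
                           'inv_mass < 1.82 or 1.92 < inv_mass < 2.00', hyper_pars)
opt.set_hipe4ml_modelpar()
opt.do_hipe4mltrain()
opt.do_hipe4mlplot()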
Example #14
MODEL_PARAMS = params['XGBOOST_PARAMS']
HYPERPARAMS = params['HYPERPARAMS']

path_to_data = "../Tables/"
results_ml_path = "../Results/PlotsML"
ml_model_path = "../Utils/Models"
utils_path = "../Utils"
efficiencies_path = "../Utils/Efficiencies"
selected_df_path = "../Utils/ReducedDataFrames"

print("---------------------------------------------")
print("Data loading...")

if training:

    signalH = TreeHandler(path_to_data + signal_table_name, "SignalTable")
    bkgH = TreeHandler(path_to_data + bkg_table_name, "DataTable")

    if bkg_fraction is not None:
        bkgH.shuffle_data_frame(size=bkg_fraction * len(signalH),
                                inplace=True,
                                random_state=52)

    train_test_data = au.train_test_generator([signalH, bkgH], [1, 0],
                                              test_size=0.5,
                                              random_state=42)

    if pp_mode:
        signalH.apply_preselections("pt>0 and rej_accept==True")
        training_columns = [
            "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi",