Example #1
    # read the pre-converted parquet data sample and apply the topological preselections
    df_data = pd.read_parquet('df/data_dataset')
    # df_data = uproot.open('../data/AnalysisResults.root')['LambdaTree'].arrays(library="pd")
    # df_data = df_data.append(df_data_r, ignore_index=True)
    df_data_cent = df_data.query(
        f'matter {split_ineq_sign} and centrality > {cent_bins[0]} and centrality < {cent_bins[1]} and pt > 0.5 and pt < 3 and ct > {ct_bins[0]} and ct < {ct_bins[1]} and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3'
    )
    del df_data

    # attach the BDT score and keep only candidates above the last threshold in the efficiency scan
    data_y_score = model_hdl.predict(df_data_cent)
    df_data_cent['model_output'] = data_y_score

    df_data_cent = df_data_cent.query(
        f'model_output > {score_eff_arrays_dict[bin][len(eff_array)-1]}'
    )
    df_data_cent.to_parquet(f'df/{bin}.parquet.gzip',
                            compression='gzip')
else:
    # otherwise read the data directly from the (large) ROOT file with TreeHandler
    df_data = TreeHandler()
    df_data.get_handler_from_large_file(
        DATA_PATH,
        "LambdaTree",
        preselection=
        f'matter {split_ineq_sign} and centrality > {cent_bins[0]} and centrality < {cent_bins[1]} and pt > 0.5 and pt < 3 and ct > {ct_bins[0]} and ct < {ct_bins[1]}',
        max_workers=8)

    df_data.apply_model_handler(model_hdl)
    df_data.apply_preselections(
        f'model_output > {score_eff_arrays_dict[bin][len(eff_array)-1]}'
    )
    df_data.write_df_to_parquet_files(bin, "df/")
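The fragment above relies on names defined earlier in the enclosing script: the centrality and ct bins, the matter/antimatter selection, the trained model handler, and the per-bin score thresholds. A minimal, purely illustrative sketch of such a context follows; every value, path, and file name below is an assumption, not taken from the original analysis:

import pandas as pd
from hipe4ml.model_handler import ModelHandler
from hipe4ml.tree_handler import TreeHandler

DATA_PATH = '../data/AnalysisResults.root'   # hypothetical input file

cent_bins = [0, 10]          # centrality interval (%)
ct_bins = [0, 40]            # ct interval (cm)
split_ineq_sign = '> 0.5'    # selects matter ('> 0.5') or antimatter ('< 0.5') candidates
bin = f'matter_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}'  # note: shadows the built-in bin(), kept for consistency with the fragment above

eff_array = [0.80, 0.85, 0.90]                   # BDT efficiencies scanned earlier (placeholder values)
score_eff_arrays_dict = {bin: [1.9, 1.5, 1.2]}   # one score threshold per efficiency (placeholder values)

# trained ModelHandler saved by an earlier training step (file name is an assumption)
model_hdl = ModelHandler()
model_hdl.load_model_handler(f'models/{bin}.pickle')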
Example #2
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent

    # initialize TreeHandler test
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open the reference objects
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)

    terminate_tree_handler_test_workspace(test_dir)

    # test that data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data Dataframe from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt Dataframe from parquet file differs from the root file one!'

    # test loading from multiple files
    merged_df = pd.concat(
        [data_hdlr.get_data_frame(),
         prompt_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(
        merged_df), 'loading of multiple root files not working!'
    merged_pq_df = pd.concat(
        [data_pq_hdlr.get_data_frame(),
         prompt_pq_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(
        merged_pq_df), 'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()

    # get the original variable list
    info_dict['data_var_list'] = data_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle dataframes
    new_hndl = data_hdlr.shuffle_data_frame(size=10,
                                            random_state=5,
                                            inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place shuffled dataframe differs from the non-in-place one'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place preselected dataframe differs from the non-in-place one'

    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()

    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply dummy eval() on the underlying data frame
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'In-place evaluated dataframe differs from the non-in-place one'

    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    info_dict['data_new_var_list'] = data_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both the data and prompt data frames with respect to pT
    bins = [[0, 2], [2, 10], [10, 25]]

    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)

    # store projection variable and binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()

    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get info from a single data slice
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)

    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test info_dict reproduction

    assert info_dict == reference_dict, 'dictionary with the data info differs from the reference!'

    # test sliced data frames reproduction
    assert data_slice_df.equals(
        reference_data_slice_df
    ), 'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(
        reference_prompt_slice_df
    ), 'prompt sliced DataFrame differs from the reference!'
Example #3
DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus')
PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus')

# store number of candidates in the original data sample
INFO_DICT['n_data'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand()

# store original variable list
INFO_DICT['data_var_list'] = DATA_HDLR.get_var_names()
INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names()

# apply preselections
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

DATA_HDLR.apply_preselections(PRESEL_DATA)
PROMPT_HDLR.apply_preselections(PRESEL_PROMPT)

# store number of selected data
INFO_DICT['n_data_preselected'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt_preselected'] = PROMPT_HDLR.get_n_cand()

# store preselections
INFO_DICT['data_preselections'] = DATA_HDLR.get_preselections()
INFO_DICT['prompt_preselections'] = PROMPT_HDLR.get_preselections()

# apply dummy eval() on the underlying data frame
DATA_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')
PROMPT_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')

# store new variable list
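The script is truncated here. Based on the test in Example #2, it must also produce two pickled sliced DataFrames and a pickled info dict. A minimal, hypothetical sketch of that final step follows; the subset sizes and pT bins mirror the test, while the SEED value and the output file names are assumptions:

import pickle

# new variable lists after the eval() above
INFO_DICT['data_new_var_list'] = DATA_HDLR.get_var_names()
INFO_DICT['prompt_new_var_list'] = PROMPT_HDLR.get_var_names()

SEED = 42  # assumption: must be the same seed used by the test

# subset and slice exactly as the test in Example #2 does
DATA_HDLR = DATA_HDLR.get_subset(size=3000, rndm_state=SEED)
PROMPT_HDLR = PROMPT_HDLR.get_subset(size=55, rndm_state=SEED)
PT_BINS = [[0, 2], [2, 10], [10, 25]]
DATA_HDLR.slice_data_frame('pt_cand', PT_BINS)
PROMPT_HDLR.slice_data_frame('pt_cand', PT_BINS)

# ... fill the remaining INFO_DICT keys compared by the test
# ('data_proj_variable', 'data_binning', 'n_data_slice', ...)

# persist the reference objects read back via pd.read_pickle / pickle.load
DATA_HDLR.get_slice(2).to_pickle('reference_data_slice_df.pkl')      # hypothetical file name
PROMPT_HDLR.get_slice(2).to_pickle('reference_prompt_slice_df.pkl')  # hypothetical file name
with open('reference_dict.pickle', 'wb') as handle:                  # hypothetical file name
    pickle.dump(INFO_DICT, handle)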
Example #4
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):

    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1

    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []

    for i in range(N_run):
        start = time.time()

        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1)
        model_hdl.train_test_model(train_test_data)

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc.append(roc_auc_score(train_test_data[3], y_pred_test))

        times.append(time.time() - start)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    roc = []  # reset so the Optuna ROC average is not mixed with the Bayes results
    for i in range(N_run):
        model_hdl.optimize_params_optuna(train_test_data,
                                         params_range,
                                         'roc_auc',
                                         timeout=np.mean(times),
                                         njobs=-1)
        model_hdl.train_test_model(train_test_data)

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')
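For context, here is a hypothetical call to the benchmark above. Every file name, table name, preselection, and hyperparameter range below is a placeholder chosen only to illustrate the expected dictionary layout, not taken from the original analysis:

filename_dict = {
    'data_path': '../data/',                   # hypothetical
    'analysis_path': '../analysis/',           # hypothetical
    'MC_signal_filename': 'SignalTable.root',  # hypothetical
    'MC_signal_table': 'SignalTable',          # hypothetical
    'train_bckg_filename': 'DataTable.root',   # hypothetical
    'train_bckg_table': 'DataTable',           # hypothetical
}
presel_dict = {'train_bckg_presel': 'm > 2.96 and m < 3.04'}  # hypothetical sideband cut

# (low, high) ranges explored by the optimizers; values are placeholders
params_range = {
    'max_depth': (2, 6),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (100, 500),
}

benchmark_hyperparam_optimizers(filename_dict, {}, params_range,
                                flag_dict={}, presel_dict=presel_dict)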
if training:

    signalH = TreeHandler(path_to_data + signal_table_name, "SignalTable")
    bkgH = TreeHandler(path_to_data + bkg_table_name, "DataTable")

    if bkg_fraction is not None:
        bkgH.shuffle_data_frame(size=bkg_fraction * len(signalH),
                                inplace=True,
                                random_state=52)

    train_test_data = au.train_test_generator([signalH, bkgH], [1, 0],
                                              test_size=0.5,
                                              random_state=42)

    if pp_mode:
        signalH.apply_preselections("pt>0 and rej_accept==True")
        training_columns = [
            "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi",
            "tpc_nsig_de", "tpc_nsig_pr", "tpc_nsig_pi", "dca_de_pr",
            "dca_de_pi", "dca_pr_pi", "dca_de_sv", "dca_pr_sv", "dca_pi_sv",
            "chi2"
        ]
    else:
        training_columns = [
            'TPCnSigmaHe3', 'ct', 'V0CosPA', 'ProngsDCA', 'He3ProngPvDCA',
            'PiProngPvDCA', 'He3ProngPvDCAXY', 'PiProngPvDCAXY',
            'NpidClustersHe3', 'TPCnSigmaPi'
        ]

    if not os.path.exists(results_ml_path):
        os.makedirs(results_ml_path)
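The snippet stops right after creating the output directory. What typically follows in a hipe4ml workflow is training a model on the split produced above; a minimal, hypothetical sketch under that assumption (the hyperparameters and the output file name are placeholders; training_columns, train_test_data, and results_ml_path come from the snippet above):

import os

import xgboost as xgb
from hipe4ml.model_handler import ModelHandler

# hyperparameters are placeholders, not taken from the original script
model_clf = xgb.XGBClassifier(max_depth=13, learning_rate=0.02, n_estimators=200)
model_hdl = ModelHandler(model_clf, training_columns)

# train on the split produced by train_test_generator above and score the test set
model_hdl.train_test_model(train_test_data)
y_pred_test = model_hdl.predict(train_test_data[2], output_margin=True)

# save the trained handler in the results directory (file name is an assumption)
model_hdl.dump_model_handler(os.path.join(results_ml_path, 'model_hdl.pickle'))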