Beispiel #1
0
def calc_eff_fixed(df_to_sel, sel_opt, main_dict, name, thr_value, do_std=False,
                   std_cuts_map=None, ibin_std_cuts=None):
    """Compute the selection efficiency at a single working point.

    The denominator is the number of rows left after applying
    ``filter_df_cand(df_to_sel, main_dict, sel_opt)``. The numerator depends
    on the mode:

    * ``do_std`` False: rows whose ``'y_test_prob' + name`` column is greater
      than or equal to ``thr_value``;
    * ``do_std`` True and ``std_cuts_map`` None: rows passing the
      ``'sel_std_analysis'`` option of ``filter_df_cand``;
    * ``do_std`` True and ``std_cuts_map`` given: rows passing the
      ``'presel_track_pid'`` option of ``filter_df_cand`` and then every
      linear cut listed in ``std_cuts_map``.

    Args:
        df_to_sel: pandas dataframe
        sel_opt: string, according to filter_df_cand options
        main_dict: dictionary of parameters loaded from
                   'database_ml_parameters.yml' with case already selected
        name: name of the ML model (suffix of the probability column)
        thr_value: threshold value on ML model output
        do_std: True for standard selections
        std_cuts_map: optional mapping of cut definitions; every entry except
                      the "var_binning" key must provide "name", "min", "max"
                      and "isabsval"
        ibin_std_cuts: index used to pick the "min"/"max" value of each cut

    Return:
        eff: efficiency
        eff_err: uncertainty, computed as sqrt(eff * (1 - eff) / N)

    Raises:
        ZeroDivisionError: if no row survives the ``sel_opt`` filter.
    """
    candidates = filter_df_cand(df_to_sel, main_dict, sel_opt)
    n_total = len(candidates)

    if not do_std:
        # ML selection: keep rows at or above the probability threshold
        passes_thr = candidates['y_test_prob' + name].values >= thr_value
        n_selected = len(candidates[passes_thr])
    elif std_cuts_map is None:
        n_selected = len(filter_df_cand(candidates, main_dict, 'sel_std_analysis'))
    else:
        # preselection on pid and track vars using bitmap
        surviving = filter_df_cand(candidates, main_dict, 'presel_track_pid')
        # apply standard cuts from file, one variable after the other
        for cut_key in std_cuts_map:
            if cut_key == "var_binning":
                continue
            var_values = surviving.loc[:, std_cuts_map[cut_key]["name"]].values
            keep_mask = selectcand_lincut(var_values,
                                          std_cuts_map[cut_key]["min"][ibin_std_cuts],
                                          std_cuts_map[cut_key]["max"][ibin_std_cuts],
                                          std_cuts_map[cut_key]["isabsval"])
            surviving = surviving[keep_mask]
        n_selected = len(surviving)

    eff = n_selected / n_total
    eff_err = np.sqrt(eff * (1 - eff) / n_total)

    return eff, eff_err
Beispiel #2
0
def calc_eff(df_to_sel, sel_opt, main_dict, name, num_steps, do_std=False,
             std_cuts_map=None, ibin_std_cuts=None):
    """Calculate the selection efficiency as a function of the threshold on the ML model output.

    It works also for standard selections, setting do_std to True. In this case the same value is
    repeated in the output array to easily plot the standard selection efficiency together with
    the model one.

    Args:
        df_to_sel: pandas dataframe
        sel_opt: string, according to filter_df_cand options
        main_dict: dictionary of parameters loaded from 'database_ml_parameters.yml' with case
                   already selected
        name: name of the ML model (suffix of the 'y_test_prob' column)
        num_steps: number of divisions on the model prediction output
        do_std: True for standard selections
        std_cuts_map: optional cut definitions applied when do_std is True. Either a mapping
                      keyed by cut variable (the "var_binning" key is skipped) or a sequence of
                      cut dictionaries (entries whose "name" is "var_binning" are skipped);
                      each cut provides "name", "min", "max" and "isabsval"
        ibin_std_cuts: index used to pick the "min"/"max" value of each cut

    Return:
        eff_array: list of efficiencies as a function of the threshold on the model output
        eff_err_array: list of uncertainties, each sqrt(eff * (1 - eff) / N)
        x_axis: array of num_steps threshold values; about a tenth of the points span
                [0, 0.49] and the remaining ones span [0.5, 1]

    Raises:
        ZeroDivisionError: if an efficiency is computed while no row survives the sel_opt filter.
    """
    df_sig = filter_df_cand(df_to_sel, main_dict, sel_opt)

    # Clamp to zero so that num_steps < 10 does not ask linspace for a negative
    # number of points; for num_steps >= 10 the split is unchanged.
    ns_left = max(int(num_steps / 10) - 1, 0)
    ns_right = num_steps - ns_left
    x_axis = np.concatenate((np.linspace(0., 0.49, ns_left),
                             np.linspace(0.5, 1.0, ns_right)))
    num_tot_cand = len(df_sig)

    if not do_std:
        eff_array = []
        eff_err_array = []
        # the probability column does not depend on the threshold: read it once
        prob = df_sig['y_test_prob' + name].values
        for thr in x_axis:
            num_sel_cand = np.count_nonzero(prob >= thr)
            eff = num_sel_cand / num_tot_cand
            eff_err = np.sqrt(eff * (1 - eff) / num_tot_cand)
            eff_array.append(eff)
            eff_err_array.append(eff_err)
        return eff_array, eff_err_array, x_axis

    if std_cuts_map is None:
        num_sel_cand = len(filter_df_cand(df_sig, main_dict, 'sel_std_analysis'))
    else:
        # preselection on pid and track vars using bitmap
        df_cut = filter_df_cand(df_sig, main_dict, 'presel_track_pid')
        # Accept the mapping layout used by calc_eff_fixed (keyed by cut
        # variable) as well as a plain sequence of cut dictionaries.
        if hasattr(std_cuts_map, "items"):
            cuts = [cut for key, cut in std_cuts_map.items() if key != "var_binning"]
        else:
            cuts = [cut for cut in std_cuts_map if cut["name"] != "var_binning"]
        # apply standard cuts from file
        for cut in cuts:
            array_var = df_cut.loc[:, cut["name"]].values
            is_selected = selectcand_lincut(array_var, cut["min"][ibin_std_cuts],
                                            cut["max"][ibin_std_cuts], cut["isabsval"])
            df_cut = df_cut[is_selected]
        # count the candidates that survived the cuts, not the denominator sample
        num_sel_cand = len(df_cut)

    eff = num_sel_cand / num_tot_cand
    eff_err = np.sqrt(eff * (1 - eff) / num_tot_cand)
    # the standard selection has no threshold: repeat the value for plotting
    eff_array = [eff] * num_steps
    eff_err_array = [eff_err] * num_steps

    return eff_array, eff_err_array, x_axis
Beispiel #3
0
    if presel_reco is not None:
        df = df.query(presel_reco)

    if useml == 0:
        if std_cuts_map is None:
            df = filter_df_cand(df, data[case], 'sel_std_analysis')
            df.to_pickle(namefiledf_std)
        else:
            #preselection on pid and track vars using bitmap
            df = filter_df_cand(df, data[case], 'presel_track_pid')
            #apply standard cuts from file
            for icutvar in std_cuts_map:
                if icutvar != "var_binning":
                    array_var = df.loc[:, std_cuts_map[icutvar]["name"]].values
                    is_selected = selectcand_lincut(array_var, \
                            std_cuts_map[icutvar]["min"][ibin_std_cuts], \
                            std_cuts_map[icutvar]["max"][ibin_std_cuts], \
                            std_cuts_map[icutvar]["isabsval"])
                    df = df[is_selected]
            df.to_pickle(namefiledf_std)
    elif useml == 1:
        df = filter_df_cand(df, data[case], 'presel_track_pid')
        mod = pickle.load(open(model, 'rb'))
        df = apply("BinaryClassification", [modelname], [mod], df, var_training)
        array_prob = df.loc[:, "y_test_prob" + modelname].values
        is_selected = selectcandidateml(array_prob, probcut)
        df = df[is_selected]
        df.to_pickle(namefiledf_ml)

def selectcandidatesall(data, listdf, listdfout_ml, listdfout_std, pt_var, ptmin, ptmax,
                        useml, modelname, model, probcut, case, std_cuts_map=None, \
                            ibin_std_cuts=None):