def calc_eff_fixed(df_to_sel, sel_opt, main_dict, name, thr_value, do_std=False,
                   std_cuts_map=None, ibin_std_cuts=None):
    """Calculate the selection efficiency of a ML model for a fixed threshold.

    It works also for standard selections, setting do_std to True.

    Args:
        df_to_sel: pandas dataframe
        sel_opt: string, according to filter_df_cand options
        main_dict: dictionary of parameters loaded from
            'database_ml_parameters.yml' with case already selected
        name: name of the ML model
        thr_value: threshold value on ML model output
        do_std: True for standard selections
        std_cuts_map: optional mapping of standard cuts, used only when do_std
            is True. The "var_binning" key is skipped; every other entry must
            provide "name" (dataframe column), "min" and "max" (sequences
            indexed by ibin_std_cuts) and "isabsval", which are forwarded to
            selectcand_lincut. When None, the 'sel_std_analysis' option of
            filter_df_cand is used instead.
        ibin_std_cuts: index used to pick the "min"/"max" values of each cut

    Return:
        eff: efficiency
        eff_err: uncertainty

        Both are 0. when no candidate passes the sel_opt filter, since the
        ratio is undefined for an empty denominator.
    """
    df_sig = filter_df_cand(df_to_sel, main_dict, sel_opt)
    # The denominator is fixed here, before any further selection is applied.
    num_tot_cand = len(df_sig)

    if do_std:
        if std_cuts_map is None:
            num_sel_cand = len(filter_df_cand(df_sig, main_dict, 'sel_std_analysis'))
        else:
            # preselection on pid and track vars using bitmap
            df_sig = filter_df_cand(df_sig, main_dict, 'presel_track_pid')
            # apply standard cuts from file, one variable after the other
            for icutvar in std_cuts_map:
                if icutvar == "var_binning":
                    continue
                cut = std_cuts_map[icutvar]
                array_var = df_sig.loc[:, cut["name"]].values
                is_selected = selectcand_lincut(array_var,
                                                cut["min"][ibin_std_cuts],
                                                cut["max"][ibin_std_cuts],
                                                cut["isabsval"])
                df_sig = df_sig[is_selected]
            num_sel_cand = len(df_sig)
    else:
        num_sel_cand = len(df_sig[df_sig['y_test_prob' + name].values >= thr_value])

    if num_tot_cand == 0:
        # Empty sample: avoid ZeroDivisionError and report a null efficiency.
        return 0., 0.

    eff = num_sel_cand / num_tot_cand
    # Binomial uncertainty on the selected fraction.
    eff_err = np.sqrt(eff * (1 - eff) / num_tot_cand)
    return eff, eff_err
def calc_eff(df_to_sel, sel_opt, main_dict, name, num_steps, do_std=False,
             std_cuts_map=None, ibin_std_cuts=None):
    """Calculate the selection efficiency as a function of the threshold on the
    ML model output.

    It works also for standard selections, setting do_std to True. In this case
    the same value is repeated in the output lists to easily plot the standard
    selection efficiency together with the model one.

    Args:
        df_to_sel: pandas dataframe
        sel_opt: string, according to filter_df_cand options
        main_dict: dictionary of parameters loaded from
            'database_ml_parameters.yml' with case already selected
        name: name of the ML model
        num_steps: number of divisions on the model prediction output
        do_std: True for standard selections
        std_cuts_map: optional mapping of standard cuts, used only when do_std
            is True. The "var_binning" key is skipped; every other entry must
            provide "name" (dataframe column), "min" and "max" (sequences
            indexed by ibin_std_cuts) and "isabsval", which are forwarded to
            selectcand_lincut. When None, the 'sel_std_analysis' option of
            filter_df_cand is used instead.
        ibin_std_cuts: index used to pick the "min"/"max" values of each cut

    Return:
        eff_array: list of efficiencies as a function of the threshold on the
            model output
        eff_err_array: list of uncertainties
        x_axis: numpy array of threshold values

        Efficiencies and uncertainties are 0. when no candidate passes the
        sel_opt filter, since the ratio is undefined for an empty denominator.
    """
    df_sig = filter_df_cand(df_to_sel, main_dict, sel_opt)

    # About a tenth of the points scan [0, 0.49], the rest scan [0.5, 1].
    # Clamp at zero: for num_steps < 10 the count would be -1 and np.linspace
    # rejects a negative number of samples.
    ns_left = max(int(num_steps / 10) - 1, 0)
    ns_right = num_steps - ns_left
    x_axis_left = np.linspace(0., 0.49, ns_left)
    x_axis_right = np.linspace(0.5, 1.0, ns_right)
    x_axis = np.concatenate((x_axis_left, x_axis_right))

    num_tot_cand = len(df_sig)

    def _eff_and_err(num_sel_cand):
        # Selected fraction and its binomial uncertainty; null for an empty sample.
        if num_tot_cand == 0:
            return 0., 0.
        eff = num_sel_cand / num_tot_cand
        return eff, np.sqrt(eff * (1 - eff) / num_tot_cand)

    eff_array = []
    eff_err_array = []
    if not do_std:
        # Read the probability column once instead of once per threshold.
        prob = df_sig['y_test_prob' + name].values
        for thr in x_axis:
            eff, eff_err = _eff_and_err(int(np.count_nonzero(prob >= thr)))
            eff_array.append(eff)
            eff_err_array.append(eff_err)
    else:
        if std_cuts_map is None:
            num_sel_cand = len(filter_df_cand(df_sig, main_dict, 'sel_std_analysis'))
        else:
            # preselection on pid and track vars using bitmap
            df_sel = filter_df_cand(df_sig, main_dict, 'presel_track_pid')
            # apply standard cuts from file; iterating the map yields its keys,
            # so each cut definition is looked up through std_cuts_map[key]
            for icutvar in std_cuts_map:
                if icutvar == "var_binning":
                    continue
                cut = std_cuts_map[icutvar]
                array_var = df_sel.loc[:, cut["name"]].values
                is_selected = selectcand_lincut(array_var,
                                                cut["min"][ibin_std_cuts],
                                                cut["max"][ibin_std_cuts],
                                                cut["isabsval"])
                df_sel = df_sel[is_selected]
            # Count the candidates surviving the cuts, not the uncut sample.
            num_sel_cand = len(df_sel)

        eff, eff_err = _eff_and_err(num_sel_cand)
        # The standard selection has no threshold: repeat the single value.
        eff_array = [eff] * num_steps
        eff_err_array = [eff_err] * num_steps

    return eff_array, eff_err_array, x_axis
if presel_reco is not None: df = df.query(presel_reco) if useml == 0: if std_cuts_map is None: df = filter_df_cand(df, data[case], 'sel_std_analysis') df.to_pickle(namefiledf_std) else: #preselection on pid and track vars using bitmap df = filter_df_cand(df, data[case], 'presel_track_pid') #apply standard cuts from file for icutvar in std_cuts_map: if icutvar != "var_binning": array_var = df.loc[:, std_cuts_map[icutvar]["name"]].values is_selected = selectcand_lincut(array_var, \ std_cuts_map[icutvar]["min"][ibin_std_cuts], \ std_cuts_map[icutvar]["max"][ibin_std_cuts], \ std_cuts_map[icutvar]["isabsval"]) df = df[is_selected] df.to_pickle(namefiledf_std) elif useml == 1: df = filter_df_cand(df, data[case], 'presel_track_pid') mod = pickle.load(open(model, 'rb')) df = apply("BinaryClassification", [modelname], [mod], df, var_training) array_prob = df.loc[:, "y_test_prob" + modelname].values is_selected = selectcandidateml(array_prob, probcut) df = df[is_selected] df.to_pickle(namefiledf_ml) def selectcandidatesall(data, listdf, listdfout_ml, listdfout_std, pt_var, ptmin, ptmax, useml, modelname, model, probcut, case, std_cuts_map=None, \ ibin_std_cuts=None):