def load_config(user_path: str, default_path: tuple) -> dict:
    """
    Quickly extract either configuration given by user and fall back to
    package default if no user config given.

    Args:
        user_path: path to YAML file, or None to use the package default
        default_path: tuple (package, resource name) locating the default

    Returns:
        dictionary built from YAML
    """
    logger = get_logger()
    if user_path is None:
        # Fall back to the resource shipped with the package
        # (leftover debug print removed)
        stream = resource_stream(default_path[0], default_path[1])
    else:
        if not exists(user_path):
            # fatal is expected to abort; lazy %-formatting per logging convention
            logger.fatal("The file %s does not exist.", user_path)
        stream = open(user_path)
    try:
        return yaml.load(stream, yaml.FullLoader)
    finally:
        # Close the stream in all cases to avoid a file-handle leak
        stream.close()
class AnalyzerManager:
    """
    Manager class handling analysis and systematic objects
    """

    def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args):
        # Static configuration describing which analysis to run
        (self.ana_class, self.database, self.case,
         self.typean, self.doperiodbyperiod) = (ana_class, database, case,
                                                typean, doperiodbyperiod)
        # Additional arguments to be forwarded to the analyzers
        self.add_args = args
        self.logger = get_logger()
        # Runtime state, populated lazily on initialization
        self.analyzers = []
        self.after_burner = None
        self.is_initialized = False
def calc_sigeff_steps(num_steps, df_sig, name, multiclass_labels):
    """
    Scan the model-output threshold(s) and compute the signal efficiency
    at each step.

    Args:
        num_steps: number of threshold values per axis
        df_sig: dataframe of signal candidates with 'y_test_prob<name>...' columns
        name: model name used to build the probability column names
        multiclass_labels: None for the binary case, otherwise the two class
            labels whose probabilities are cut on

    Returns:
        (efficiencies, efficiency errors, threshold axis)
    """
    logger = get_logger()
    if multiclass_labels is None:
        # Sample the high-probability region (>= 0.5) more densely
        n_coarse = int(num_steps / 10) - 1
        x_axis = np.concatenate((np.linspace(0., 0.49, n_coarse),
                                 np.linspace(0.5, 1.0, num_steps - n_coarse)))
    else:
        x_axis = np.linspace(0, 0.4, num_steps)
    if df_sig.empty:
        logger.error("In division denominator is empty")
        # NOTE(review): in the multiclass case the full scan would produce
        # num_steps**2 entries; this early return keeps num_steps zeros
        return [0] * num_steps, [0] * num_steps, x_axis
    num_tot_cand = len(df_sig)
    eff_array = []
    eff_err_array = []
    if multiclass_labels is not None:
        # 2D scan: upper cut on the first class, lower cut on the second
        col0 = 'y_test_prob' + name + multiclass_labels[0]
        col1 = 'y_test_prob' + name + multiclass_labels[1]
        for thr0 in x_axis:
            for thr1 in x_axis:
                query = col0 + ' <= ' + str(thr0) + ' and ' + col1 + ' >= ' + str(thr1)
                eff, err_eff = calc_eff(len(df_sig.query(query)), num_tot_cand)
                eff_array.append(eff)
                eff_err_array.append(err_eff)
    else:
        probs = df_sig['y_test_prob' + name].values
        for thr in x_axis:
            eff, err_eff = calc_eff(int(np.count_nonzero(probs >= thr)), num_tot_cand)
            eff_array.append(eff)
            eff_err_array.append(err_eff)
    return eff_array, eff_err_array, x_axis
def calc_bkg(df_bkg, name, num_step, fit_region, bin_width, sig_region):
    """
    Estimate the number of background candidates under the signal peak. This is obtained
    from real data with a fit of the sidebands of the invariant mass distribution.

    Args:
        df_bkg: dataframe of background candidates; must provide the columns
            'y_test_prob' + name (model output) and 'inv_mass_ML'
        name: model name used to build the probability column name
        num_step: number of probability thresholds to scan in [0, 1]
        fit_region: (min, max) invariant-mass window used for the fit
        bin_width: requested mass-histogram bin width (rounded below so that
            the fit region contains an integer number of bins)
        sig_region: (min, max) invariant-mass window of the signal peak

    Returns:
        (background counts, background-count errors, threshold axis)
    """
    logger = get_logger()
    x_axis = np.linspace(0, 1.00, num_step)
    bkg_array = []
    bkg_err_array = []
    # Round the number of bins and recompute the width so the bins exactly
    # tile the fit region
    num_bins = (fit_region[1] - fit_region[0]) / bin_width
    num_bins = int(round(num_bins))
    bin_width = (fit_region[1] - fit_region[0]) / num_bins
    logger.debug("To fit the bkg an exponential function is used")
    for thr in x_axis:
        # Defaults used when no meaningful fit can be performed at this threshold
        bkg = 0.
        bkg_err = 0.
        hmass = TH1F('hmass', '', num_bins, fit_region[0], fit_region[1])
        bkg_sel_mask = df_bkg['y_test_prob' + name].values >= thr
        sel_mass_array = df_bkg[bkg_sel_mask]['inv_mass_ML'].values
        # Require a minimum number of selected candidates for the fit
        if len(sel_mass_array) > 5:
            for mass_value in np.nditer(sel_mass_array):
                hmass.Fill(mass_value)
            # 'Q' = quiet; a returned fit status of 0 means the fit succeeded
            fit = hmass.Fit('expo', 'Q', '', fit_region[0], fit_region[1])
            if int(fit) == 0:
                fit_func = hmass.GetFunction('expo')
                # Integrate the exponential over the signal window and convert
                # the integral to a candidate count via the bin width
                bkg = fit_func.Integral(sig_region[0], sig_region[1]) / bin_width
                bkg_err = fit_func.IntegralError(sig_region[0], sig_region[1]) / bin_width
                del fit_func
        bkg_array.append(bkg)
        bkg_err_array.append(bkg_err)
        # Delete the ROOT histogram explicitly to avoid name clashes in
        # ROOT's global directory on the next iteration
        del hmass
    return bkg_array, bkg_err_array, x_axis
def calc_sigeff_steps(num_steps, df_sig, name):
    """
    Scan the model-output threshold and compute the signal efficiency at
    each step.

    Args:
        num_steps: number of threshold values
        df_sig: dataframe of signal candidates with a 'y_test_prob<name>' column
        name: model name used to build the probability column name

    Returns:
        (efficiencies, efficiency errors, threshold axis)
    """
    logger = get_logger()
    # Sample the high-probability region (>= 0.5) more densely
    n_coarse = int(num_steps / 10) - 1
    x_axis = np.concatenate((np.linspace(0., 0.49, n_coarse),
                             np.linspace(0.5, 1.0, num_steps - n_coarse)))
    if df_sig.empty:
        logger.error("In division denominator is empty")
        return [0] * num_steps, [0] * num_steps, x_axis
    num_tot_cand = len(df_sig)
    probs = df_sig['y_test_prob' + name].values
    eff_array = []
    eff_err_array = []
    for thr in x_axis:
        eff, err_eff = calc_eff(int(np.count_nonzero(probs >= thr)), num_tot_cand)
        eff_array.append(eff)
        eff_err_array.append(err_eff)
    return eff_array, eff_err_array, x_axis
def getclf_xgboost(model_config):
    """
    Instantiate every xgboost model listed in the configuration.

    Args:
        model_config: dict possibly containing an "xgboost" section mapping
            template names to their parameter dicts

    Returns:
        (list of classifiers, list of their names); both empty when no
        xgboost section is configured
    """
    logger = get_logger()
    logger.debug("Load xgboost models")
    if "xgboost" not in model_config:
        logger.debug("No xgboost models found")
        return [], []
    classifiers = []
    names = []
    for model_name, model_params in model_config["xgboost"].items():
        # Each entry names a factory function in templates_xgboost
        try:
            clf = getattr(templates_xgboost, model_name)(model_params)
            classifiers.append(clf)
            names.append(model_name)
            logger.info("Added xgboost model %s", model_name)
        except AttributeError:
            logger.critical("Could not load xgboost model %s", model_name)
    return classifiers, names
def make_and_fill(self, binx, namex, biny=None, namey=None):
    """
    Makes histogram and fills them based on their axis titles.

    Args:
        binx: binning for the X axis
        namex: dataframe column used for the X axis
        biny: binning for the Y axis (2D case only)
        namey: dataframe column for the Y axis; if given, a 2D histogram
            is built, otherwise a 1D one

    Returns:
        None; the histogram is appended to self.histograms (skipped with a
        warning if a requested column is missing)
    """
    # The X column is needed in both the 1D and the 2D case; check it once
    # here instead of duplicating the test in each branch as before.
    if namex not in self.source_dataframe:
        get_logger().warning(
            "Columns %s for X axis does not exist in dataframe, skipping histogram", namex)
        return
    if namey:
        if namey not in self.source_dataframe:
            get_logger().warning(
                "Columns %s for Y axis does not exist in dataframe, skipping histogram", namey)
            return
        h_name = f"hVal_{namex}_vs_{namey}{self.collection_tag}"
        h_tit = f" ; {namex} ; {namey}"
        h = makefill2dhist(self.source_dataframe, h_name, binx, biny, namex, namey)
        h.SetTitle(h_tit)
    else:
        h_name = f"hVal_{namex}{self.collection_tag}"
        h_tit = f" ; {namex} ; Entries"
        h = makefill1dhist(self.source_dataframe, h_name, h_tit, binx, namex)
    if self.verbose:
        get_logger().info("Filling histogram %s", h.GetName())
    self.histograms.append(h)
def prep_mlsamples(df_sig, df_bkg, namesig, nevt_sig, nevt_bkg, test_frac, rnd_splt):
    """
    Cap the signal/background samples, attach the truth label and split
    into training and testing sets.

    Args:
        df_sig: dataframe of signal candidates
        df_bkg: dataframe of background candidates
        namesig: name of the label column to create (1 = signal, 0 = background)
        nevt_sig: requested number of signal events (capped to availability)
        nevt_bkg: requested number of background events (capped to availability)
        test_frac: fraction of the merged sample reserved for testing
        rnd_splt: random seed for the train/test split

    Returns:
        (training dataframe, testing dataframe)
    """
    logger = get_logger()
    if nevt_sig > len(df_sig):
        logger.warning("There are not enough signal events")
    if nevt_bkg > len(df_bkg):
        logger.warning("There are not enough background events")
    nevt_sig = min(len(df_sig), nevt_sig)
    nevt_bkg = min(len(df_bkg), nevt_bkg)
    logger.info("Used number of signal events is %d", nevt_sig)
    logger.info("Used number of background events is %d", nevt_bkg)
    # Work on explicit copies: assigning a column on a slice of the caller's
    # dataframe triggers pandas' SettingWithCopyWarning and could silently
    # mutate (or fail to mutate) the input.
    df_sig = df_sig[:nevt_sig].copy()
    df_bkg = df_bkg[:nevt_bkg].copy()
    df_sig[namesig] = 1
    df_bkg[namesig] = 0
    # (dead "df_ml = pd.DataFrame()" initialisation removed - it was
    # immediately overwritten by the concat)
    df_ml = pd.concat([df_sig, df_bkg])
    df_ml_train, df_ml_test = train_test_split(df_ml, test_size=test_frac,
                                               random_state=rnd_splt)
    logger.info("%d events for training and %d for testing",
                len(df_ml_train), len(df_ml_test))
    return df_ml_train, df_ml_test
def do_gridsearch(names, classifiers, param_grid, refit_arr, x_train, y_train_, cv_, ncores):
    """
    Run a cross-validated grid search for every classifier and refit the
    best estimator on the full training sample.

    Args:
        names: classifier names (iterated in lockstep, values unused)
        classifiers: estimators to optimize
        param_grid: one parameter grid per classifier
        refit_arr: one refit setting per classifier
        x_train: training features
        y_train_: training targets
        cv_: cross-validation splitting strategy
        ncores: number of parallel jobs

    Returns:
        (fitted grid-search objects, refitted best estimators,
         per-classifier cv_results_ dataframes)
    """
    logger = get_logger()
    grid_search_models_ = []
    grid_search_bests_ = []
    list_scores_ = []
    for _, clf, param_cv, refit in zip(names, classifiers, param_grid, refit_arr):
        grid_search = GridSearchCV(clf, param_cv, cv=cv_, refit=refit,
                                   scoring='neg_mean_squared_error', n_jobs=ncores)
        grid_search_model = grid_search.fit(x_train, y_train_)
        cvres = grid_search.cv_results_
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            # BUG FIX: the RMSE was previously passed as the log *message*
            # with `params` as a spare argument, which makes the logging
            # module raise a string-formatting error. Use a proper lazy
            # format string instead.
            logger.info("%s %s", np.sqrt(-mean_score), params)
        list_scores_.append(pd.DataFrame(cvres))
        # Refit the best configuration on the whole training sample
        grid_search_best = grid_search.best_estimator_.fit(x_train, y_train_)
        grid_search_models_.append(grid_search_model)
        grid_search_bests_.append(grid_search_best)
    return grid_search_models_, grid_search_bests_, list_scores_
def create_mlsamples(df_sig, df_bkg, sel_opt_sig, main_dict, sel_bkg,
                     rnd_shuffle,  # pylint: disable=too-many-arguments
                     var_signal, var_training, nevt_sig, nevt_bkg, test_frac,
                     rnd_splt):
    """
    Build shuffled, labelled training/testing samples from raw signal and
    background dataframes.

    Returns:
        (train, test, signal-train, bkg-train, signal-test, bkg-test,
         x_train, y_train, x_test, y_test)
    """
    logger = get_logger()
    # Apply the candidate selections and randomize the order
    df_sig = shuffle(filter_df_cand(df_sig, main_dict, sel_opt_sig),
                     random_state=rnd_shuffle)
    df_bkg = shuffle(df_bkg.query(sel_bkg), random_state=rnd_shuffle)
    # Cap, label and split into train/test
    df_ml_train, df_ml_test = prep_mlsamples(df_sig, df_bkg, var_signal,
                                             nevt_sig, nevt_bkg, test_frac,
                                             rnd_splt)
    df_sig_train, df_bkg_train = split_df_sigbkg(df_ml_train, var_signal)
    df_sig_test, df_bkg_test = split_df_sigbkg(df_ml_test, var_signal)
    logger.info("Events for ml train %d and test %d",
                len(df_ml_train), len(df_ml_test))
    logger.info("Events for signal train %d and test %d",
                len(df_sig_train), len(df_sig_test))
    logger.info("Events for bkg train %d and test %d",
                len(df_bkg_train), len(df_bkg_test))
    # Feature/target views used directly by the training code
    x_train = df_ml_train[var_training]
    y_train = df_ml_train[var_signal]
    x_test = df_ml_test[var_training]
    y_test = df_ml_test[var_signal]
    return df_ml_train, df_ml_test, df_sig_train, df_bkg_train, df_sig_test, df_bkg_test, \
        x_train, y_train, x_test, y_test
def assert_model_config(self):  # pylint: disable=R0912
    """
    Validate the user model configuration against the package defaults and
    store the merged result in self.model_config.

    Reads self.model_config_input (YAML path or dict) and self.run_config
    ("mltype", "activate_models"); models not activated are dropped, and
    user-supplied parameters override the defaults of the surviving models.
    """
    logger = get_logger()
    logger.debug("Check sanity of user configs")
    user_config = {}
    if isinstance(self.model_config_input, str):
        # Given as a path: parse the YAML file
        user_config = parse_yaml(os.path.expanduser(self.model_config_input))
    elif isinstance(self.model_config_input, dict):
        user_config = self.model_config_input
    # At this point the asserted_config dict is just the one with defaults
    asserted_config = Configuration.get_meta_config("models")[self.run_config["mltype"]]
    user_config = user_config.get(self.run_config["mltype"], {})
    # Could probably merged with the former loop, however, would like to see whether there are
    # e.g. typos. Because steering a run wanting keras - but writing kras - could cost a lot of
    # time when it needs to be done again.
    if self.run_config["mltype"] in self.run_config["activate_models"]:
        for backend, model in \
                self.run_config["activate_models"][self.run_config["mltype"]].items():
            # critical is expected to abort on invalid input
            if backend not in asserted_config:
                logger.critical("Unknown backend %s.", backend)
            if model is None:
                logger.critical("No models specified for backend %s.", backend)
            for name, activate in model.items():
                if name not in asserted_config[backend]:
                    logger.critical("Unknown model %s for backend %s.", name, backend)
                if name in asserted_config[backend]:
                    if activate is None or not isinstance(activate, bool):
                        logger.critical("Activation value of model %s for backend %s " \
                                        "must be specified as boolean value.", name, backend)
                    asserted_config[backend][name]["activate"] = activate
    # Pop deactivated models
    for backend in list(asserted_config.keys()):
        for model in list(asserted_config[backend].keys()):
            if not asserted_config[backend][model]["activate"]:
                del asserted_config[backend][model]
            else:
                # Keep only the parameter defaults for activated models ...
                asserted_config[backend][model] = asserted_config[backend][model]["default"]
                # ... and overlay any user-provided parameter values
                if backend in user_config and model in user_config[backend]:
                    if len(user_config[backend][model]) != len(asserted_config[backend][model]):
                        logger.critical("Parameter list for %s model %s differs", backend, model)
                    for u in asserted_config[backend][model]:
                        asserted_config[backend][model][u] = \
                            user_config[backend][model].get(u, asserted_config[backend][model][u])
    self.model_config = asserted_config
from machine_learning_hep.models import fit, savemodels, test, apply, decisionboundaries from machine_learning_hep.models import importanceplotall from machine_learning_hep.mlperformance import cross_validation_mse, cross_validation_mse_continuous from machine_learning_hep.mlperformance import plot_cross_validation_mse, plot_learning_curves # from machine_learning_hep.mlperformance import confusion, plot_overtraining from machine_learning_hep.mlperformance import precision_recall from machine_learning_hep.grid_search import do_gridsearch, read_grid_dict, perform_plot_gridsearch from machine_learning_hep.logger import get_logger from machine_learning_hep.optimization import study_signif from machine_learning_hep.efficiency import study_eff DATA_PREFIX = os.path.expanduser("~/.machine_learning_hep") def doclassification_regression(run_config, data, model_config, case, binmin, binmax): # pylint: disable=too-many-locals, too-many-statements, too-many-branches logger = get_logger() logger.info("Start classification_regression run") mltype = run_config['mltype'] mlsubtype = run_config['mlsubtype'] loadsampleoption = run_config['loadsampleoption'] rnd_shuffle = run_config['rnd_shuffle'] nevt_sig = run_config['nevt_sig'] nevt_bkg = run_config['nevt_bkg'] test_frac = run_config['test_frac'] rnd_splt = run_config['rnd_splt'] docorrelation = run_config['docorrelation'] dostandard = run_config['dostandard'] dopca = run_config['dopca'] dotraining = run_config['dotraining'] dotesting = run_config['dotesting']
def assert_run_config(self):
    """
    Validate the user run configuration against the package defaults and
    store the merged result in self.run_config.

    Reads self.run_config_input (YAML path or dict); checks unknown keys,
    allowed choices, value types and inter-parameter dependencies declared
    in the "run" meta configuration.
    """
    logger = get_logger()
    logger.debug("Check sanity of user configs")
    user_run_config = {}
    if isinstance(self.run_config_input, str):
        # Given as a path: parse the YAML file
        user_run_config = parse_yaml(os.path.expanduser(self.run_config_input))
    elif isinstance(self.run_config_input, dict):
        user_run_config = self.run_config_input
    # At this point the asserted_config dict is just the one with defaults
    run_config = Configuration.get_meta_config("run")
    asserted_config = {k: run_config[k]["default"] for k in run_config}
    # Optional per-parameter constraints from the meta configuration
    choices_config = {
        k: run_config[k]["choices"]
        for k in run_config if "choices" in run_config[k]
    }
    depends_config = {
        k: run_config[k]["depends"]
        for k in run_config if "depends" in run_config[k]
    }
    types_config = {
        k: run_config[k]["type_as"]
        for k in run_config if "type_as" in run_config[k]
    }
    # Check for unknown parameters and abort since running entire machinery with wrong
    # setting (e.g. 'dotaining' instead of 'dotraining' might happen just by accident)
    # could be just overhead.
    for k in user_run_config:
        if k not in asserted_config:
            # NOTE(review): "Unkown" typo in the message text (runtime string,
            # left unchanged here)
            logger.critical("Unkown parameter %s in config", k)
        elif user_run_config[k] is None:
            logger.critical("Missing value for parameter %s in config", k)
    # Replace all defaults if user specified parameter
    for k in asserted_config:
        asserted_config[k] = user_run_config.get(k, asserted_config[k])
        # If parameter is already set, check if consistent
        if k in choices_config and asserted_config[k] not in choices_config[k]:
            logger.critical("Invalid value %s for parameter %s. Must be one of %s",
                            str(user_run_config[k]), k, str(choices_config[k]))
        if k in types_config:
            # Types are given by example values; compare against their types
            check_types = [type(t) for t in types_config[k]]
            if not isinstance(asserted_config[k], tuple(check_types)):
                logger.critical("Invalid value type %s of parameter %s. Must be of type %s",
                                str(type(asserted_config[k])), k, str(check_types))
    # Can so far only depend on one parameter, change to combination
    # of parameters. Do we need to check for circular dependencies?
    for k in depends_config:
        if (asserted_config[depends_config[k]["parameter"]] == depends_config[k]["value"]
                and asserted_config[k] != depends_config[k]["set"]):
            asserted_config[k] = depends_config[k]["set"]
            logger.info("Parameter %s = %s enforced since it is required for %s == %s",
                        k, str(depends_config[k]["set"]),
                        str(depends_config[k]["parameter"]), str(depends_config[k]["value"]))
    self.run_config = asserted_config
def make_asymm_y_errors(*args):
    """
    Build asymmetric y-error entries from flat (low, up) pairs.

    Args:
        *args: an even number of values, read as consecutive (low, up)
            pairs, one pair per central value

    Returns:
        list of [0, 0, low, up] entries (the x errors are always zero)
    """
    if len(args) % 2 != 0:
        get_logger().fatal(
            "Need an even number ==> ((low, up) * n_central) of errors")
    # Pair up consecutive values: args[0::2] are lows, args[1::2] are ups
    return [[0, 0, low, up] for low, up in zip(args[0::2], args[1::2])]
def signal_func(sgnfunc):
    """
    Return the ROOT TFormula string for the requested signal shape.

    Args:
        sgnfunc: signal function identifier; only "kGaus" is supported,
            anything else triggers a fatal log message

    Returns:
        formula string of a normalized Gaussian with
        [0] = integral, [1] = mean, [2] = sigma
    """
    gauss = "[0]/(sqrt(2.*pi))/[2]*(exp(-(x-[1])*(x-[1])/2./[2]/[2]))"
    if sgnfunc != "kGaus":
        get_logger().fatal("Unknown signal fit function %s", sgnfunc)
    return gauss
def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Ds(MB or mult) / D0(MB or mult).
    Check if applicable to your situation

    Args:
        errnum_list: systematic errors of the numerator; provides `errors`
            (dict: systematic name -> per-bin error lists) and `names` (list
            of the systematic names in the same order as `errors`)
        errden_list: same structure for the denominator
        n_bins: number of analysis bins
        justfd: -99 (default) propagates all sources; True propagates only
            feed-down related sources; False only non-feed-down sources

    Returns:
        numpy array (n_bins x 4) with the quadratically combined errors
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    # Both inputs must provide one error entry per analysis bin
    if n_bins != len(list(errnum_list.errors.values())[0]) or \
            n_bins != len(list(errden_list.errors.values())[0]):
        get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \
                           n_bins, len(list(errnum_list.errors.values())[0]), \
                           len(list(errden_list.errors.values())[0]))
    # Systematic sources with a defined propagation rule
    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \
                "feeddown_NB", "sigmav0", "branching_ratio"]
    j = 0
    for (_, errnum), (_, errden) in zip(errnum_list.errors.items(), errden_list.errors.items()):
        for i in range(n_bins):
            if errnum_list.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s", errnum_list.names[j])
            if errnum_list.names[j] != errden_list.names[j]:
                get_logger().fatal("Names not in same order: %s vs %s", \
                                   errnum_list.names[j], errden_list.names[j])
            # nb: 0/1 unused by asymmetric sources, 2 = low, 3 = high component
            for nb in range(len(tot_list[i])):
                if errnum_list.names[j] == "yield" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + \
                        errden[i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "cut" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + \
                        errden[i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "pid" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "feeddown_mult_spectra" and justfd is not False:
                    #Fully correlated
                    ynum = errnum_list.errors["feeddown_NB"][i][4]
                    yden = errden_list.errors["feeddown_NB"][i][4]
                    #Relative uncertainties stored, make absolute
                    ynuml = ynum - ynum * errnum[i][2]
                    ydenl = yden - yden * errden[i][2]
                    ynumh = ynum + ynum * errnum[i][3]
                    ydenh = yden + yden * errden[i][3]
                    # Ratio for the (low, central, high) variations
                    rat = [ynuml / ydenl, ynum / yden, ynumh / ydenh]
                    minsys = min(rat)
                    maxsys = max(rat)
                    # Only the asymmetric components (nb 2/3) are filled
                    if nb == 2:
                        tot_list[i][nb] += (rat[1] - minsys) * \
                            (rat[1] - minsys) / (rat[1] * rat[1])
                    if nb == 3:
                        tot_list[i][nb] += (maxsys - rat[1]) * \
                            (maxsys - rat[1]) / (rat[1] * rat[1])
                elif errnum_list.names[j] == "feeddown_mult" and justfd is not False:
                    #Spectra here, skip ratio systematic
                    pass
                elif errnum_list.names[j] == "trigger" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "feeddown_NB" and justfd is not False:
                    #Fully correlated under assumption central Fc value stays within Nb syst
                    ynum = errnum[i][4]
                    yden = errden[i][4]
                    #Absolute uncertainties stored
                    ynuml = ynum - errnum[i][2]
                    ydenl = yden - errden[i][2]
                    ynumh = ynum + errnum[i][3]
                    ydenh = yden + errden[i][3]
                    rat = [ynuml / ydenl, ynum / yden, ynumh / ydenh]
                    minsys = min(rat)
                    maxsys = max(rat)
                    if nb == 2:
                        tot_list[i][nb] += (rat[1] - minsys) * \
                            (rat[1] - minsys) / (rat[1] * rat[1])
                    if nb == 3:
                        tot_list[i][nb] += (maxsys - rat[1]) * \
                            (maxsys - rat[1]) / (rat[1] * rat[1])
                elif errnum_list.names[j] == "multiplicity_weights" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "track" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "ptshape" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + \
                        errden[i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "multiplicity_interval" and justfd is not True:
                    #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels
                    #We use 1/3 of systematic of numerator
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] / 9
                elif errnum_list.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list.names[j] == "branching_ratio" and justfd is not True:
                    #Uncorrelated, but usually not plotted in boxes, so pass
                    pass
        j = j + 1
    # Contributions were summed in quadrature; take the square root
    tot_list = np.sqrt(tot_list)
    return tot_list
def calc_systematic_mesondoubleratio(errnum_list1, errnum_list2, errden_list1, \
                                     errden_list2, n_bins, dropbins=None, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Lc/D0_mult-i / Lc/D0_mult-j.
    Check if applicable to your situation

    Args:
        errnum_list1, errnum_list2: systematic errors of the two numerator
            spectra; each provides `errors` (dict: name -> per-bin error
            lists) and `names` (list of systematic names in `errors` order)
        errden_list1, errden_list2: same structure for the two denominators
        n_bins: number of bins of the double ratio
        dropbins: optional pair of index lists mapping each double-ratio bin
            to its (numerator, denominator) source bins when binnings differ
        justfd: -99 (default) propagates all sources; True only feed-down
            sources; False only non-feed-down sources

    Returns:
        numpy array (n_bins x 4) with the quadratically combined errors
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    # Bin-count consistency is only enforced when no bin mapping is given
    if n_bins != len(list(errnum_list1.errors.values())[0]) or \
            n_bins != len(list(errden_list1.errors.values())[0]):
        if dropbins is None:
            get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \
                               n_bins, len(list(errnum_list1.errors.values())[0]), \
                               len(list(errden_list1.errors.values())[0]))
    # Systematic sources with a defined propagation rule
    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \
                "feeddown_NB", "sigmav0", "branching_ratio"]
    j = 0
    for (_, errnum1), (_, errnum2), (_, errden1), (_, errden2) in zip(errnum_list1.errors.items(), \
                                                                      errnum_list2.errors.items(), \
                                                                      errden_list1.errors.items(), \
                                                                      errden_list2.errors.items()):
        for i in range(n_bins):
            # Map the double-ratio bin onto the source bins
            inum = i
            iden = i
            if dropbins is not None:
                inum = dropbins[0][i]
                iden = dropbins[1][i]
            if errnum_list1.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s", errnum_list1.names[j])
            # NOTE(review): ordering is only cross-checked between
            # errnum_list1 and errden_list2 - confirm the other two lists are
            # guaranteed to share the same systematic ordering
            if errnum_list1.names[j] != errden_list2.names[j]:
                get_logger().fatal("Names not in same order: %s vs %s", \
                                   errnum_list1.names[j], errden_list2.names[j])
            for nb in range(len(tot_list[i])):
                if errnum_list1.names[j] == "yield" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                        errnum2[inum][nb] * errnum2[inum][nb] + \
                        errden1[iden][nb] * errden1[iden][nb] + \
                        errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[j] == "cut" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                        errnum2[inum][nb] * errnum2[inum][nb] + \
                        errden1[iden][nb] * errden1[iden][nb] + \
                        errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[j] == "pid" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "feeddown_mult_spectra" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "feeddown_mult" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "trigger" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "feeddown_NB" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "multiplicity_weights" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "track" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "ptshape" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                        errnum2[inum][nb] * errnum2[inum][nb] + \
                        errden1[iden][nb] * errden1[iden][nb] + \
                        errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[j] == "multiplicity_interval" and justfd is not True:
                    #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels
                    #We use 1/3 of systematic of numerator
                    # NOTE(review): the code uses errden1 here although the
                    # comment above mentions the numerator - confirm which
                    # spectrum the 1/3 rule is meant to apply to
                    tot_list[i][nb] += errden1[iden][nb] * errden1[iden][nb] / 9
                elif errnum_list1.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list1.names[j] == "branching_ratio" and justfd is not True:
                    #Uncorrelated, but usually not plotted in boxes, so pass
                    pass
        j = j + 1
    # Contributions were summed in quadrature; take the square root
    tot_list = np.sqrt(tot_list)
    return tot_list
def __init__(self, data_param, case):
    """
    Configure the optimizer from the configuration dictionary.

    Args:
        data_param: dictionary with dataset, directory and DNN settings
        case: name of the configuration, used for logging only
    """
    self.logger = get_logger()
    self.logger.info("DnnOptimizer::Init\nCase: %s", case)

    # Dataset config: granularity of the (phi, z, r) grid
    self.grid_phi = data_param["grid_phi"]
    self.grid_z = data_param["grid_z"]
    self.grid_r = data_param["grid_r"]

    # Input/output selection options forwarded to the data generator
    self.selopt_input = data_param["selopt_input"]
    self.selopt_output = data_param["selopt_output"]
    self.opt_train = data_param["opt_train"]
    self.opt_predout = data_param["opt_predout"]
    self.nameopt_predout = data_param["nameopt_predout"]
    # opt_train / opt_predout are 0/1 flags; their sums give the number of
    # active input and output channels
    self.dim_input = sum(self.opt_train)
    self.dim_output = sum(self.opt_predout)
    self.use_scaler = data_param["use_scaler"]

    # Directories
    self.dirmodel = data_param["dirmodel"]
    self.dirval = data_param["dirval"]
    self.diroutflattree = data_param["diroutflattree"]
    # Biased vs unbiased input can be chosen independently for the train,
    # test and apply stages
    train_dir = data_param["dirinput_bias"] if data_param["train_bias"] \
        else data_param["dirinput_nobias"]
    test_dir = data_param["dirinput_bias"] if data_param["test_bias"] \
        else data_param["dirinput_nobias"]
    apply_dir = data_param["dirinput_bias"] if data_param["apply_bias"] \
        else data_param["dirinput_nobias"]
    self.dirinput_train = "%s/SC-%d-%d-%d/" % \
        (train_dir, self.grid_z, self.grid_r, self.grid_phi)
    self.dirinput_test = "%s/SC-%d-%d-%d/" % \
        (test_dir, self.grid_z, self.grid_r, self.grid_phi)
    self.dirinput_apply = "%s/SC-%d-%d-%d/" % \
        (apply_dir, self.grid_z, self.grid_r, self.grid_phi)

    # DNN config
    self.filters = data_param["filters"]
    self.pooling = data_param["pooling"]
    self.batch_size = data_param["batch_size"]
    self.shuffle = data_param["shuffle"]
    self.depth = data_param["depth"]
    self.batch_normalization = data_param["batch_normalization"]
    self.dropout = data_param["dropout"]
    self.epochs = data_param["epochs"]
    self.lossfun = data_param["lossfun"]
    self.metrics = data_param["metrics"]
    self.adamlr = data_param["adamlr"]

    # Parameter dict handed to the data generator
    self.params = {'phi_slice': self.grid_phi,
                   'r_row': self.grid_r,
                   'z_col': self.grid_z,
                   'batch_size': self.batch_size,
                   'shuffle': self.shuffle,
                   'opt_train': self.opt_train,
                   'opt_predout': self.opt_predout,
                   'selopt_input': self.selopt_input,
                   'selopt_output': self.selopt_output,
                   'use_scaler': self.use_scaler}

    # Unique tag encoding the configuration, used for output file names.
    # NOTE(review): the "batch%d" field is filled with batch_normalization,
    # not batch_size - confirm this is intended
    self.suffix = "phi%d_r%d_z%d_filter%d_poo%d_drop%.2f_depth%d_batch%d_scaler%d" % \
        (self.grid_phi, self.grid_r, self.grid_z, self.filters, self.pooling,
         self.dropout, self.depth, self.batch_normalization, self.use_scaler)
    self.suffix = "%s_useSCMean%d_useSCFluc%d" % \
        (self.suffix, self.opt_train[0], self.opt_train[1])
    self.suffix = "%s_pred_doR%d_dophi%d_doz%d" % \
        (self.suffix, self.opt_predout[0], self.opt_predout[1], self.opt_predout[2])
    self.suffix_ds = "phi%d_r%d_z%d" % \
        (self.grid_phi, self.grid_r, self.grid_z)

    # Make sure all output directories exist
    if not os.path.isdir("plots"):
        os.makedirs("plots")
    if not os.path.isdir(self.dirmodel):
        os.makedirs(self.dirmodel)
    if not os.path.isdir(self.dirval):
        os.makedirs(self.dirval)

    self.logger.info("I am processing the configuration %s", self.suffix)
    # Only a single output distortion is supported
    if self.dim_output > 1:
        self.logger.fatal("YOU CAN PREDICT ONLY 1 DISTORSION. The sum of opt_predout == 1")
    self.logger.info("Inputs active for training: (SCMean, SCFluctuations)=(%d, %d)",
                     self.opt_train[0], self.opt_train[1])

    self.maxrandomfiles = data_param["maxrandomfiles"]
    self.range_mean_index = data_param["range_mean_index"]
    # Filled later during dataset preparation
    self.indices_events_means_train = None
    self.partition = None
    self.total_events = 0

    # ROOT global setup: plain style, no GUI windows
    gROOT.SetStyle("Plain")
    gROOT.SetBatch()
def calc_systematic_multovermb(errnum_list, errden_list, n_bins, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Ds(mult) / Ds(MB).
    Check if applicable to your situation

    Args:
        errnum_list: systematic errors of the numerator; provides `errors`
            (dict: systematic name -> per-bin error lists) and `names` (list
            of the systematic names in the same order as `errors`)
        errden_list: same structure for the denominator
        n_bins: number of analysis bins
        justfd: -99 (default) propagates all sources; True propagates only
            feed-down related sources; False only non-feed-down sources

    Returns:
        numpy array (n_bins x 4) with the quadratically combined errors
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    # Both inputs must provide one error entry per analysis bin
    if n_bins != len(list(errnum_list.errors.values())[0]) or \
            n_bins != len(list(errden_list.errors.values())[0]):
        get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i",
                           n_bins, len(list(errnum_list.errors.values())[0]),
                           len(list(errden_list.errors.values())[0]))
    # Systematic sources with a defined propagation rule
    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger",
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape",
                "feeddown_NB", "sigmav0", "branching_ratio"]
    j = 0
    for (_, errnum), (_, errden) in zip(errnum_list.errors.items(), errden_list.errors.items()):
        for i in range(n_bins):
            if errnum_list.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s", errnum_list.names[j])
            if errnum_list.names[j] != errden_list.names[j]:
                # BUG FIX: previously accessed errnum.names / errden.names
                # (the per-source error lists, which have no `names`
                # attribute), raising AttributeError instead of reporting
                # the mismatch.
                get_logger().fatal("Names not in same order: %s vs %s",
                                   errnum_list.names[j], errden_list.names[j])
            for nb in range(len(tot_list[i])):
                if errnum_list.names[j] == "yield" and justfd is not True:
                    #Partially correlated, take largest
                    tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \
                        * max(errnum[i][nb], errden[i][nb])
                elif errnum_list.names[j] == "cut" and justfd is not True:
                    #Partially correlated, take largest
                    tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \
                        * max(errnum[i][nb], errden[i][nb])
                elif errnum_list.names[j] == "pid" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "feeddown_mult" and justfd is not False:
                    #Assign directly from multiplicity case, no syst for MB
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[j] == "feeddown_mult_spectra" and justfd is not False:
                    #Ratio here, skip spectra syst
                    pass
                elif errnum_list.names[j] == "trigger" and justfd is not True:
                    #Assign directly from multiplicity case, no syst for MB
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[j] == "multiplicity_interval" and justfd is not True:
                    #FD: estimated using 7TeV strategy directly for ratio
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[j] == "multiplicity_weights" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + \
                        errden[i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "track" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "ptshape" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "feeddown_NB" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list.names[j] == "branching_ratio" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
        j = j + 1
    # Contributions were summed in quadrature; take the square root
    tot_list = np.sqrt(tot_list)
    return tot_list
def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """
    Main plotting function

    Reads the analysis database (YAML) and a ROOT results file and produces
    the final jet-shape comparison figures (data vs. PYTHIA, HF-tagged vs.
    inclusive, quark vs. gluon) as PDF files next to the input file.
    """
    gROOT.SetBatch(True)

    # pylint: disable=unused-variable

    # Command-line interface: analysis database, analysis type, input ROOT file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--database-analysis", "-d", dest="database_analysis",
                        help="analysis database to be used", required=True)
    parser.add_argument("--analysis", "-a", dest="type_ana",
                        help="choose type of analysis", required=True)
    parser.add_argument("--input", "-i", dest="input_file",
                        help="results input file", required=True)
    args = parser.parse_args()

    typean = args.type_ana
    # Shape name is whatever follows the "jet_" prefix of the analysis type.
    shape = typean[len("jet_"):]
    print("Shape:", shape)

    file_in = args.input_file
    with open(args.database_analysis, "r") as file_db:
        data_param = yaml.safe_load(file_db)
    case = list(data_param.keys())[0]
    datap = data_param[case]

    logger = get_logger()

    # Output PDFs are written into the directory of the input file.
    i_cut = file_in.rfind("/")
    rootpath = file_in[:i_cut]

    # plotting
    # LaTeX string
    p_latexnhadron = datap["analysis"][typean]["latexnamehadron"]
    p_latexbin2var = datap["analysis"][typean]["latexbin2var"]
    v_varshape_latex = datap["analysis"][typean]["var_shape_latex"]

    # first variable (hadron pt)
    lpt_finbinmin = datap["analysis"][typean]["sel_an_binmin"]
    lpt_finbinmax = datap["analysis"][typean]["sel_an_binmax"]
    var1ranges = lpt_finbinmin.copy()
    var1ranges.append(lpt_finbinmax[-1])

    # second variable (jet pt)
    v_var2_binning = datap["analysis"][typean]["var_binning2"] # name
    lvar2_binmin_reco = datap["analysis"][typean].get("sel_binmin2_reco", None)
    lvar2_binmax_reco = datap["analysis"][typean].get("sel_binmax2_reco", None)
    p_nbin2_reco = len(lvar2_binmin_reco) # number of reco bins
    lvar2_binmin_gen = datap["analysis"][typean].get("sel_binmin2_gen", None)
    lvar2_binmax_gen = datap["analysis"][typean].get("sel_binmax2_gen", None)
    p_nbin2_gen = len(lvar2_binmin_gen) # number of gen bins
    var2ranges_reco = lvar2_binmin_reco.copy()
    var2ranges_reco.append(lvar2_binmax_reco[-1])
    var2binarray_reco = array(
        "d", var2ranges_reco) # array of bin edges to use in histogram constructors
    var2ranges_gen = lvar2_binmin_gen.copy()
    var2ranges_gen.append(lvar2_binmax_gen[-1])
    var2binarray_gen = array(
        "d", var2ranges_gen) # array of bin edges to use in histogram constructors

    # observable (z, shape,...)
    v_varshape_binning = datap["analysis"][typean][
        "var_binningshape"] # name (reco)
    v_varshape_binning_gen = datap["analysis"][typean][
        "var_binningshape_gen"] # name (gen)
    lvarshape_binmin_reco = \
        datap["analysis"][typean].get("sel_binminshape_reco", None)
    lvarshape_binmax_reco = \
        datap["analysis"][typean].get("sel_binmaxshape_reco", None)
    p_nbinshape_reco = len(lvarshape_binmin_reco) # number of reco bins
    lvarshape_binmin_gen = \
        datap["analysis"][typean].get("sel_binminshape_gen", None)
    lvarshape_binmax_gen = \
        datap["analysis"][typean].get("sel_binmaxshape_gen", None)
    p_nbinshape_gen = len(lvarshape_binmin_gen) # number of gen bins
    varshaperanges_reco = lvarshape_binmin_reco.copy()
    varshaperanges_reco.append(lvarshape_binmax_reco[-1])
    varshapebinarray_reco = array(
        "d", varshaperanges_reco
    ) # array of bin edges to use in histogram constructors
    varshaperanges_gen = lvarshape_binmin_gen.copy()
    varshaperanges_gen.append(lvarshape_binmax_gen[-1])
    varshapebinarray_gen = array(
        "d", varshaperanges_gen
    ) # array of bin edges to use in histogram constructors

    file_results = TFile.Open(file_in)
    if not file_results:
        logger.fatal(make_message_notfound(file_in))

    # NOTE(review): only this one jet-pt bin is plotted — confirm intended.
    ibin2 = 1

    suffix = "%s_%g_%g" % (v_var2_binning, lvar2_binmin_gen[ibin2],
                           lvar2_binmax_gen[ibin2])

    # Load all input histograms/graphs; any missing object is fatal.
    # HF data
    nameobj = "%s_hf_data_%d_stat" % (shape, ibin2)
    hf_data_stat = file_results.Get(nameobj)
    if not hf_data_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_hf_data_%d_syst" % (shape, ibin2)
    hf_data_syst = file_results.Get(nameobj)
    if not hf_data_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # HF PYTHIA
    nameobj = "%s_hf_pythia_%d_stat" % (shape, ibin2)
    hf_pythia_stat = file_results.Get(nameobj)
    if not hf_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # HF ratio
    nameobj = "%s_hf_ratio_%d_stat" % (shape, ibin2)
    hf_ratio_stat = file_results.Get(nameobj)
    if not hf_ratio_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_hf_ratio_%d_syst" % (shape, ibin2)
    hf_ratio_syst = file_results.Get(nameobj)
    if not hf_ratio_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # inclusive data
    nameobj = "%s_incl_data_%d_stat" % (shape, ibin2)
    incl_data_stat = file_results.Get(nameobj)
    if not incl_data_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_data_%d_syst" % (shape, ibin2)
    incl_data_syst = file_results.Get(nameobj)
    if not incl_data_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # inclusive PYTHIA
    nameobj = "%s_incl_pythia_%d_stat" % (shape, ibin2)
    incl_pythia_stat = file_results.Get(nameobj)
    if not incl_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_pythia_%d_syst" % (shape, ibin2)
    incl_pythia_syst = file_results.Get(nameobj)
    if not incl_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # inclusive ratio
    nameobj = "%s_incl_ratio_%d_stat" % (shape, ibin2)
    incl_ratio_stat = file_results.Get(nameobj)
    if not incl_ratio_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_ratio_%d_syst" % (shape, ibin2)
    incl_ratio_syst = file_results.Get(nameobj)
    if not incl_ratio_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # quark PYTHIA
    nameobj = "%s_quark_pythia_%d_stat" % (shape, ibin2)
    quark_pythia_stat = file_results.Get(nameobj)
    if not quark_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_quark_pythia_%d_syst" % (shape, ibin2)
    quark_pythia_syst = file_results.Get(nameobj)
    if not quark_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))
    # gluon PYTHIA
    nameobj = "%s_gluon_pythia_%d_stat" % (shape, ibin2)
    gluon_pythia_stat = file_results.Get(nameobj)
    if not gluon_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_gluon_pythia_%d_syst" % (shape, ibin2)
    gluon_pythia_syst = file_results.Get(nameobj)
    if not gluon_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # plot the results with systematic uncertainties and models
    size_can = [800, 800]
    offsets_axes = [0.8, 1.1]
    margins_can = [0.1, 0.13, 0.1, 0.03]
    size_thg = 0.05
    offset_thg = 0.85

    gStyle.SetErrorX(0) # do not plot horizontal error bars of histograms
    fontsize = 0.035
    opt_leg_g = "FP"
    opt_plot_g = "2"

    list_new = [] # list to avoid loosing objects created in loops

    # labels
    x_latex = 0.16
    y_latex_top = 0.83
    y_step = 0.055

    title_x = v_varshape_latex
    title_y = "(1/#it{N}_{jet}) d#it{N}/d%s" % v_varshape_latex
    title_full = ";%s;%s" % (title_x, title_y)
    title_full_ratio = ";%s;data/MC: ratio of %s" % (title_x, title_y)
    text_alice = "#bf{ALICE} Preliminary, pp, #sqrt{#it{s}} = 13 TeV"
    text_alice_sim = "#bf{ALICE} Simulation, pp, #sqrt{#it{s}} = 13 TeV"
    text_pythia = "PYTHIA 8 (Monash)"
    text_pythia_split = "#splitline{PYTHIA 8}{(Monash)}"
    text_jets = "charged jets, anti-#it{k}_{T}, #it{R} = 0.4"
    text_ptjet = "%g #leq %s < %g GeV/#it{c}, #left|#it{#eta}_{jet}#right| #leq 0.5" % (
        lvar2_binmin_reco[ibin2], p_latexbin2var, lvar2_binmax_reco[ibin2])
    text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, #left|#it{y}_{%s}#right| #leq 0.8" % (
        lpt_finbinmin[0], p_latexnhadron,
        min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]), p_latexnhadron)
    text_ptcut = "#it{p}_{T, incl. ch. jet}^{leading track} #geq 5.33 GeV/#it{c}"
    text_ptcut_sim = "#it{p}_{T, incl. ch. jet}^{leading h^{#pm}} #geq 5.33 GeV/#it{c} (varied)"
    text_sd = "Soft Drop (#it{z}_{cut} = 0.1, #it{#beta} = 0)"
    title_thetag = "#it{#theta}_{g} = #it{R}_{g}/#it{R}"
    radius_jet = 0.4

    # colour and marker indeces
    c_hf_data = 0
    c_incl_data = 1
    c_hf_mc = 2
    c_incl_mc = 6
    c_quark_mc = 5
    c_gluon_mc = 0

    # markers
    m_hf_data = get_marker(0)
    m_incl_data = get_marker(1)
    m_hf_mc = get_marker(0, 2)
    m_incl_mc = get_marker(1, 2)
    m_quark_mc = get_marker(2)
    m_gluon_mc = get_marker(3)

    # make the horizontal error bars smaller
    if shape == "nsd":
        for gr in [
                hf_data_syst, incl_data_syst, hf_ratio_syst, incl_ratio_syst,
                incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst
        ]:
            for i in range(gr.GetN()):
                gr.SetPointEXlow(i, 0.1)
                gr.SetPointEXhigh(i, 0.1)

    # data, HF and inclusive
    hf_data_syst_cl = hf_data_syst.Clone()
    leg_pos = [.72, .75, .85, .85]
    list_obj = [hf_data_syst, incl_data_syst, hf_data_stat, incl_data_stat]
    labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive", "", ""]
    colours = [
        get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data,
                                          c_incl_data), (2, 2, 1, 1))
    ]
    markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_data, list_obj_data_new = make_plot("cshape_data_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)):
        gr.SetMarkerColor(get_colour(c))
    list_obj_data_new[0].SetTextSize(fontsize)
    if shape == "nsd":
        hf_data_syst.GetXaxis().SetNdivisions(5)

    # Draw a line through the points.
    if shape == "nsd":
        for h in (hf_data_stat, incl_data_stat):
            h_line = h.Clone(h.GetName() + "_line")
            h_line.SetLineStyle(2)
            h_line.Draw("l hist same")
            list_new.append(h_line)
    cshape_data.Update()

    if shape == "rg":
        # plot the theta_g axis
        gr_frame = hf_data_syst
        axis_rg = gr_frame.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data.SetTickx(0)
        axis_thetag.Draw("same")

    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data = []
    for text_latex in [
            text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data.Update()
    cshape_data.SaveAs("%s/%s_data_%s.pdf" % (rootpath, shape, suffix))

    # data and PYTHIA, HF
    leg_pos = [.72, .65, .85, .85]
    list_obj = [hf_data_syst_cl, hf_data_stat, hf_pythia_stat]
    labels_obj = ["data", "", text_pythia_split]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_hf_data, c_hf_data, c_hf_mc), (2, 1, 1))
    ]
    markers = [m_hf_data, m_hf_data, m_hf_mc]
    y_margin_up = 0.4
    y_margin_down = 0.05
    cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip([hf_data_syst_cl], [c_hf_data]):
        gr.SetMarkerColor(get_colour(c))
    leg_data_mc_hf = list_obj_data_mc_hf_new[0]
    leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron)
    leg_data_mc_hf.SetTextSize(fontsize)
    if shape == "nsd":
        hf_data_syst_cl.GetXaxis().SetNdivisions(5)
        #axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis()
        #x1 = axis_nsd.GetBinLowEdge(1)
        #x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins())
        #axis_nsd.Set(5, x1, x2)
        #for ibin in range(axis_nsd.GetNbins()):
        #    axis_nsd.SetBinLabel(ibin + 1, "%d" % ibin)
        #axis_nsd.SetNdivisions(5)
    cshape_data_mc_hf.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_data_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data_mc_hf.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data_mc_hf.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data_mc_hf = []
    for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data_mc_hf.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data_mc_hf.Update()
    cshape_data_mc_hf.SaveAs("%s/%s_data_mc_hf_%s.pdf" %
                             (rootpath, shape, suffix))

    # data and PYTHIA, inclusive
    #leg_pos = [.68, .65, .85, .85]
    list_obj = [
        incl_data_syst, incl_pythia_syst, incl_data_stat, incl_pythia_stat
    ]
    labels_obj = ["data", text_pythia_split]
    colours = [
        get_colour(i, j) for i, j in zip((c_incl_data, c_incl_mc, c_incl_data,
                                          c_incl_mc), (2, 2, 1, 1))
    ]
    markers = [m_incl_data, m_incl_mc, m_incl_data, m_incl_mc]
    y_margin_up = 0.4
    y_margin_down = 0.05
    cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot("cshape_data_mc_incl_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip([incl_data_syst, incl_pythia_syst],
                     [c_incl_data, c_incl_mc]):
        gr.SetMarkerColor(get_colour(c))
    leg_data_mc_incl = list_obj_data_mc_incl_new[0]
    leg_data_mc_incl.SetHeader("inclusive")
    leg_data_mc_incl.SetTextSize(fontsize)
    if shape == "nsd":
        incl_data_syst.GetXaxis().SetNdivisions(5)
    cshape_data_mc_incl.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = incl_data_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data_mc_incl.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data_mc_incl.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data_mc_incl = []
    for text_latex in [text_alice, text_jets, text_ptjet, text_ptcut, text_sd]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data_mc_incl.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data_mc_incl.Update()
    cshape_data_mc_incl.SaveAs("%s/%s_data_mc_incl_%s.pdf" %
                               (rootpath, shape, suffix))

    # Ratios data/MC, HF and inclusive
    line_1 = TLine(lvarshape_binmin_reco[0], 1, lvarshape_binmax_reco[-1], 1)
    line_1.SetLineStyle(9)
    line_1.SetLineColor(1)
    line_1.SetLineWidth(3)
    #leg_pos = [.72, .7, .85, .85] # with header
    leg_pos = [.72, .75, .85, .85] # without header
    list_obj = [
        hf_ratio_syst, line_1, incl_ratio_syst, hf_ratio_stat, incl_ratio_stat
    ]
    labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive"]
    colours = [
        get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data,
                                          c_incl_data), (2, 2, 1, 1))
    ]
    markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data]
    y_margin_up = 0.52
    y_margin_down = 0.05
    if shape == "nsd":
        y_margin_up = 0.22
    cshape_ratio, list_obj_ratio_new = make_plot("cshape_ratio_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full_ratio)
    cshape_ratio.Update()
    for gr, c in zip((hf_ratio_syst, incl_ratio_syst),
                     (c_hf_data, c_incl_data)):
        gr.SetMarkerColor(get_colour(c))
    leg_ratio = list_obj_ratio_new[0]
    leg_ratio.SetTextSize(fontsize)
    #leg_ratio.SetHeader("data/MC")
    if shape == "nsd":
        hf_ratio_syst.GetXaxis().SetNdivisions(5)
    cshape_ratio.Update()
    if shape == "rg":
        # plot the theta_g axis
        gr_frame = hf_ratio_syst
        axis_rg = gr_frame.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_ratio.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_ratio.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_ratio = []
    for text_latex in [
            text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd,
            text_pythia
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_ratio.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_ratio.Update()
    cshape_ratio.SaveAs("%s/%s_ratio_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, inclusive, quark, gluon
    incl_pythia_syst_cl = incl_pythia_syst.Clone()
    y_min_h, y_max_h = get_y_window_his([
        hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat
    ])
    y_min_g, y_max_g = get_y_window_gr(
        [incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst])
    y_min = min(y_min_h, y_min_g)
    y_max = max(y_max_h, y_max_g)
    y_margin_up = 0.46
    y_margin_down = 0.05
    y_min_plot, y_max_plot = get_plot_range(y_min, y_max, y_margin_down,
                                            y_margin_up)
    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .55, .85, .85]
    list_obj = [
        incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat,
        incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["inclusive", "quark", "gluon", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_mc, c_quark_mc, c_gluon_mc, c_hf_mc, c_incl_mc,
                         c_quark_mc, c_gluon_mc), (2, 2, 2, 1, 1, 1, 1))
    ]
    markers = [
        m_incl_mc, m_quark_mc, m_gluon_mc, m_hf_mc, m_incl_mc, m_quark_mc,
        m_gluon_mc
    ]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)
    cshape_mc.Update()
    for gr, c in zip((incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst),
                     (c_incl_mc, c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        incl_pythia_syst.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, quark, gluon
    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .61, .85, .85]
    list_obj = [
        quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat,
        quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["quark", "gluon", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_quark_mc, c_gluon_mc, c_hf_mc, c_quark_mc,
                         c_gluon_mc), (2, 2, 1, 1, 1))
    ]
    markers = [m_quark_mc, m_gluon_mc, m_hf_mc, m_quark_mc, m_gluon_mc]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_qgd_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)
    cshape_mc.Update()
    for gr, c in zip((quark_pythia_syst, gluon_pythia_syst),
                     (c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        quark_pythia_syst.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_qgd_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, inclusive
    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .67, .85, .85]
    list_obj = [incl_pythia_syst_cl, incl_pythia_stat, hf_pythia_stat]
    labels_obj = ["inclusive", "", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_mc, c_incl_mc, c_hf_mc), (2, 1, 1))
    ]
    markers = [m_incl_mc, m_incl_mc, m_hf_mc]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_id_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)

    # Draw a line through the points.
    if shape == "nsd":
        for h in (incl_pythia_stat, hf_pythia_stat):
            h_line = h.Clone(h.GetName() + "_line")
            h_line.SetLineStyle(2)
            h_line.Draw("l hist same")
            list_new.append(h_line)
    cshape_mc.Update()
    incl_pythia_syst_cl.SetMarkerColor(get_colour(c_incl_mc))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        incl_pythia_syst_cl.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_id_%s.pdf" % (rootpath, shape, suffix))

    # data inclusive vs PYTHIA, quark, gluon
    #leg_pos = [.6, .65, .75, .85]
    #leg_pos = [.72, .55, .85, .85]
    leg_pos = [.6, .7, .85, .85]
    list_obj = [
        incl_data_syst, quark_pythia_syst, gluon_pythia_syst, incl_data_stat,
        quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["inclusive (data)", "quark (PYTHIA 8)", "gluon (PYTHIA 8)"]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_data, c_quark_mc, c_gluon_mc, c_incl_data,
                         c_quark_mc, c_gluon_mc), (2, 2, 2, 1, 1, 1))
    ]
    markers = [
        m_incl_data, m_quark_mc, m_gluon_mc, m_incl_data, m_quark_mc,
        m_gluon_mc
    ]
    y_margin_up = 0.3
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_data_iqg" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip((incl_data_syst, quark_pythia_syst, gluon_pythia_syst),
                     (c_incl_data, c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_data_i_mc_qg_%s.pdf" % (rootpath, shape, suffix))
def reset_input(self, dataframe, tag):
    """Replace the collection's source dataframe and its identifying tag."""
    self.collection_tag = tag
    self.source_dataframe = dataframe
    # Report the reset only when verbosity was requested.
    if not self.verbose:
        return
    get_logger().info("Resetting ValidationCollection with tag '%s'", self.collection_tag)
def do_entire_analysis(data_config: dict, data_param: dict, data_param_overwrite: dict, # pylint: disable=too-many-locals, too-many-statements, too-many-branches data_model: dict, run_param: dict, clean: bool): # Disable any graphical stuff. No TCanvases opened and shown by default gROOT.SetBatch(True) logger = get_logger() logger.info("Do analysis chain") # If we are here we are interested in the very first key in the parameters database case = list(data_param.keys())[0] # Update database accordingly if needed update_config(data_param, data_config, data_param_overwrite) dodownloadalice = data_config["download"]["alice"]["activate"] doconversionmc = data_config["conversion"]["mc"]["activate"] doconversiondata = data_config["conversion"]["data"]["activate"] domergingmc = data_config["merging"]["mc"]["activate"] domergingdata = data_config["merging"]["data"]["activate"] doskimmingmc = data_config["skimming"]["mc"]["activate"] doskimmingdata = data_config["skimming"]["data"]["activate"] domergingperiodsmc = data_config["mergingperiods"]["mc"]["activate"] domergingperiodsdata = data_config["mergingperiods"]["data"]["activate"] doml = data_config["ml_study"]["activate"] docorrelation = data_config["ml_study"]['docorrelation'] dotraining = data_config["ml_study"]['dotraining'] dotesting = data_config["ml_study"]['dotesting'] doapplytodatamc = data_config["ml_study"]['doapplytodatamc'] docrossvalidation = data_config["ml_study"]['docrossvalidation'] dolearningcurve = data_config["ml_study"]['dolearningcurve'] doroc = data_config["ml_study"]['doroc'] doroctraintest = data_config["ml_study"]['doroctraintest'] doboundary = data_config["ml_study"]['doboundary'] doimportance = data_config["ml_study"]['doimportance'] doimportanceshap = data_config["ml_study"]['doimportanceshap'] dogridsearch = data_config["ml_study"]['dogridsearch'] dobayesianopt = data_config["ml_study"]['dobayesianopt'] doefficiencyml = data_config["ml_study"]['doefficiency'] dosignifopt = 
data_config["ml_study"]['dosignifopt'] doscancuts = data_config["ml_study"]["doscancuts"] doplotdistr = data_config["ml_study"]["doplotdistr"] doapplydata = data_config["mlapplication"]["data"]["doapply"] doapplymc = data_config["mlapplication"]["mc"]["doapply"] domergeapplydata = data_config["mlapplication"]["data"]["domergeapply"] domergeapplymc = data_config["mlapplication"]["mc"]["domergeapply"] docontinueapplydata = data_config["mlapplication"]["data"]["docontinueafterstop"] docontinueapplymc = data_config["mlapplication"]["mc"]["docontinueafterstop"] dohistomassmc = data_config["analysis"]["mc"]["histomass"] dohistomassdata = data_config["analysis"]["data"]["histomass"] doefficiency = data_config["analysis"]["mc"]["efficiency"] doresponse = data_config["analysis"]["mc"]["response"] dofeeddown = data_config["analysis"]["mc"]["feeddown"] dounfolding = data_config["analysis"]["mc"]["dounfolding"] dojetsystematics = data_config["analysis"]["data"]["dojetsystematics"] dofit = data_config["analysis"]["dofit"] doeff = data_config["analysis"]["doeff"] docross = data_config["analysis"]["docross"] doplotsval = data_config["analysis"]["doplotsval"] doplots = data_config["analysis"]["doplots"] dosyst = data_config["analysis"]["dosyst"] dosystprob = data_config["systematics"]["cutvar"]["activate"] do_syst_prob_mass = data_config["systematics"]["cutvar"]["probvariationmass"] do_syst_prob_eff = data_config["systematics"]["cutvar"]["probvariationeff"] do_syst_prob_fit = data_config["systematics"]["cutvar"]["probvariationfit"] do_syst_prob_cross = data_config["systematics"]["cutvar"]["probvariationcross"] dosystptshape = data_config["systematics"]["mcptshape"]["activate"] doanaperperiod = data_config["analysis"]["doperperiod"] typean = data_config["analysis"]["type"] dojetstudies = data_config["analysis"]["dojetstudies"] dirpklmc = data_param[case]["multi"]["mc"]["pkl"] dirpklevtcounter_allmc = data_param[case]["multi"]["mc"]["pkl_evtcounter_all"] dirpklskmc = 
data_param[case]["multi"]["mc"]["pkl_skimmed"] dirpklmlmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml"] dirpklmltotmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"] dirpkldata = data_param[case]["multi"]["data"]["pkl"] dirpklevtcounter_alldata = data_param[case]["multi"]["data"]["pkl_evtcounter_all"] dirpklskdata = data_param[case]["multi"]["data"]["pkl_skimmed"] dirpklmldata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml"] dirpklmltotdata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml_all"] dirpklskdecmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_dec"] dirpklskdec_mergedmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_decmerged"] dirpklskdecdata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_dec"] dirpklskdec_mergeddata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_decmerged"] dirresultsdata = data_param[case]["analysis"][typean]["data"]["results"] dirresultsmc = data_param[case]["analysis"][typean]["mc"]["results"] dirresultsdatatot = data_param[case]["analysis"][typean]["data"]["resultsallp"] dirresultsmctot = data_param[case]["analysis"][typean]["mc"]["resultsallp"] binminarray = data_param[case]["ml"]["binmin"] binmaxarray = data_param[case]["ml"]["binmax"] raahp = data_param[case]["ml"]["opt"]["raahp"] mltype = data_param[case]["ml"]["mltype"] training_vars = data_param[case]["variables"]["var_training"] mlout = data_param[case]["ml"]["mlout"] mlplot = data_param[case]["ml"]["mlplot"] proc_type = data_param[case]["analysis"][typean]["proc_type"] #creating folder if not present counter = 0 if doconversionmc is True: counter = counter + checkdirlist(dirpklmc) if doconversiondata is True: counter = counter + checkdirlist(dirpkldata) if doskimmingmc is True: checkdirlist(dirpklskmc) counter = counter + checkdir(dirpklevtcounter_allmc) if doskimmingdata is True: counter = counter + checkdirlist(dirpklskdata) counter = counter + 
checkdir(dirpklevtcounter_alldata) if domergingmc is True: counter = counter + checkdirlist(dirpklmlmc) if domergingdata is True: counter = counter + checkdirlist(dirpklmldata) if domergingperiodsmc is True: counter = counter + checkdir(dirpklmltotmc) if domergingperiodsdata is True: counter = counter + checkdir(dirpklmltotdata) if doml is True: counter = counter + checkdir(mlout) counter = counter + checkdir(mlplot) if docontinueapplymc is False: if doapplymc is True: counter = counter + checkdirlist(dirpklskdecmc) if domergeapplymc is True: counter = counter + checkdirlist(dirpklskdec_mergedmc) if docontinueapplydata is False: if doapplydata is True: counter = counter + checkdirlist(dirpklskdecdata) if domergeapplydata is True: counter = counter + checkdirlist(dirpklskdec_mergeddata) if dohistomassmc is True: counter = counter + checkdirlist(dirresultsmc) counter = counter + checkdir(dirresultsmctot) if dohistomassdata is True: counter = counter + checkdirlist(dirresultsdata) counter = counter + checkdir(dirresultsdatatot) if counter < 0: sys.exit() # check and create directories if doconversionmc is True: checkmakedirlist(dirpklmc) if doconversiondata is True: checkmakedirlist(dirpkldata) if doskimmingmc is True: checkmakedirlist(dirpklskmc) checkmakedir(dirpklevtcounter_allmc) if doskimmingdata is True: checkmakedirlist(dirpklskdata) checkmakedir(dirpklevtcounter_alldata) if domergingmc is True: checkmakedirlist(dirpklmlmc) if domergingdata is True: checkmakedirlist(dirpklmldata) if domergingperiodsmc is True: checkmakedir(dirpklmltotmc) if domergingperiodsdata is True: checkmakedir(dirpklmltotdata) if doml is True: checkmakedir(mlout) checkmakedir(mlplot) if docontinueapplymc is False: if doapplymc is True: checkmakedirlist(dirpklskdecmc) if domergeapplymc is True: checkmakedirlist(dirpklskdec_mergedmc) if docontinueapplydata is False: if doapplydata is True: checkmakedirlist(dirpklskdecdata) if domergeapplydata is True: 
checkmakedirlist(dirpklskdec_mergeddata) if dohistomassmc is True: checkmakedirlist(dirresultsmc) checkmakedir(dirresultsmctot) if dohistomassdata is True: checkmakedirlist(dirresultsdata) checkmakedir(dirresultsdatatot) proc_class = Processer ana_class = Analyzer syst_class = Systematics if proc_type == "Dhadrons": print("Using new feature for Dhadrons") proc_class = ProcesserDhadrons ana_class = AnalyzerDhadrons if proc_type == "Dhadrons_mult": print("Using new feature for Dhadrons_mult") proc_class = ProcesserDhadrons_mult ana_class = AnalyzerDhadrons_mult if proc_type == "Dhadrons_jet": print("Using new feature for Dhadrons_jet") proc_class = ProcesserDhadrons_jet ana_class = AnalyzerJet mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc") mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param,\ "data") ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod) # Has to be done always period-by-period syst_mgr = AnalyzerManager(syst_class, data_param[case], case, typean, True, run_param) #perform the analysis flow if dodownloadalice == 1: subprocess.call("../cplusutilities/Download.sh") if doconversionmc == 1: mymultiprocessmc.multi_unpack_allperiods() if doconversiondata == 1: mymultiprocessdata.multi_unpack_allperiods() if doskimmingmc == 1: mymultiprocessmc.multi_skim_allperiods() if doskimmingdata == 1: mymultiprocessdata.multi_skim_allperiods() if domergingmc == 1: mymultiprocessmc.multi_mergeml_allperiods() if domergingdata == 1: mymultiprocessdata.multi_mergeml_allperiods() if domergingperiodsmc == 1: mymultiprocessmc.multi_mergeml_allinone() if domergingperiodsdata == 1: mymultiprocessdata.multi_mergeml_allinone() if doml is True: index = 0 for binmin, binmax in zip(binminarray, binmaxarray): myopt = Optimiser(data_param[case], case, typean, data_model[mltype], binmin, binmax, raahp[index], training_vars[index]) if docorrelation is True: 
myopt.do_corr() if dotraining is True: myopt.do_train() if dotesting is True: myopt.do_test() if doapplytodatamc is True: myopt.do_apply() if docrossvalidation is True: myopt.do_crossval() if dolearningcurve is True: myopt.do_learningcurve() if doroc is True: myopt.do_roc() if doroctraintest is True: myopt.do_roc_train_test() if doplotdistr is True: myopt.do_plot_model_pred() if doimportance is True: myopt.do_importance() if doimportanceshap is True: myopt.do_importance_shap() if dogridsearch is True: myopt.do_grid() if dobayesianopt is True: myopt.do_bayesian_opt() if doboundary is True: myopt.do_boundary() if doefficiencyml is True: myopt.do_efficiency() if dosignifopt is True: myopt.do_significance() if doscancuts is True: myopt.do_scancuts() index = index + 1 if doapplydata is True: mymultiprocessdata.multi_apply_allperiods() if doapplymc is True: mymultiprocessmc.multi_apply_allperiods() if domergeapplydata is True: mymultiprocessdata.multi_mergeapply_allperiods() if domergeapplymc is True: mymultiprocessmc.multi_mergeapply_allperiods() if dohistomassmc is True: mymultiprocessmc.multi_histomass() if dohistomassdata is True: # After-burner in case of a mult analysis to obtain "correctionsweight.root" # for merged-period data # pylint: disable=fixme # FIXME Can only be run here because result directories are constructed when histomass # is run. If this step was independent, histomass would always complain that the # result directory already exists. 
mymultiprocessdata.multi_histomass() if doefficiency is True: mymultiprocessmc.multi_efficiency() if doresponse is True: mymultiprocessmc.multi_response() # Collect all desired analysis steps analyze_steps = [] if dofit is True: analyze_steps.append("fit") if dosyst is True: analyze_steps.append("yield_syst") if doeff is True: analyze_steps.append("efficiency") if dojetstudies is True: if dofit is False: analyze_steps.append("fit") if doeff is False: analyze_steps.append("efficiency") analyze_steps.append("sideband_sub") if dofeeddown is True: analyze_steps.append("feeddown") if dounfolding is True: analyze_steps.append("unfolding") analyze_steps.append("unfolding_closure") if dojetsystematics is True: analyze_steps.append("jetsystematics") if docross is True: analyze_steps.append("makenormyields") if doplots is True: analyze_steps.append("plotternormyields") if doplotsval is True: analyze_steps.append("plottervalidation") # Now do the analysis ana_mgr.analyze(*analyze_steps) ml_syst_steps = [] if dosystprob is True: if do_syst_prob_mass: ml_syst_steps.append("ml_cutvar_mass") if do_syst_prob_eff: ml_syst_steps.append("ml_cutvar_eff") if do_syst_prob_fit: ml_syst_steps.append("ml_cutvar_fit") if do_syst_prob_cross: ml_syst_steps.append("ml_cutvar_cross") if dosystptshape is True: ml_syst_steps.append("mcptshape") syst_mgr.analyze(*ml_syst_steps) # Delete per-period results. if clean: print("Cleaning") if doanaperperiod: print("Per-period analysis enabled. Skipping.") else: if not delete_dirlist(dirresultsmc + dirresultsdata): print("Error: Failed to complete cleaning.") print("Done")
def efficiency_cutscan(
        dataframe_, mylistvariables_, modelname_, threshold,  # pylint: disable=too-many-statements
        output_, suffix_, plot_options_=None):
    """
    Plot the selection efficiency of each analysis variable as a function of
    the cut value, for candidates passing the ML probability threshold.

    Args:
        dataframe_: pandas dataframe holding candidates and a
            'y_test_prob<modelname_>' probability column
        mylistvariables_: list of tuples (variable, cut direction[, central value]);
            direction is one of "lt", "st", "abslt", "absst" and the central
            value is required for the "abs*" options
        modelname_: ML model name used to build the probability column name
        threshold: ML probability threshold applied before the scan
        output_: directory where the plot is saved
        suffix_: suffix appended to the output file name
        plot_options_: optional dict; per-variable options (e.g. "xlim") are
            read from its "eff_cut_scan" entry
    """
    plot_type_name = "eff_cut_scan"
    plot_options = {}
    if isinstance(plot_options_, dict):
        plot_options = plot_options_.get(plot_type_name, {})
    selml = "y_test_prob%s>%s" % (modelname_, threshold)
    dataframe_ = dataframe_.query(selml)
    fig = plt.figure(figsize=(60, 25))
    gs = GridSpec(3, int(len(mylistvariables_) / 3 + 1))
    axes = [fig.add_subplot(gs[i]) for i in range(len(mylistvariables_))]

    # Available cut options
    cut_options = ["lt", "st", "abslt", "absst"]

    for i, var_tuple in enumerate(mylistvariables_):
        var = var_tuple[0]
        vardir = var_tuple[1]
        axes[i].set_xlabel(var, fontsize=30)
        axes[i].set_ylabel("entries (normalised)", fontsize=30)
        axes[i].tick_params(labelsize=20)
        axes[i].set_yscale('log')
        axes[i].set_ylim(0.1, 1.5)
        values = dataframe_[var].values
        if "abs" in vardir:
            # Absolute cuts need a central value as third tuple element
            cen = var_tuple[2] if len(var_tuple) > 2 else None
            if cen is None:
                get_logger().error("Absolute cut chosen for %s. " \
                        "However, no central value provided", var)
                continue
            # Vectorized |v - cen| (replaces per-element Python loop)
            values = np.abs(values - cen)

        nbinscan = 100
        # Scan range defaults to the data range; a per-variable "xlim"
        # plot option overrides it (redundant else-branch recomputation removed)
        minv, maxv = values.min(), values.max()
        if var in plot_options and "xlim" in plot_options[var]:
            minv = plot_options[var]["xlim"][0]
            maxv = plot_options[var]["xlim"][1]
        _, bina = np.histogram(values, range=(minv, maxv), bins=nbinscan)
        widthbin = (maxv - minv) / float(nbinscan)
        width = np.diff(bina)
        center = (bina[:-1] + bina[1:]) / 2
        den = len(values)
        ratios = deque()
        if vardir not in cut_options:
            # Fixed argument order: the variable name fills the first "%s"
            # after the options list, its (invalid) setting fills the second
            get_logger().error("Please choose cut option from %s. " \
                    "Your current setting for variable %s is %s",
                    str(cut_options), var, vardir)
            continue
        if "lt" in vardir:
            # Lower cuts: fraction of candidates surviving values > cut
            for ibin in range(nbinscan):
                values = values[values > minv + widthbin * ibin]
                num = len(values)
                eff = float(num) / float(den)
                ratios.append(eff)
        else:
            # Upper cuts: fraction of candidates surviving values < cut,
            # filled right-to-left so the x-axis ordering matches the bins
            for ibin in range(nbinscan, 0, -1):
                values = values[values < minv + widthbin * ibin]
                num = len(values)
                eff = float(num) / float(den)
                ratios.appendleft(eff)
        lbl = f'prob > {threshold}'
        axes[i].bar(center, ratios, align='center', width=width, label=lbl)
        axes[i].legend(fontsize=30)

    plotname = join(output_, f"variables_effscan_prob{threshold}_{suffix_}.png")
    # Save once (was saved twice to the same path) and release the figure
    # to avoid accumulating open matplotlib figures across calls
    plt.savefig(plotname, bbox_inches='tight')
    plt.close(fig)
def write(self):
    """Write every histogram held in self.histograms to the current ROOT directory."""
    for histo in self.histograms:
        if self.verbose:
            get_logger().info("Writing histogram %s", histo.GetName())
        histo.Write()
def __init__(self, database: dict, ana_type: str, file_data_name: str,
             file_mc_name: str):
    """
    Initialize MLFitParsFactory
    Args:
        database: dictionary of the entire analysis database
        ana_type: specifying the analysis within the database to be done
        file_data_name: file path where to find data histograms to fit
        file_mc_name: file path where to find MC histograms to fit
    """
    self.logger = get_logger()

    # Analysis-specific sub-dictionary of the database
    ana_config = database["analysis"][ana_type]

    # Optimal ML probability cut(s) applied to the fitted candidates
    self.prob_cut_fin = database["mlapplication"]["probcutoptimal"]

    # File config
    self.file_data_name = file_data_name
    self.file_mc_name = file_mc_name

    # Binning: first (analysis) binning variable and its edges
    self.bin1_name = database["var_binning"]
    self.bins1_edges_low = ana_config["sel_an_binmin"]
    self.bins1_edges_up = ana_config["sel_an_binmax"]
    self.n_bins1 = len(self.bins1_edges_low)
    # Second binning variable (reco and generated-level names) and its edges
    self.bin2_name = ana_config["var_binning2"]
    self.bin2_gen_name = ana_config["var_binning2_gen"]
    self.bins2_edges_low = ana_config["sel_binmin2"]
    self.bins2_edges_up = ana_config["sel_binmax2"]
    self.n_bins2 = len(self.bins2_edges_low)
    self.bin_matching = ana_config["binning_matching"]
    # Single-bin efficiency: use the configured bin index, fall back to 0
    bineff = ana_config["usesinglebineff"]
    self.bins2_int_bin = bineff if bineff is not None else 0

    # Fit method flags
    self.init_fits_from = ana_config["init_fits_from"]
    self.sig_func_name = ana_config["sgnfunc"]
    self.bkg_func_name = ana_config["bkgfunc"]
    self.fit_range_low = ana_config["massmin"]
    self.fit_range_up = ana_config["massmax"]
    self.likelihood = ana_config["dolikelihood"]
    self.rebin = ana_config["rebin"]
    # If rebin is a flat list, replicate it once per second-binning bin
    try:
        iter(self.rebin[0])
    except TypeError:
        self.rebin = [self.rebin for _ in range(self.n_bins2)]

    # Initial fit parameters
    self.mean = ana_config["masspeak"]
    self.fix_mean = ana_config["FixedMean"]
    self.use_user_mean = ana_config["SetInitialGaussianMean"]
    self.sigma = ana_config["sigmaarray"]
    self.fix_sigma = ana_config["SetFixGaussianSigma"]
    self.use_user_sigma = ana_config["SetInitialGaussianSigma"]
    self.max_rel_sigma_diff = ana_config["MaxPercSigmaDeviation"]
    self.n_sigma_sideband = ana_config["exclude_nsigma_sideband"]
    self.n_sigma_signal = ana_config["nsigma_signal"]
    # NOTE(review): reads the same database field as max_rel_sigma_diff above —
    # possibly redundant; confirm which attribute downstream code uses
    self.rel_sigma_bound = ana_config["MaxPercSigmaDeviation"]

    # Second peak flags
    self.include_sec_peak = ana_config.get("includesecpeak", [False] * self.n_bins1)
    # If a flat per-bin1 list, replicate it once per second-binning bin
    try:
        iter(self.include_sec_peak[0])
    except TypeError:
        self.include_sec_peak = [self.include_sec_peak for _ in range(self.n_bins2)]
    # NOTE(review): a non-empty list is truthy even if all entries are False,
    # so "masssecpeak"/"widthsecpeak" are read whenever the list exists
    self.sec_mean = ana_config["masssecpeak"] if self.include_sec_peak else None
    self.fix_sec_mean = ana_config.get("fix_masssecpeak", [False] * self.n_bins1)
    try:
        iter(self.fix_sec_mean[0])
    except TypeError:
        self.fix_sec_mean = [self.fix_sec_mean for _ in range(self.n_bins2)]
    self.sec_sigma = ana_config["widthsecpeak"] if self.include_sec_peak else None
    self.fix_sec_sigma = ana_config["fix_widthsecpeak"] if self.include_sec_peak else None

    # Reflections flag
    self.include_reflections = ana_config.get("include_reflection", False)

    # Is this a trigger weighted histogram?
    self.apply_weights = ana_config["triggersel"]["weighttrig"]

    # Systematics: expand "init_sigma_from" into a per-(bin2, bin1) nested list
    self.syst_pars = ana_config.get("systematics", {})
    self.syst_init_sigma_from = None
    if self.syst_pars:
        self.syst_init_sigma_from = self.syst_pars.get("init_sigma_from", "central")
        if not isinstance(self.syst_init_sigma_from, list):
            self.syst_init_sigma_from = [self.syst_init_sigma_from] * self.n_bins1
        if not isinstance(self.syst_init_sigma_from[0], list):
            self.syst_init_sigma_from = [self.syst_init_sigma_from] * self.n_bins2
def preparesample(self):
    """
    Prepare the ML training and testing samples.

    Loads the data, reco-MC and gen-MC pickles, applies the generated-level
    pre-selection and the analysis binning, selects and labels signal and
    background candidates, then splits the merged sample into train/test sets
    and the per-class sub-frames used downstream.
    """
    logger = get_logger()
    print("prepare sample")
    self.df_data = pd.read_pickle(self.f_reco_data)
    self.df_mc = pd.read_pickle(self.f_reco_mc)
    self.df_mcgen = pd.read_pickle(self.f_gen_mc)
    self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)
    # Keep references to the still-unbinned frames: signal/background are
    # picked from these below and binned explicitly afterwards
    arraydf = [self.df_data, self.df_mc]
    self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin,
                                 self.p_binmax)
    self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin,
                                    self.p_binmax)
    self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin,
                                   self.p_binmax)
    # Choose the signal/background source sample via the configured tags
    # (index 0 = data, index 1 = MC, per arraydf above)
    self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[self.p_tagbkg]
    self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin,
                                  self.p_binmax)
    self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin,
                                  self.p_binmax)
    self.df_sig = self.df_sig.query(self.s_selsigml)
    self.df_bkg = self.df_bkg.query(self.s_selbkgml)
    # Background candidates carry no MC truth: zero the truth flags explicitly
    self.df_bkg["ismcsignal"] = 0
    self.df_bkg["ismcprompt"] = 0
    self.df_bkg["ismcfd"] = 0
    self.df_bkg["ismcbkg"] = 0
    # Cap the requested sample sizes to what is actually available
    if self.p_nsig > len(self.df_sig):
        logger.warning("There are not enough signal events")
    if self.p_nbkg > len(self.df_bkg):
        logger.warning("There are not enough background events")
    self.p_nsig = min(len(self.df_sig), self.p_nsig)
    self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)
    logger.info("Used number of signal events is %d", self.p_nsig)
    logger.info("Used number of background events is %d", self.p_nbkg)
    self.df_ml = pd.DataFrame()
    # Shuffle deterministically, then truncate to the requested sizes
    self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
    self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
    self.df_sig = self.df_sig[:self.p_nsig]
    self.df_bkg = self.df_bkg[:self.p_nbkg]
    # Target labels: 1 = signal, 0 = background
    self.df_sig[self.v_sig] = 1
    self.df_bkg[self.v_sig] = 0
    self.df_ml = pd.concat([self.df_sig, self.df_bkg])
    self.df_mltrain, self.df_mltest = train_test_split(self.df_ml, \
        test_size=self.test_frac, random_state=self.rnd_splt)
    self.df_mltrain = self.df_mltrain.reset_index(drop=True)
    self.df_mltest = self.df_mltest.reset_index(drop=True)
    self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain,
                                                         self.v_sig)
    self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest,
                                                       self.v_sig)
    logger.info("Nev ml train %d and test %d", len(self.df_mltrain),
                len(self.df_mltest))
    logger.info("Nev signal train %d and test %d", len(self.df_sigtrain),
                len(self.df_sigtest))
    logger.info("Nev bkg train %d and test %d", len(self.df_bkgtrain),
                len(self.df_bkgtest))
    # Feature matrices (training variables) and label vectors
    self.df_xtrain = self.df_mltrain[self.v_train]
    self.df_ytrain = self.df_mltrain[self.v_sig]
    self.df_xtest = self.df_mltest[self.v_train]
    self.df_ytest = self.df_mltest[self.v_sig]
class Optimiser:
    """
    Drives the ML optimisation for one pT (binning-variable) bin: sample
    preparation, model training/testing and significance optimisation.
    """
    # Class Attribute
    species = "optimiser"

    def __init__(self, data_param, case, model_config, grid_config, binmin,
                 binmax, raahp):
        """
        Args:
            data_param: analysis database entry for this particle case
            case: particle case name
            model_config: model configuration dictionary
            grid_config: grid-search configuration
            binmin: lower edge of the binning-variable bin
            binmax: upper edge of the binning-variable bin
            raahp: RAA hypothesis for this bin
        """
        self.logger = get_logger()
        # Input directories of the ML-merged samples
        dirmcml = data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
        dirdataml = data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
        dirdatatotsample = data_param["multi"]["data"]["pkl_evtcounter_all"]
        self.v_bin = data_param["var_binning"]
        # Output directories
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        # ML file names; reco/gen names get the bin range embedded
        self.n_reco = data_param["files_names"]["namefile_reco"]
        self.n_reco = self.n_reco.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_evt = data_param["files_names"]["namefile_evt"]
        self.n_gen = data_param["files_names"]["namefile_gen"]
        self.n_gen = self.n_gen.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_treetest = data_param["files_names"]["treeoutput"]
        self.n_reco_applieddata = data_param["files_names"]["namefile_reco_applieddata"]
        self.n_reco_appliedmc = data_param["files_names"]["namefile_reco_appliedmc"]
        # Full paths of the ML input/output files
        self.f_gen_mc = os.path.join(dirmcml, self.n_gen)
        self.f_reco_mc = os.path.join(dirmcml, self.n_reco)
        self.f_evt_mc = os.path.join(dirmcml, self.n_evt)
        self.f_reco_data = os.path.join(dirdataml, self.n_reco)
        self.f_evt_data = os.path.join(dirdataml, self.n_evt)
        self.f_evttotsample_data = os.path.join(dirdatatotsample, self.n_evt)
        self.f_reco_applieddata = os.path.join(self.dirmlout, self.n_reco_applieddata)
        self.f_reco_appliedmc = os.path.join(self.dirmlout, self.n_reco_appliedmc)
        # Variables
        self.v_all = data_param["variables"]["var_all"]
        self.v_train = data_param["variables"]["var_training"]
        self.v_bound = data_param["variables"]["var_boundaries"]
        self.v_sig = data_param["variables"]["var_signal"]
        self.v_invmass = data_param["variables"]["var_inv_mass"]
        self.v_cuts = data_param["variables"].get("var_cuts", [])
        self.v_corrx = data_param["variables"]["var_correlation"][0]
        self.v_corry = data_param["variables"]["var_correlation"][1]
        # Bitmap selection variables
        self.v_isstd = data_param["bitmap_sel"]["var_isstd"]
        self.v_ismcsignal = data_param["bitmap_sel"]["var_ismcsignal"]
        self.v_ismcprompt = data_param["bitmap_sel"]["var_ismcprompt"]
        self.v_ismcfd = data_param["bitmap_sel"]["var_ismcfd"]
        self.v_ismcbkg = data_param["bitmap_sel"]["var_ismcbkg"]
        # Parameters
        self.p_case = case
        self.p_nbkg = data_param["ml"]["nbkg"]
        self.p_nsig = data_param["ml"]["nsig"]
        self.p_tagsig = data_param["ml"]["sampletagforsignal"]
        self.p_tagbkg = data_param["ml"]["sampletagforbkg"]
        self.p_binmin = binmin
        self.p_binmax = binmax
        self.p_npca = None
        self.p_mltype = data_param["ml"]["mltype"]
        self.p_nkfolds = data_param["ml"]["nkfolds"]
        self.p_ncorescross = data_param["ml"]["ncorescrossval"]
        self.rnd_shuffle = data_param["ml"]["rnd_shuffle"]
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]
        self.p_plot_options = data_param["variables"].get("plot_options", {})
        self.p_dofullevtmerge = data_param["dofullevtmerge"]
        # Dataframes (filled later, e.g. by preparesample)
        self.df_mc = None
        self.df_mcgen = None
        self.df_data = None
        self.df_sig = None
        self.df_bkg = None
        self.df_ml = None
        self.df_mltest = None
        self.df_mltrain = None
        self.df_sigtrain = None
        self.df_sigtest = None
        self.df_bkgtrain = None
        # NOTE(review): likely a typo for df_bkgtest — preparesample assigns
        # self.df_bkgtest, so this attribute stays None; confirm before fixing
        self.df_bktest = None
        self.df_xtrain = None
        self.df_ytrain = None
        self.df_xtest = None
        self.df_ytest = None
        # Selections
        self.s_selbkgml = data_param["ml"]["sel_bkgml"]
        self.s_selsigml = data_param["ml"]["sel_sigml"]
        # Model param
        self.db_model = model_config
        self.p_class = None
        self.p_classname = None
        self.p_trainedmod = None
        self.s_suffix = None
        # Config files
        self.c_gridconfig = grid_config
        # Significance optimisation inputs (FONLL prediction, cross-section
        # normalisation factors, etc.)
        self.f_fonll = data_param["ml"]["opt"]["filename_fonll"]
        self.p_fonllband = data_param["ml"]["opt"]["fonll_pred"]
        self.p_fragf = data_param["ml"]["opt"]["FF"]
        self.p_sigmamb = data_param["ml"]["opt"]["sigma_MB"]
        self.p_taa = data_param["ml"]["opt"]["Taa"]
        self.p_br = data_param["ml"]["opt"]["BR"]
        self.p_fprompt = data_param["ml"]["opt"]["f_prompt"]
        self.p_bkgfracopt = data_param["ml"]["opt"]["bkg_data_fraction"]
        self.p_nstepsign = data_param["ml"]["opt"]["num_steps"]
        self.p_savefit = data_param["ml"]["opt"]["save_fit"]
        self.p_nevtml = None
        self.p_nevttot = None
        self.p_presel_gen_eff = data_param["analysis"]["presel_gen_eff"]
        # Invariant-mass histogram binning derived from fit limits and bin width
        self.p_mass_fit_lim = data_param["analysis"]['mass_fit_lim']
        self.p_bin_width = data_param["analysis"]['bin_width']
        self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \
            self.p_bin_width))
        self.p_mass = data_param["mass"]
        self.p_raahp = raahp
        # Build samples, load models and compute the file-name suffix
        # (methods defined elsewhere in this class)
        self.preparesample()
        self.loadmodels()
        self.create_suffix()
        self.df_evt_data = None
        self.df_evttotsample_data = None
        # Embed the suffix into the applied-model output file names
        self.f_reco_applieddata = \
            self.f_reco_applieddata.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_reco_appliedmc = \
            self.f_reco_appliedmc.replace(".pkl", "%s.pkl" % self.s_suffix)
def calc_bkg(df_bkg, name, num_steps, fit_region, bkg_func, bin_width,
             sig_region, save_fit, out_dir, pt_lims, multiclass_labels):
    """
    Estimate the number of background candidates under the signal peak. This is obtained
    from real data with a fit of the sidebands of the invariant mass distribution.

    Args:
        df_bkg: dataframe with background candidates ('inv_mass' plus
            'y_test_prob<name>[<label>]' probability columns)
        name: ML model name used to build the probability column name
        num_steps: number of threshold values to scan
        fit_region: (low, high) invariant-mass fit range
        bkg_func: ROOT function name used to fit the sidebands
        bin_width: requested invariant-mass bin width
        sig_region: (low, high) signal region to integrate the fit over
        save_fit: if True, write fitted histograms to a ROOT file
        out_dir: output directory for the ROOT file
        pt_lims: (pt_min, pt_max), used in the output file name
        multiclass_labels: two class labels for a 2D threshold scan, or None
            for a single-probability scan
    Returns:
        (bkg_array, bkg_err_array, x_axis): background yields, their errors
        and the scanned threshold values
    """
    logger = get_logger()
    if multiclass_labels is None:
        # Binary case: ~1/10 of the points below 0.5, denser scan above it
        ns_left = int(num_steps / 10) - 1
        ns_right = num_steps - ns_left
        x_axis_left = np.linspace(0., 0.49, ns_left)
        x_axis_right = np.linspace(0.5, 1.0, ns_right)
        x_axis = np.concatenate((x_axis_left, x_axis_right))
    else:
        # Multiclass case: uniform scan in [0, 0.4] per class threshold
        x_axis = np.linspace(0, 0.4, num_steps)
    bkg_array = []
    bkg_err_array = []
    # Round to an integer number of bins, then recompute the effective width
    num_bins = (fit_region[1] - fit_region[0]) / bin_width
    num_bins = int(round(num_bins))
    bin_width = (fit_region[1] - fit_region[0]) / num_bins
    if save_fit:
        logger.debug("Saving bkg fits to file")
        pt_min = pt_lims[0]
        pt_max = pt_lims[1]
        # Opened here and closed at the end; histograms are written into it
        out_file = TFile(
            f'{out_dir}/bkg_fits_{name}_pt{pt_min:.1f}_{pt_max:.1f}.root',
            'recreate')
        out_file.cd()
    logger.debug("To fit the bkg a %s function is used", bkg_func)
    if multiclass_labels is not None:
        # 2D scan over both class-probability thresholds
        for thr0 in x_axis:
            for thr1 in x_axis:
                bkg = 0.
                bkg_err = 0.
                hmass = TH1F(f'hmass_{thr0:.5f}_{thr1:.5f}', '', num_bins,
                             fit_region[0], fit_region[1])
                mlsel_multi0 = 'y_test_prob' + name + multiclass_labels[
                    0] + ' <= ' + str(thr0)
                mlsel_multi1 = 'y_test_prob' + name + multiclass_labels[
                    1] + ' >= ' + str(thr1)
                mlsel_multi = mlsel_multi0 + ' and ' + mlsel_multi1
                sel_mass_array = df_bkg.query(mlsel_multi)['inv_mass'].values
                # Fit only if enough candidates survive the selection
                if len(sel_mass_array) > 5:
                    for mass_value in np.nditer(sel_mass_array):
                        hmass.Fill(mass_value)
                    fit = hmass.Fit(bkg_func, 'Q', '', fit_region[0],
                                    fit_region[1])
                    if save_fit:
                        hmass.Write()
                    # Fit status 0 = success: integrate the fitted function
                    # over the signal region to estimate the background
                    if int(fit) == 0:
                        fit_func = hmass.GetFunction(bkg_func)
                        bkg = fit_func.Integral(sig_region[0],
                                                sig_region[1]) / bin_width
                        bkg_err = fit_func.IntegralError(
                            sig_region[0], sig_region[1]) / bin_width
                        del fit_func
                elif save_fit:
                    # Keep the (empty) histogram on file for bookkeeping
                    hmass.Write()
                bkg_array.append(bkg)
                bkg_err_array.append(bkg_err)
                del hmass
    else:
        # 1D scan over the single signal-probability threshold
        for thr in x_axis:
            bkg = 0.
            bkg_err = 0.
            hmass = TH1F(f'hmass_{thr:.5f}', '', num_bins, fit_region[0],
                         fit_region[1])
            bkg_sel_mask = df_bkg['y_test_prob' + name].values >= thr
            sel_mass_array = df_bkg[bkg_sel_mask]['inv_mass'].values
            # Fit only if enough candidates survive the selection
            if len(sel_mass_array) > 5:
                for mass_value in np.nditer(sel_mass_array):
                    hmass.Fill(mass_value)
                fit = hmass.Fit(bkg_func, 'Q', '', fit_region[0],
                                fit_region[1])
                if save_fit:
                    hmass.Write()
                # Fit status 0 = success: integrate over the signal region
                if int(fit) == 0:
                    fit_func = hmass.GetFunction(bkg_func)
                    bkg = fit_func.Integral(sig_region[0],
                                            sig_region[1]) / bin_width
                    bkg_err = fit_func.IntegralError(sig_region[0],
                                                     sig_region[1]) / bin_width
                    del fit_func
            elif save_fit:
                hmass.Write()
            bkg_array.append(bkg)
            bkg_err_array.append(bkg_err)
            del hmass
    if save_fit:
        out_file.Close()
    return bkg_array, bkg_err_array, x_axis
def filter_df_cand(dataframe, main_dict, sel_opt):
    '''Filter a dataframe by candidate type.

    Supports both the bitmap and the old selection method; exactly one of them
    must be enabled ('use: True') in 'database_ml_parameters.yml'.

    Implemented selection options:
        - 'mc_signal'         -> select MC signal
        - 'mc_signal_prompt'  -> select only prompt MC signal
        - 'mc_signal_FD'      -> select only feed-down MC signal
        - 'mc_bkg'            -> select MC background
        - 'presel_track_pid'  -> select candidates satisfing PID and track pre-selections
        - 'sel_std_analysis'  -> select candidates fulfilling the std analysis selections

    Args:
        dataframe: pandas dataframe to filter
        main_dict: dictionary of parameters loaded from 'database_ml_parameters.yml'
        sel_opt: selection option (string)
    Return:
        df_selected: filtered pandas dataframe
    '''
    logger = get_logger()
    bitmap_dict = main_dict['bitmap_sel']
    old_dict = main_dict['old_sel']
    use_bitmap = bitmap_dict['use']
    use_old = old_dict['use']
    if use_bitmap == use_old:
        logger.critical(
            "One and only one of the selection method have to be used, i.e. with "
            "'use' flag set to True")

    # Dispatch table: selection option -> bitmap-configuration key.
    # The old selection method uses the option name itself as its key.
    bit_keys = {
        'mc_signal': 'mcsignal_on_off',
        'mc_signal_prompt': 'mcsignal_prompt_on_off',
        'mc_signal_FD': 'mcsignal_feed_on_off',
        'mc_bkg': 'mcbkg_on_off',
        'presel_track_pid': 'preseltrack_pid_on_off',
        'sel_std_analysis': 'std_analysis_on_off',
    }

    if use_bitmap:
        logger.debug("Using bitmap selection")
        if sel_opt in bit_keys:
            sel_bits = bitmap_dict[bit_keys[sel_opt]]
        else:
            logger.critical("Wrong selection option!")
        logger.debug("Candidates before selection: %d", len(dataframe))
        df_selected = filter_bit_df(dataframe, bitmap_dict['var_sel'], sel_bits)
        logger.debug("Candidates after %s selection: %d", sel_opt,
                     len(df_selected))

    if use_old:
        logger.debug("Using old selection")
        if sel_opt in bit_keys:
            sel_string = old_dict[sel_opt]
        else:
            logger.critical("Wrong selection option!")
        logger.debug("Candidates before selection: %d", len(dataframe))
        df_selected = dataframe.query(sel_string)
        logger.debug("Candidates after %s selection: %d", sel_opt,
                     len(df_selected))

    return df_selected
# HF specific imports from machine_learning_hep.logger import get_logger # pylint: disable=import-error, no-name-in-module from ROOT import gStyle # pylint: disable=too-few-public-methods class WorkflowBase: """ Base class for all workflows related classes including systematics """ species = "workflow_base" def __init__(self, datap, case, typean, period=None): self.logger = get_logger() self.datap = datap self.case = case self.typean = typean self.period = period @staticmethod def loadstyle(): gStyle.SetOptStat(0) gStyle.SetOptStat(0000) gStyle.SetPalette(1) gStyle.SetNumberContours(100) gStyle.SetCanvasColor(0) gStyle.SetFrameFillColor(0) @staticmethod