def __write_successful_trials(self):
    """Dump the successful trials to ``successful_trials.yaml``.

    Nothing is written when ``self.successful_write`` is falsy. Otherwise its
    value is stored under the key ``successful_trials`` in a YAML file whose
    path is built from ``self.nominal_analyzer_merged.d_resultsallpdata`` and
    ``self.syst_out_dir``.
    """
    if self.successful_write:
        payload = {"successful_trials": self.successful_write}
        target = join(self.nominal_analyzer_merged.d_resultsallpdata,
                      self.syst_out_dir,
                      "successful_trials.yaml")
        dump_yaml_from_dict(payload, target)
def save(self, out_dir, best_only=True):
    """Save parameters/results and the best model.

    The dict returned by ``self.make_results()`` is written to
    ``results.yaml`` and ``results.pkl`` inside ``out_dir``. Both dumps are
    best-effort: a ``RepresenterError`` from the YAML dump, or any exception
    while pickling, is reported on stdout and saving continues.

    Args:
        out_dir: directory the results and the model(s) are written to
        best_only: if False, every model in ``self.models`` is additionally
            saved to a sub-directory ``model_{i}`` of ``out_dir``
    """
    results = self.make_results()

    try:
        dump_yaml_from_dict(results, join(out_dir, "results.yaml"))
    except RepresenterError:
        print("Cannot save optimisation results as YAML")

    try:
        # Use a context manager so the file handle is always closed (and
        # flushed), also when pickling fails half-way.
        with open(join(out_dir, "results.pkl"), "wb") as pkl_file:
            pickle.dump(results, pkl_file)
    except Exception: #pylint: disable=broad-except
        print("Cannot pickle optimisation results")

    save_func = self.save_model_
    print(f"Save best model from Bayesian opt at {out_dir}")
    # A custom saver is only used if a custom model constructor is set as well
    if self.yield_model_custom and self.save_model_custom:
        save_func = self.save_model_custom
    save_func(self.best, out_dir)

    if not best_only:
        # Save all models
        for i, model in enumerate(self.models):
            out_dir_model = join(out_dir, f"model_{i}")
            save_func(model, out_dir_model)
def save(self, out_dir):
    """Save parameters/results and the best model.

    Writes ``self.make_results()`` to ``results.yaml`` inside ``out_dir`` and
    then stores ``self.best`` there, using ``self.save_model_custom`` when both
    ``self.yield_model_custom`` and ``self.save_model_custom`` are set and
    ``self.save_model_`` otherwise.

    Args:
        out_dir: directory the results and the best model are written to
    """
    results = self.make_results()
    dump_yaml_from_dict(results, join(out_dir, "results.yaml"))
    print(f"Save best model from Bayesian opt at {out_dir}")

    use_custom = self.yield_model_custom and self.save_model_custom
    save_func = self.save_model_custom if use_custom else self.save_model_
    save_func(self.best, out_dir)
def __write_working_points(self):
    """Dump the cut values and working points to ``working_points.yaml``.

    The file is written to the path built from
    ``self.nominal_analyzer_merged.d_resultsallpdata`` and
    ``self.syst_out_dir`` and holds the keys ``central``, ``lower_limits``,
    ``upper_limits`` and ``working_points``.
    """
    summary = dict(central=self.cent_cv_cut,
                   lower_limits=self.min_cv_cut,
                   upper_limits=self.max_cv_cut,
                   working_points=self.ml_wps)
    target = join(self.nominal_analyzer_merged.d_resultsallpdata,
                  self.syst_out_dir,
                  "working_points.yaml")
    dump_yaml_from_dict(summary, target)
def dump_default_config(which_config, path):
    """Write a default configuration to a YAML file.

    Args:
        which_config: either "run" or "models"
        path: full or relative path to where the config should be dumped;
            a leading ``~`` is expanded
    """
    builders = {
        "run": Configuration.construct_default_run_config,
        "models": Configuration.construct_default_model_config,
    }

    if which_config not in builders:
        # NOTE(review): unless critical() aborts the program, the lookup
        # further down raises KeyError for an unknown config - confirm.
        get_logger().critical("No defaults for %s.", which_config)

    out_path = os.path.expanduser(path)
    default_config = builders[which_config]()
    dump_yaml_from_dict(default_config, out_path)
    get_logger().info("Dumped default %s config to %s", which_config, out_path)
def step_done(self, step):
    """Tell whether an ML step was already done and register it if not.

    The step is identified by ``step`` combined with ``self.p_binmin`` and
    ``self.p_binmax``. A step seen for the first time is appended to
    ``self.steps_done`` and the list is dumped to ``self.file_steps_done``.

    Args:
        step: name of the step

    Returns:
        True if the step is already listed in ``self.steps_done``, False if it
        has just been added
    """
    if self.steps_done is None:
        self.steps_done = []

    tag = f"{step}_{self.p_binmin}_{self.p_binmax}"

    if tag not in self.steps_done:
        # First occurrence: add this step and update the corresponding file
        self.steps_done.append(tag)
        dump_yaml_from_dict({"done": self.steps_done}, self.file_steps_done)
        return False

    # Already done: tell the user which file has to be edited to redo it
    print("\n\n")
    self.logger.warning("Done ML step %s already. It's skipped now. Remove the step "
                        "from the list in the following file", tag)
    print(self.file_steps_done)
    print("\n\n")
    return True
def multi_mergeml_allinone(self):
    """Merge the per-period ML inputs of every pT bin and sum the event counts.

    For each of the ``self.p_nptbins`` bins ``merge_method`` is called on the
    reco entries and, if ``self.mcordata`` is "mc", on the gen entries as
    well. Afterwards the ``evt`` and ``evtorig`` values of all YAML files in
    ``self.lper_evt_count_ml`` are added up and dumped to
    ``self.f_evtml_count``.
    """
    for ibin in range(self.p_nptbins):
        merge_method(self.lptper_recoml[ibin], self.lpt_recoml_mergedallp[ibin])
        if self.mcordata == "mc":
            merge_method(self.lptper_genml[ibin], self.lpt_genml_mergedallp[ibin])

    # Accumulate both counters in one pass over the per-period count files
    totals = {"evt": 0, "evtorig": 0}
    for count_file in self.lper_evt_count_ml:
        counts = parse_yaml(count_file)
        for key in totals:
            totals[key] += counts[key]

    dump_yaml_from_dict(totals, self.f_evtml_count)
def save_fit(fit, save_dir, annotations=None):
    """Save a fit to a directory.

    Writes ``root_objects.root`` (all truthy entries of ``fit.root_objects``
    plus ``fit.kernel`` under the name "kernel"), ``init_pars.yaml``,
    ``fit_pars.yaml`` and ``meta.yaml`` (fit class name, success flag and the
    optional annotations) into ``save_dir``. Nothing is saved, and a warning
    is logged, if ``fit.has_attempt`` is falsy.

    Args:
        fit: fit object to be saved
        save_dir: directory to write to, passed to ``checkdir`` first
        annotations: optional extra information stored under the key
            "annotations" in ``meta.yaml``
    """
    if not fit.has_attempt:
        get_logger().warning(
            "Fit has not been done and will hence not be saved")
        return

    checkdir(save_dir)

    root_file_name = join(save_dir, "root_objects.root")
    root_file = TFile.Open(root_file_name, "RECREATE")
    try:
        root_file.cd()
        for name, root_object in fit.root_objects.items():
            # Skip entries which are not set
            if root_object:
                root_object.Write(name)
        fit.kernel.Write("kernel")
    finally:
        # Close explicitly instead of relying on garbage collection so the
        # file is finalised on disk before the function returns.
        root_file.Close()

    yaml_path = join(save_dir, "init_pars.yaml")
    dump_yaml_from_dict(fit.init_pars, yaml_path)

    yaml_path = join(save_dir, "fit_pars.yaml")
    dump_yaml_from_dict(fit.fit_pars, yaml_path)

    # The class name is stored alongside the parameters in the meta file
    class_name = fit.__class__.__name__
    meta_info = {"fit_class": class_name, "success": fit.success}
    if annotations:
        meta_info["annotations"] = annotations

    yaml_path = join(save_dir, "meta.yaml")
    dump_yaml_from_dict(meta_info, yaml_path)
def process_mergeforml(self):
    """Merge a random subset of the skimmed files per pT bin and count events.

    For each of the ``self.p_nptbins`` bins a fraction
    ``self.p_frac_merge[ipt]`` of the reco files is sampled and merged with
    ``merge_method``; the gen files with the same indices are merged as well
    if ``self.mcordata`` is "mc". ``rd`` is re-seeded with ``self.p_rd_merge``
    before every sampling. The union of all sampled indices selects the
    entries of ``self.l_evt`` and ``self.l_evtorig`` which are counted with
    ``count_df_length_pkl``; both counts are dumped to ``self.f_evt_count_ml``.

    The process exits with status 1 if a bin has no files.
    """
    evt_indices = []

    for ipt in range(self.p_nptbins):
        n_available = len(self.mptfiles_recosk[ipt])
        if n_available == 0:
            print("There are no files to be merged")
            sys.exit(1)

        print(f"Use merge fraction {self.p_frac_merge[ipt]} for pT bin {ipt}")
        n_selected = int(n_available * self.p_frac_merge[ipt])
        rd.seed(self.p_rd_merge)
        chosen = rd.sample(range(n_available), n_selected)
        # Collect every file index used in any bin, each only once
        evt_indices = list(set(evt_indices) | set(chosen))

        reco_selected = [self.mptfiles_recosk[ipt][idx] for idx in chosen]
        merge_method(reco_selected, self.lpt_reco_ml[ipt])
        if self.mcordata == "mc":
            gen_selected = [self.mptfiles_gensk[ipt][idx] for idx in chosen]
            merge_method(gen_selected, self.lpt_gen_ml[ipt])

    print("Count events...")
    evt_files = [self.l_evt[idx] for idx in evt_indices]
    evtorig_files = [self.l_evtorig[idx] for idx in evt_indices]
    n_evt = count_df_length_pkl(*evt_files)
    n_evtorig = count_df_length_pkl(*evtorig_files)
    dump_yaml_from_dict({"evt": n_evt, "evtorig": n_evtorig}, self.f_evt_count_ml)
def write(self, yaml_path):
    """
    Write the stored errors to YAML

    Args:
        yaml_path: path of the YAML file to be written
    """
    errors = self.errors
    dump_yaml_from_dict(errors, yaml_path)
# Check the files of every child for duplicated rows and collect a summary.
# NOTE(review): this is a fragment of a larger script; children, files_all, dm
# and period are defined outside of what is visible here.
for child in children:
    # Select the files of this child via the "/child/" component of their path
    files_child = [f for f in files_all if f"/{child}/" in f]
    # One argument tuple (file, columns) per call of check_duplicates
    args = []
    for f in files_child:
        args.append((f, UNIQUE_COLS))
    # NOTE(review): meaning of the trailing arguments (None, 500, 40) is not
    # visible here - see multi_proc
    duplicates = multi_proc(check_duplicates, args, None, 500, 40)
    # Ratio d[1] / d[0] in percent; 0 if d[0] is not positive
    duplicates_ratio = [
        d[1] / d[0] * 100 if d[0] > 0 else 0. for d in duplicates
    ]
    if EXTRACT_DUPL_INFO:
        # For each file, store the UNIQUE_COLS values (as floats) of every row
        # of d[2] - presumably the dataframe of duplicated rows, TODO confirm
        duplicates_cols = []
        for d in duplicates:
            duplicates_cols_this_df = []
            for _, row in d[2].iterrows():
                duplicates_cols_this_df.append([float(row[col_name]) \
                        for col_name in UNIQUE_COLS])
            duplicates_cols.append(duplicates_cols_this_df)
    else:
        # Placeholder so that the zip below has one entry per file
        duplicates_cols = [None] * len(duplicates)
    has_duplicates = [dr > 0. for dr in duplicates_ratio]
    # One summary dict per file of this child
    DUPLICATES_SUMMARY[dm][period][child] = \
            [{"file": df, "dupl_ratio": dr, "has_duplicates": hd, "duplicates": dc} \
             for df, dr, hd, dc \
             in zip(files_child, duplicates_ratio, has_duplicates, duplicates_cols)]
# NOTE(review): the nesting level of this call cannot be recovered from the
# collapsed source; it is placed after the loop here - confirm against the file.
dump_yaml_from_dict(DUPLICATES_SUMMARY, SUMMARY_FILE)
def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, out_dirs, ncores=-1):
    """Hyperparameter grid search for a list of classifiers

    Given a list of classifiers, do a hyperparameter grid search based on a
    corresponding set of parameters. For every classifier the cross-validation
    results (``results.pkl``), the grid search parameters
    (``parameters.yaml``) and the best estimator are written to the
    corresponding output directory. Nothing is returned.

    Args:
        names: iterable of classifier names
        classifiers: iterable of classifiers
        grid_params: iterable of parameters used to perform the grid search;
            each entry is read via the keys "scoring", "params" and "refit",
            a falsy entry skips the corresponding classifier
        x_train: feature dataframe
        y_train: targets dataframe
        nkfolds: int, cross-validation generator or an iterable
        out_dirs: iterable of directories to write parameters, pickle of
            summary dataframe and best model to
        ncores: number of cores to distribute jobs to
    """
    logger = get_logger()

    for clf_name, clf, gps, out_dir in zip(names, classifiers, grid_params, out_dirs):
        if not gps:
            logger.info("Nothing to be done for grid search of model %s", clf_name)
            continue
        logger.info("Grid search for model %s with following parameters:", clf_name)
        print_dict(gps)

        # To work for probabilities. This will call model.decision_function or
        # model.predict_proba as it is done for the nominal ROC curves as well to decide on the
        # performance
        scoring = get_scorers(gps["scoring"])

        grid_search = GridSearchCV(clf, gps["params"], cv=nkfolds, refit=gps["refit"],
                                   scoring=scoring, n_jobs=ncores, verbose=2,
                                   return_train_score=True)
        grid_search.fit(x_train, y_train)
        cvres = grid_search.cv_results_

        # Save the results as soon as we have them in case something goes wrong later
        # (would be quite unfortunate to lose grid search results...)
        out_file = osjoin(out_dir, "results.pkl")
        summary = pd.DataFrame(cvres)
        results_file = openfile(out_file, "wb")
        try:
            pickle.dump(summary, results_file, protocol=4)
        finally:
            # Make sure the handle is closed and everything is flushed to disk
            results_file.close()

        # Parameters
        dump_yaml_from_dict(gps, osjoin(out_dir, "parameters.yaml"))
        savemodels((clf_name, ), (grid_search.best_estimator_, ), out_dir, "")