Esempio n. 1
0
 def __write_successful_trials(self):
     """Dump the successful trials to "successful_trials.yaml".

     The file path is built from self.nominal_analyzer_merged.d_resultsallpdata
     and self.syst_out_dir. Nothing is written when self.successful_write is
     empty (falsy).
     """
     if self.successful_write:
         out_path = join(self.nominal_analyzer_merged.d_resultsallpdata,
                         self.syst_out_dir, "successful_trials.yaml")
         dump_yaml_from_dict({"successful_trials": self.successful_write}, out_path)
Esempio n. 2
0
    def save(self, out_dir, best_only=True):
        """Save parameters/results and the best model

        The results returned by self.make_results() are written twice, as
        "results.yaml" and as "results.pkl". Both writes are best-effort: a
        failure is reported on stdout and saving of the model(s) continues.

        Args:
            out_dir: directory the results and the model(s) are written to
            best_only: if True only self.best is saved, otherwise each model in
                       self.models is additionally saved under "model_<i>"
        """

        results = self.make_results()
        try:
            dump_yaml_from_dict(results, join(out_dir, "results.yaml"))
        except RepresenterError:
            print("Cannot save optimisation results as YAML")

        try:
            # Use a context manager so the file handle is flushed and closed
            # even if pickling fails half-way through
            with open(join(out_dir, "results.pkl"), "wb") as pkl_file:
                pickle.dump(results, pkl_file)
        except Exception: #pylint: disable=broad-except
            print("Cannot pickle optimisation results")

        # The custom save function is only used if the model was also
        # constructed by a custom function
        save_func = self.save_model_
        print(f"Save best model from Bayesian opt at {out_dir}")
        if self.yield_model_custom and self.save_model_custom:
            save_func = self.save_model_custom
        save_func(self.best, out_dir)

        if not best_only:
            # Save all models
            for i, model in enumerate(self.models):
                save_func(model, join(out_dir, f"model_{i}"))
    def save(self, out_dir):
        """Write the optimisation results and the best model to out_dir

        The results go to "results.yaml". The best model is stored with the
        custom save function if both self.yield_model_custom and
        self.save_model_custom are set, otherwise with self.save_model_.
        """
        results = self.make_results()
        dump_yaml_from_dict(results, join(out_dir, "results.yaml"))

        print(f"Save best model from Bayesian opt at {out_dir}")
        use_custom = self.yield_model_custom and self.save_model_custom
        saver = self.save_model_custom if use_custom else self.save_model_
        saver(self.best, out_dir)
Esempio n. 4
0
 def __write_working_points(self):
     """Dump the central, lower and upper cuts and the working points.

     The values are written to "working_points.yaml"; the file path is built
     from self.nominal_analyzer_merged.d_resultsallpdata and self.syst_out_dir.
     """
     out_path = join(self.nominal_analyzer_merged.d_resultsallpdata,
                     self.syst_out_dir, "working_points.yaml")
     summary = dict(central=self.cent_cv_cut,
                    lower_limits=self.min_cv_cut,
                    upper_limits=self.max_cv_cut,
                    working_points=self.ml_wps)
     dump_yaml_from_dict(summary, out_path)
Esempio n. 5
0
 def dump_default_config(which_config, path):
     """
     Write default configuration
     Args:
         which_config: either "run" or "models"
         path: full or relative path to where the config should be dumped
               (a leading "~" is expanded)
     Raises:
         KeyError: if there are no defaults for which_config
     """
     construction_functions = {
         "run": Configuration.construct_default_run_config,
         "models": Configuration.construct_default_model_config
     }
     if which_config not in construction_functions:
         get_logger().critical("No defaults for %s.", which_config)
         # Whether the critical log aborts depends on the logger set-up, which
         # is not visible here. Stop explicitly instead of running into an
         # opaque failure of the dictionary lookup below.
         raise KeyError(f"No defaults for {which_config!r}, choose one of "
                        f"{sorted(construction_functions)}")
     path = os.path.expanduser(path)
     dump_yaml_from_dict(construction_functions[which_config](), path)
     get_logger().info("Dumped default %s config to %s", which_config, path)
Esempio n. 6
0
    def step_done(self, step):
        """Check whether an ML step was done already and record it otherwise

        The step is identified by its name combined with self.p_binmin and
        self.p_binmax. A step not seen before is appended to self.steps_done and
        the updated list is dumped to self.file_steps_done.

        Args:
            step: name of the step
        Returns:
            True if the step was recorded before (a warning is issued),
            False if it has just been recorded
        """
        if self.steps_done is None:
            self.steps_done = []

        tag = f"{step}_{self.p_binmin}_{self.p_binmax}"
        if tag not in self.steps_done:
            # First time this step is seen: record it and update the corresponding file
            self.steps_done.append(tag)
            dump_yaml_from_dict({"done": self.steps_done}, self.file_steps_done)
            return False

        print("\n\n")
        self.logger.warning("Done ML step %s already. It's skipped now. Remove the step " \
                "from the list in the following file", tag)
        print(self.file_steps_done)
        print("\n\n")
        return True
    def multi_mergeml_allinone(self):
        """Merge the per-pT-bin ML files and sum up the event counts

        For each pT bin the reconstructed files (and the generated ones if
        self.mcordata is "mc") are passed to merge_method. Afterwards the "evt"
        and "evtorig" entries of all files in self.lper_evt_count_ml are summed
        and the totals are dumped to self.f_evtml_count.
        """
        for ipt in range(self.p_nptbins):
            merge_method(self.lptper_recoml[ipt], self.lpt_recoml_mergedallp[ipt])
            if self.mcordata != "mc":
                continue
            merge_method(self.lptper_genml[ipt], self.lpt_genml_mergedallp[ipt])

        totals = {"evt": 0, "evtorig": 0}
        for count_file in self.lper_evt_count_ml:
            counts = parse_yaml(count_file)
            for key in totals:
                totals[key] += counts[key]

        dump_yaml_from_dict(totals, self.f_evtml_count)
Esempio n. 8
0
def save_fit(fit, save_dir, annotations=None):
    """Save a fit to a directory

    Writes the ROOT objects and the kernel of the fit to "root_objects.root" and
    the initialisation parameters, the fit parameters and some meta information
    (fit class name, success flag, optional annotations) to "init_pars.yaml",
    "fit_pars.yaml" and "meta.yaml", respectively.

    Args:
        fit: fit object to be saved; nothing is saved if fit.has_attempt is false
        save_dir: directory the files are written to
        annotations: optional additional information stored in "meta.yaml"
    """

    if not fit.has_attempt:
        get_logger().warning(
            "Fit has not been done and will hence not be saved")
        return

    checkdir(save_dir)

    root_file_name = join(save_dir, "root_objects.root")
    root_file = TFile.Open(root_file_name, "RECREATE")
    # Make the file the current directory so the Write calls below end up in it
    root_file.cd()
    try:
        for name, root_object in fit.root_objects.items():
            if root_object:
                root_object.Write(name)
        fit.kernel.Write("kernel")
    finally:
        # Close explicitly so the content is flushed to disk and the file is
        # released, also if one of the writes fails
        root_file.Close()

    yaml_path = join(save_dir, "init_pars.yaml")
    dump_yaml_from_dict(fit.init_pars, yaml_path)

    yaml_path = join(save_dir, "fit_pars.yaml")
    dump_yaml_from_dict(fit.fit_pars, yaml_path)

    class_name = fit.__class__.__name__
    meta_info = {"fit_class": class_name, "success": fit.success}
    if annotations:
        meta_info["annotations"] = annotations

    yaml_path = join(save_dir, "meta.yaml")
    dump_yaml_from_dict(meta_info, yaml_path)
Esempio n. 9
0
    def process_mergeforml(self):
        """Merge a random subset of the skimmed files per pT bin for ML

        In each pT bin a fraction self.p_frac_merge[ipt] of the reconstructed
        files is sampled (the random generator is re-seeded with self.p_rd_merge
        for every bin) and merged; for "mc" the generated files with the same
        indices are merged as well. The process exits with code 1 if a bin has
        no files. Finally, the events of all files selected in any bin are
        counted and the counts are dumped to self.f_evt_count_ml.
        """
        indices_for_evt = []
        for ipt in range(self.p_nptbins):
            n_available = len(self.mptfiles_recosk[ipt])
            if n_available == 0:
                print("There are no files to be merged")
                sys.exit(1)

            merge_fraction = self.p_frac_merge[ipt]
            print(f"Use merge fraction {merge_fraction} for pT bin {ipt}")
            n_selected = int(n_available * merge_fraction)

            rd.seed(self.p_rd_merge)
            picked = rd.sample(range(n_available), n_selected)
            # Keep track of every file index used in at least one pT bin
            indices_for_evt = list(set(indices_for_evt) | set(picked))

            merge_method([self.mptfiles_recosk[ipt][idx] for idx in picked],
                         self.lpt_reco_ml[ipt])
            if self.mcordata == "mc":
                merge_method([self.mptfiles_gensk[ipt][idx] for idx in picked],
                             self.lpt_gen_ml[ipt])

        print("Count events...")
        evt_files = [self.l_evt[idx] for idx in indices_for_evt]
        evtorig_files = [self.l_evtorig[idx] for idx in indices_for_evt]
        n_evt = count_df_length_pkl(*evt_files)
        n_evtorig = count_df_length_pkl(*evtorig_files)
        dump_yaml_from_dict({"evt": n_evt, "evtorig": n_evtorig}, self.f_evt_count_ml)
Esempio n. 10
0
 def write(self, yaml_path):
     """
     Dump self.errors to the YAML file at yaml_path
     """
     errors = self.errors
     dump_yaml_from_dict(errors, yaml_path)
Esempio n. 11
0
        # NOTE(review): this loop is nested in outer loops not visible here which
        # presumably define dm, period, files_all and children - confirm.
        for child in children:
            # Files belonging to this child are identified by "/<child>/" in their path
            files_child = [f for f in files_all if f"/{child}/" in f]
            # One argument tuple (file, columns) per call of check_duplicates
            args = []
            for f in files_child:
                args.append((f, UNIQUE_COLS))

            # NOTE(review): each result d is presumably
            # (number of rows, number of duplicated rows, dataframe of duplicates) - confirm
            # against check_duplicates.
            duplicates = multi_proc(check_duplicates, args, None, 500, 40)
            # Ratio d[1] / d[0] in percent per file, 0 if d[0] is not positive
            duplicates_ratio = [
                d[1] / d[0] * 100 if d[0] > 0 else 0. for d in duplicates
            ]

            if EXTRACT_DUPL_INFO:
                # Per file, collect the UNIQUE_COLS values (cast to float) of each row of d[2]
                duplicates_cols = []
                for d in duplicates:
                    duplicates_cols_this_df = []
                    for _, row in d[2].iterrows():
                        duplicates_cols_this_df.append([float(row[col_name]) \
                                for col_name in UNIQUE_COLS])
                    duplicates_cols.append(duplicates_cols_this_df)
            else:
                # Placeholder so there is still one entry per file in the zip below
                duplicates_cols = [None] * len(duplicates)

            has_duplicates = [dr > 0. for dr in duplicates_ratio]
            # One summary dictionary per file of this child
            DUPLICATES_SUMMARY[dm][period][child] = \
                    [{"file": df, "dupl_ratio": dr, "has_duplicates": hd, "duplicates": dc} \
                    for df, dr, hd, dc \
                    in zip(files_child, duplicates_ratio, has_duplicates, duplicates_cols)]

# Write the complete summary once all loops are done
dump_yaml_from_dict(DUPLICATES_SUMMARY, SUMMARY_FILE)
Esempio n. 12
0
def do_gridsearch(names,
                  classifiers,
                  grid_params,
                  x_train,
                  y_train,
                  nkfolds,
                  out_dirs,
                  ncores=-1):
    """Hyperparameter grid search for a list of classifiers

    Given a list of classifiers, do a hyperparameter grid search based on a corresponding
    set of parameters. For each classifier the cross-validation results are pickled to
    "results.pkl", the grid search parameters are dumped to "parameters.yaml" and the
    best estimator is passed to savemodels, all using the corresponding output directory.

    Args:
        names: iterable of classifier names
        classifiers: iterable of classifiers
        grid_params: iterable of dictionaries steering the grid search, one per
                     classifier, with keys "scoring", "params" and "refit"; a
                     classifier with an empty entry is skipped
        x_train: feature dataframe
        y_train: targets dataframe
        nkfolds: int, cross-validation generator or an iterable
        out_dirs: iterable of output directories, one per classifier
        ncores: number of cores to distribute jobs to
    Returns:
        None, the results are only written to disk
    """

    logger = get_logger()

    for clf_name, clf, gps, out_dir in zip(names, classifiers, grid_params,
                                           out_dirs):
        if not gps:
            logger.info("Nothing to be done for grid search of model %s",
                        clf_name)
            continue
        logger.info("Grid search for model %s with following parameters:",
                    clf_name)
        print_dict(gps)

        # To work for probabilities. This will call model.decision_function or
        # model.predict_proba as it is done for the nominal ROC curves as well to decide on the
        # performance
        scoring = get_scorers(gps["scoring"])

        grid_search = GridSearchCV(clf,
                                   gps["params"],
                                   cv=nkfolds,
                                   refit=gps["refit"],
                                   scoring=scoring,
                                   n_jobs=ncores,
                                   verbose=2,
                                   return_train_score=True)
        grid_search.fit(x_train, y_train)
        cv_results = pd.DataFrame(grid_search.cv_results_)

        # Save the results as soon as we have them in case something goes wrong later
        # (would be quite unfortunate to lose grid search results...)
        out_file = osjoin(out_dir, "results.pkl")
        results_file = openfile(out_file, "wb")
        try:
            pickle.dump(cv_results, results_file, protocol=4)
        finally:
            # Make sure the handle is flushed and released, also if pickling fails
            results_file.close()
        # Parameters
        dump_yaml_from_dict(gps, osjoin(out_dir, "parameters.yaml"))
        savemodels((clf_name, ), (grid_search.best_estimator_, ), out_dir, "")