Beispiel #1
0
def plotter(client, parameters, hist_df=None, timer=None):
    """Render plots for the configured variables, loading histograms first if needed.

    Parameters
    ----------
    client : distributed client handed through to ``parallelize``.
    parameters : dict with at least ``years``, ``regions``, ``channels``,
        ``hist_vars``, ``plot_vars`` and ``datasets`` keys.
    hist_df : optional pre-loaded histogram DataFrame; when ``None`` the
        histograms are loaded in parallel via ``load_histogram``.
    timer : unused here; accepted for call-site compatibility.

    Returns
    -------
    The per-plot yields produced by the parallel ``plot`` calls.
    """
    if hist_df is None:
        load_args = {
            "year": parameters["years"],
            "var_name": parameters["hist_vars"],
            "dataset": parameters["datasets"],
        }
        loaded = parallelize(load_histogram, load_args, client, parameters)
        hist_df = pd.concat(loaded).reset_index(drop=True)

    # Only plot variables that are both present in the data and requested.
    requested = parameters["plot_vars"]
    plot_args = {
        "year": parameters["years"],
        "region": parameters["regions"],
        "channel": parameters["channels"],
        "var_name": [v for v in hist_df.var_name.unique() if v in requested],
        "df": [hist_df],
    }

    # seq=True: plotting is executed sequentially by parallelize.
    return parallelize(plot, plot_args, client, parameters, seq=True)
Beispiel #2
0
def to_histograms(client, parameters, df):
    """Build histograms for every (year, variable, dataset) combination in *df*.

    Existing histograms for the same argument set are deleted first
    (sequentially), then ``make_histograms`` is fanned out in parallel and
    the resulting rows are concatenated into a single DataFrame.

    ``df`` may be a pandas DataFrame (processed whole) or a dask DataFrame
    (processed per partition as ``(index, partition)`` pairs).
    """
    argset = {
        "year": df.year.unique(),
        "var_name": parameters["hist_vars"],
        "dataset": df.dataset.unique(),
    }
    if isinstance(df, pd.DataFrame):
        argset["df"] = [df]
    elif isinstance(df, dd.DataFrame):
        argset["df"] = [
            (ipart, df.partitions[ipart]) for ipart in range(df.npartitions)
        ]

    # Clear stale outputs before producing new ones (seq=True: run serially).
    parallelize(delete_existing_hists, argset, client, parameters, seq=True)
    rows = parallelize(make_histograms, argset, client, parameters)
    return pd.concat(rows).reset_index(drop=True)
Beispiel #3
0
def to_templates(client, parameters, hist_df=None):
    """Produce templates from histograms, loading the histograms if not given.

    Mirrors ``plotter``: when *hist_df* is ``None`` the histograms are first
    loaded in parallel, then ``make_templates`` is fanned out over every
    (year, region, channel, variable) combination.

    Returns the yields collected from the parallel ``make_templates`` calls.
    """
    if hist_df is None:
        load_args = {
            "year": parameters["years"],
            "var_name": parameters["hist_vars"],
            "dataset": parameters["datasets"],
        }
        loaded = parallelize(load_histogram, load_args, client, parameters)
        hist_df = pd.concat(loaded).reset_index(drop=True)

    # Restrict to variables that exist in the data and were requested.
    requested = parameters["plot_vars"]
    argset = {
        "year": parameters["years"],
        "region": parameters["regions"],
        "channel": parameters["channels"],
        "var_name": [v for v in hist_df.var_name.unique() if v in requested],
        "hist_df": [hist_df],
    }
    return parallelize(make_templates, argset, client, parameters)
Beispiel #4
0
 def run_evaluation(self, client=None):
     """Evaluate every trained model on every fold and write scores into ``self.df``.

     With a *client*, evaluation is fanned out via ``parallelize``; otherwise
     each (model, fold) pair is evaluated sequentially in-process. Each
     result row is expected to carry ``step``, ``model_name`` and
     ``prediction`` keys; predictions are written into the column
     ``"<model_name>_score"`` for the rows selected by the fold's
     ``eval_filter``.
     """
     if not self.models:
         return
     if client:
         rets = parallelize(
             self.evaluate_model,
             {
                 "model_name": self.models.keys(),
                 "fold_filters": self.fold_filters_list,
             },
             client,
         )
     else:
         rets = [
             self.evaluate_model({"fold_filters": ff, "model_name": name})
             for name in self.models.keys()
             for ff in self.fold_filters_list
         ]
     for ret in rets:
         eval_filter = self.fold_filters_list[ret["step"]]["eval_filter"]
         score_col = f"{ret['model_name']}_score"
         self.df.loc[eval_filter, score_col] = ret["prediction"]
Beispiel #5
0
    def run_training(self, client=None):
        """Train every model on every fold and record the saved artifact paths.

        With a *client*, training is fanned out via ``parallelize``;
        otherwise each (model, fold) pair is trained sequentially
        in-process. Each result row is expected to carry ``model_name``,
        ``step``, ``model_save_path`` and ``scalers_save_path`` keys, which
        are stored in ``self.trained_models`` / ``self.scalers``.
        """
        if not self.models:
            return
        if client:
            rets = parallelize(
                self.train_model,
                {
                    "model_name": self.models.keys(),
                    "fold_filters": self.fold_filters_list,
                },
                client,
            )
        else:
            rets = [
                self.train_model({"fold_filters": ff, "model_name": name})
                for name in self.models.keys()
                for ff in self.fold_filters_list
            ]
        for ret in rets:
            name, step = ret["model_name"], ret["step"]
            self.trained_models[name][step] = ret["model_save_path"]
            self.scalers[name][step] = ret["scalers_save_path"]
Beispiel #6
0
def run_fits(client, parameters, df):
    """Run signal/background fits in parallel and pick the best function per group.

    Splits the datasets in *df* into signals (listed under
    ``parameters["signals"]``) and backgrounds (everything else), builds one
    fit setup per signal plus a single blinded background setup, fans out
    ``fitter`` over (setup, channel, category), then selects the fit function
    with the lowest chi2 for each (label, channel, category) group. The full
    and best chi2 tables are pickled to ``all_chi2.pkl`` / ``best_chi2.pkl``.

    NOTE: ``parameters.pop("signals", ...)`` intentionally(?) mutates the
    caller's dict — preserved for compatibility with existing call sites.

    Returns the raw list of results from the parallel ``fitter`` calls.
    """
    signal_ds = parameters.pop("signals", [])
    all_datasets = df.dataset.unique()
    signals = [ds for ds in all_datasets if ds in signal_ds]
    backgrounds = [ds for ds in all_datasets if ds not in signal_ds]

    fit_setups = []
    if len(backgrounds) > 0:
        fit_setups.append(
            {
                "label": "background",
                "mode": "bkg",
                "df": df[df.dataset.isin(backgrounds)],
                "blinded": True,
            }
        )
    for ds in signals:
        fit_setups.append({"label": ds, "mode": "sig", "df": df[df.dataset == ds]})

    argset = {
        "fit_setup": fit_setups,
        "channel": parameters["mva_channels"],
        "category": df["category"].dropna().unique(),
    }
    fit_ret = parallelize(fitter, argset, client, parameters)

    # Fix: concatenate once instead of repeatedly concatenating onto an empty
    # seed frame (O(n^2), deprecated in modern pandas). ignore_index=True also
    # makes index labels unique, so the .loc[idx] after idxmin below selects
    # exactly one row per group instead of every row sharing a duplicate label.
    frames = [pd.DataFrame.from_dict(fr) for fr in fit_ret]
    if frames:
        df_fits = pd.concat(frames, ignore_index=True)
    else:
        df_fits = pd.DataFrame(columns=["label", "channel", "category", "chi2"])

    # Choose fit function with lowest chi2/dof; non-positive chi2 values are
    # treated as failed fits and pushed out of contention with a sentinel.
    df_fits.loc[df_fits.chi2 <= 0, "chi2"] = 999.0
    df_fits.to_pickle("all_chi2.pkl")
    idx = df_fits.groupby(["label", "channel", "category"])["chi2"].idxmin()
    df_fits = (
        df_fits.loc[idx]
        .reset_index()
        .set_index(["label", "channel"])
        .sort_index()
        .drop_duplicates()
    )
    print(df_fits)
    df_fits.to_pickle("best_chi2.pkl")
    return fit_ret