Code example #1
def main():
    """
    This is used as the entry point for fitting.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("--database-analysis",
                        "-d",
                        dest="database_analysis",
                        help="analysis database to be used",
                        required=True)
    parser.add_argument("--analysis",
                        "-a",
                        dest="type_ana",
                        help="choose type of analysis",
                        required=True)
    parser.add_argument("--period-number", "-p", dest="period_number", type=int,
                        help="choose type of analysis (0: 2016, 1: 2017, 2: 2018, " \
                             "-1: all merged (default))", default=-1)
    parser.add_argument("--output",
                        "-o",
                        default="simple_fit",
                        help="result output directory")

    args = parser.parse_args()

    configure_logger(False, None)

    # Extract database as dictionary
    data = parse_yaml(args.database_analysis)
    data = data[list(data.keys())[0]]
    # Run the chain
    do_simple_fit(data, args.type_ana, args.period_number, args.output)
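
For orientation, the sketch below fakes a command line and calls main(); every argument value is a placeholder invented for illustration, not a name taken from the project.

import sys

# Hypothetical invocation; the database file, analysis name and output directory are made up.
sys.argv = ["simple_fitter",
            "--database-analysis", "my_analysis_database.yml",
            "--analysis", "my_analysis",
            "--period-number", "-1",
            "--output", "simple_fit"]
main()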
Code example #2
File: utils.py  Project: talazare/MachineLearningHEP
def load_fit(save_dir):
    yaml_path = join(save_dir, "meta.yaml")
    meta_info = parse_yaml(yaml_path)

    yaml_path = join(save_dir, "init_pars.yaml")

    #pylint: disable=import-outside-toplevel
    import machine_learning_hep.fitting.fitters as search_module
    #pylint: enable=import-outside-toplevel
    fit_classes = {f[0]: getattr(search_module, f[0]) \
            for f in inspect.getmembers(search_module, inspect.isclass) \
            if f[1].__module__ == search_module.__name__}
    fit = None
    if meta_info["fit_class"] in fit_classes:
        fit = fit_classes[meta_info["fit_class"]](parse_yaml(yaml_path))
    else:
        get_logger().fatal("Fit class %s is invalid", meta_info["fit_class"])

    yaml_path = join(save_dir, "fit_pars.yaml")
    fit.fit_pars = parse_yaml(yaml_path)

    root_file_name = join(save_dir, "root_objects.root")
    root_file = TFile.Open(root_file_name, "READ")

    keys = root_file.GetListOfKeys()

    root_objects = {}
    for k in keys:
        if k.GetName() == "kernel":
            fit.kernel = k.ReadObj()
            continue
        obj = k.ReadObj()
        obj.SetDirectory(0)
        root_objects[k.GetName()] = obj
    root_file.Close()

    fit.set_root_objects(root_objects)
    fit.success = meta_info["success"]
    fit.init_fit()

    if "annotations" not in meta_info:
        return fit
    return fit, meta_info["annotations"]
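
Depending on whether annotations were saved with the fit, load_fit returns either the fit object alone or a (fit, annotations) tuple, so a caller has to handle both shapes. A minimal caller sketch follows; the save directory is a made-up placeholder.

# Hypothetical usage; "fits/pt_2_4" is an invented save directory.
result = load_fit("fits/pt_2_4")
if isinstance(result, tuple):
    fit, annotations = result
else:
    fit, annotations = result, None

if fit.success:
    print("Loaded a successful fit of class", type(fit).__name__)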
Code example #3
 def __read_successful_trials(self):
     save_path = join(self.nominal_analyzer_merged.d_resultsallpdata,
                      self.syst_out_dir, "successful_trials.yaml")
     if not exists(save_path):
         print(
             f"Cannot load working points. File {save_path} doesn't (yet) exist."
         )
         print("Do full syst in 10s...")
         sleep(10)
         return []
     return parse_yaml(save_path)["successful_trials"]
Code example #4
    def __load_working_points(self):
        save_path = join(self.nominal_analyzer_merged.d_resultsallpdata,
                         self.syst_out_dir, "working_points.yaml")
        if not exists(save_path):
            print(
                f"Cannot load working points. File {save_path} doesn't exist")
            sys.exit(1)
        read_yaml = parse_yaml(save_path)

        self.cent_cv_cut = read_yaml["central"]
        self.min_cv_cut = read_yaml["lower_limits"]
        self.max_cv_cut = read_yaml["upper_limits"]
        self.ml_wps = read_yaml["working_points"]
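
Judging from the keys read above, working_points.yaml maps to a dictionary with four entries. The shape below is an assumption and all values are invented for illustration only.

# Assumed shape of the dict returned by parse_yaml(save_path); keys from the code above,
# all values are placeholders.
read_yaml = {
    "central": [0.85, 0.90],                 # guessed: central ML cut per pT bin
    "lower_limits": [0.80, 0.85],            # guessed: lower edge of the cut variation
    "upper_limits": [0.90, 0.95],            # guessed: upper edge of the cut variation
    "working_points": [[0.80, 0.82, 0.85],
                       [0.85, 0.88, 0.90]],  # guessed: trial cut values per pT bin
}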
Code example #5
 def read(self, yaml_errors, extra_errors=None):
     """
     Read everything from YAML
     """
     error_dict = parse_yaml(yaml_errors)
     for name, errors in error_dict.items():
         if name == "names":
             self.names = errors.copy()
         else:
             self.add_errors(name, errors)
     if extra_errors is not None:
         self.errors.update(extra_errors)
         for key in extra_errors:
             self.names.append(key)
Code example #6
    def multi_mergeml_allinone(self):
        for ipt in range(self.p_nptbins):
            merge_method(self.lptper_recoml[ipt],
                         self.lpt_recoml_mergedallp[ipt])
            if self.mcordata == "mc":
                merge_method(self.lptper_genml[ipt],
                             self.lpt_genml_mergedallp[ipt])

        count_evt = 0
        count_evtorig = 0
        for evt_count_file in self.lper_evt_count_ml:
            count_dict = parse_yaml(evt_count_file)
            count_evt += count_dict["evt"]
            count_evtorig += count_dict["evtorig"]

        dump_yaml_from_dict({
            "evt": count_evt,
            "evtorig": count_evtorig
        }, self.f_evtml_count)
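
dump_yaml_from_dict is the writing counterpart of parse_yaml used throughout these examples. A plausible minimal implementation, assuming it is a thin wrapper around yaml.safe_dump (a sketch, not the project's actual code):

import yaml

def dump_yaml_from_dict(to_dump, path):
    """Sketch only: write a plain dict to a YAML file."""
    with open(path, "w") as stream:
        yaml.safe_dump(to_dump, stream, default_flow_style=False)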
Code example #7
def make_distributions(args, inv_mass, inv_mass_window):  # pylint: disable=too-many-statements

    config = parse_yaml(args.config)

    database_path = config["database"]
    data_or_mc = config["data_or_mc"]
    analysis_name = config["analysis"]
    distribution = config["distribution"]
    distribution_x_range = config["x_range"]
    out_file = config["out_file"]
    # whether or not to slice and derive weights in these slices
    period_cuts = config.get("period_cuts", None)
    slice_cuts = config.get("slice_cuts", None)
    required_columns = config.get("required_columns", None)
    query_all = config.get("query_all", None)
    use_ml_selection = config.get("use_ml_selection", True)
    use_mass_window = config.get("use_mass_window", True)

    # Now open database
    _, database = read_database(database_path)

    analysis_config = database["analysis"][analysis_name]
    inv_mass[0] = database["mass"]

    inv_mass_window[0] = config.get("mass_window", 0.02)

    # required column names
    column_names = ["ev_id", "ev_id_ext", "run_number"]
    column_names.append(distribution)

    # Add column names required by the user
    if required_columns:
        for rcn in required_columns:
            if rcn not in column_names:
                column_names.append(rcn)

    periods = database["multi"][data_or_mc]["period"]

    # is this ML or STD?
    is_ml = database["doml"]

    # No cuts for specific input file
    file_names_cut_map = None

    # Set where to read data from and set overall selection query
    column_names.append("inv_mass")
    trigger_sel = analysis_config["triggersel"][data_or_mc]
    in_top_dirs = database["mlapplication"][data_or_mc]["pkl_skimmed_dec"]
    if trigger_sel:
        if query_all:
            query_all += f" and {trigger_sel}"
        else:
            query_all = trigger_sel

    in_file_name_gen = database["files_names"]["namefile_reco"]
    in_file_name_gen = in_file_name_gen[:in_file_name_gen.find(".")]

    if is_ml:
        pkl_extension = ""
        if use_ml_selection:
            model_name = database["mlapplication"]["modelname"]
            ml_sel_column = f"y_test_prob{model_name}"
            column_names.append(ml_sel_column)
            ml_sel_pt = database["mlapplication"]["probcutoptimal"]
            pt_bins_low = database["sel_skim_binmin"]
            pt_bins_up = database["sel_skim_binmax"]
            in_file_names = [f"{in_file_name_gen}{ptl}_{ptu}" \
                    for ptl, ptu in zip(pt_bins_low, pt_bins_up)]
            file_names_cut_map = {ifn: f"{ml_sel_column} > {cut}" \
                    for ifn, cut in zip(in_file_names, ml_sel_pt)}
    else:
        pkl_extension = "_std"

    in_file_name_gen = in_file_name_gen + "*"

    # Now make the directory path right
    in_top_dirs = [f"{itd}{pkl_extension}" for itd in in_top_dirs]

    derive(periods, in_top_dirs, in_file_name_gen, column_names,
           use_mass_window, distribution, distribution_x_range,
           file_names_cut_map, out_file, period_cuts, query_all, slice_cuts)
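
Collecting the keys that make_distributions reads from args.config, the configuration YAML apparently corresponds to a dictionary roughly like the one below. The key names come from the code above; every value is an invented placeholder.

# Assumed layout of the configuration read via parse_yaml(args.config); values are placeholders.
config = {
    "database": "path/to/database.yml",
    "data_or_mc": "data",
    "analysis": "my_analysis",
    "distribution": "n_tracklets",        # placeholder column name
    "x_range": [0, 100],
    "out_file": "distributions.root",
    # optional keys, all read with config.get(...):
    "period_cuts": None,
    "slice_cuts": None,
    "required_columns": ["pt_cand"],      # placeholder column name
    "query_all": None,
    "use_ml_selection": True,
    "use_mass_window": True,
    "mass_window": 0.02,
}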
Code example #8
    def assert_model_config(self):  # pylint: disable=R0912
        """
        Validate and return the configuration for ml models
        Args:
            path: path to configuration YAML
            run_config: Run configuration since loading some models can depend on that, e.g.
                        if run_config["activate_keras"] == 0 the keras config does not need
                        to be checked and loaded.
        """
        logger = get_logger()
        logger.debug("Check sanity of user configs")

        user_config = {}
        if isinstance(self.model_config_input, str):
            user_config = parse_yaml(
                os.path.expanduser(self.model_config_input))
        elif isinstance(self.model_config_input, dict):
            user_config = self.model_config_input

        # At this point the asserted_config dict is just the one with defaults
        asserted_config = Configuration.get_meta_config("models")[
            self.run_config["mltype"]]
        user_config = user_config.get(self.run_config["mltype"], {})

        # Could probably be merged with the former loop; however, we want to catch e.g. typos,
        # because steering a run wanting keras - but writing "kras" - could cost a lot of
        # time when it needs to be done again.
        if self.run_config["mltype"] in self.run_config["activate_models"]:
            for backend, model in \
            self.run_config["activate_models"][self.run_config["mltype"]].items():
                if backend not in asserted_config:
                    logger.critical("Unknown backend %s.", backend)
                if model is None:
                    logger.critical("No models specified for backend %s.",
                                    backend)
                for name, activate in model.items():
                    if name not in asserted_config[backend]:
                        logger.critical("Unknown model %s for backend %s.",
                                        name, backend)
                    if name in asserted_config[backend]:
                        if activate is None or not isinstance(activate, bool):
                            logger.critical("Activation value of model %s for backend %s " \
                                             "must be specified as boolean value.", name, backend)
                        asserted_config[backend][name]["activate"] = activate

        # Pop deactivated models
        for backend in list(asserted_config.keys()):
            for model in list(asserted_config[backend].keys()):
                if not asserted_config[backend][model]["activate"]:
                    del asserted_config[backend][model]
                else:
                    asserted_config[backend][model] = asserted_config[backend][
                        model]["default"]
                    if backend in user_config and model in user_config[backend]:
                        if len(user_config[backend][model]) != len(
                                asserted_config[backend][model]):
                            logger.critical(
                                "Parameter list for %s model %s differs",
                                backend, model)
                        for u in asserted_config[backend][model]:
                            asserted_config[backend][model][u] = \
                                user_config[backend][model].get(u,
                                                                asserted_config[backend][model][u])

        self.model_config = asserted_config
Code example #9
    def assert_run_config(self):
        """
        Validate and return the configuration for run
        Args:
            path: path to configuration YAML
        """
        logger = get_logger()
        logger.debug("Check sanity of user configs")

        user_run_config = {}
        if isinstance(self.run_config_input, str):
            user_run_config = parse_yaml(
                os.path.expanduser(self.run_config_input))
        elif isinstance(self.run_config_input, dict):
            user_run_config = self.run_config_input

        # At this point the asserted_config dict is just the one with defaults
        run_config = Configuration.get_meta_config("run")
        asserted_config = {k: run_config[k]["default"] for k in run_config}
        choices_config = {
            k: run_config[k]["choices"]
            for k in run_config if "choices" in run_config[k]
        }
        depends_config = {
            k: run_config[k]["depends"]
            for k in run_config if "depends" in run_config[k]
        }
        types_config = {
            k: run_config[k]["type_as"]
            for k in run_config if "type_as" in run_config[k]
        }
        # Check for unknown parameters and abort, since running the entire machinery with a
        # wrong setting (e.g. 'dotaining' instead of 'dotraining', which might happen just by
        # accident) would just be overhead.
        for k in user_run_config:
            if k not in asserted_config:
                logger.critical("Unkown parameter %s in config", k)
            elif user_run_config[k] is None:
                logger.critical("Missing value for parameter %s in config", k)

        # Replace all defaults if user specified parameter
        for k in asserted_config:
            asserted_config[k] = user_run_config.get(k, asserted_config[k])
            # If parameter is already set, check if consistent
            if k in choices_config and asserted_config[
                    k] not in choices_config[k]:
                logger.critical(
                    "Invalid value %s for parameter %s. Must be one of %s",
                    str(user_run_config[k]), k, str(choices_config[k]))
            if k in types_config:
                check_types = [type(t) for t in types_config[k]]
                if not isinstance(asserted_config[k], tuple(check_types)):
                    logger.critical(
                        "Invalid value type %s of parameter %s. Must be of type %s",
                        str(type(asserted_config[k])), k, str(check_types))

        # Can so far only depend on one parameter, change to combination
        # of parameters. Do we need to check for circular dependencies?
        for k in depends_config:
            if (asserted_config[depends_config[k]["parameter"]]
                    == depends_config[k]["value"]
                    and asserted_config[k] != depends_config[k]["set"]):
                asserted_config[k] = depends_config[k]["set"]
                logger.info(
                    "Parameter %s = %s enforced since it is required for %s == %s",
                    k, str(depends_config[k]["set"]),
                    str(depends_config[k]["parameter"]),
                    str(depends_config[k]["value"]))

        self.run_config = asserted_config
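
All defaults, allowed choices, value types and dependencies are pulled from Configuration.get_meta_config("run"), so each parameter entry presumably looks something like the sketch below. Only the field names ("default", "choices", "type_as", "depends") are taken from the code above; the parameter names and values are invented.

# Assumed shape of Configuration.get_meta_config("run"); illustration only.
run_meta_config = {
    "doml": {
        "default": True,
        "type_as": [True],      # example values whose types are checked via isinstance
    },
    "mltype": {
        "default": "BinaryClassification",
        "choices": ["BinaryClassification", "Regression"],
    },
    "dotraining": {
        "default": True,
        # if doml == False, dotraining is forced to False
        "depends": {"parameter": "doml", "value": False, "set": False},
    },
}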
Code example #10
def test_yaml():
    assert isinstance(parse_yaml(YAML_PATH), dict)
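
The test only asserts that parse_yaml returns a dictionary. A minimal implementation consistent with all the examples on this page, assuming it simply wraps yaml.safe_load (a sketch, not necessarily the project's actual code):

import yaml

def parse_yaml(path):
    """Read a YAML file and return its content (typically a dict)."""
    with open(path, "r") as stream:
        return yaml.safe_load(stream)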
Code example #11
    def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: disable=too-many-statements
        """Plot results

        Results are plotted to out_dir/results.png

        Args:
            out_dir: str
                output directory where results.png will be saved
            from_yaml: str
                path to YAML file to read and plot results from
            from_pickle: str
                path to pickle file to read and plot results from

        """

        results_tmp = self.results
        scores_tmp = list(self.scoring.keys())
        score_opt_tmp = self.scoring_opt

        if from_yaml:
            read_yaml = parse_yaml(from_yaml)
            results_tmp = read_yaml["cv"]
            scores_tmp = read_yaml["score_names"]
            score_opt_tmp = read_yaml["score_opt_name"]
        elif from_pickle:
            read_yaml = pickle.load(open(from_pickle, "rb"))
            results_tmp = read_yaml["cv"]
            scores_tmp = read_yaml["score_names"]
            score_opt_tmp = read_yaml["score_opt_name"]


        # Re-arrange such that always the optimisation score is on top
        score_names = list(scores_tmp)
        del score_names[score_names.index(score_opt_tmp)]
        score_names.insert(0, score_opt_tmp)

        # Prepare figure and axes
        figsize = (35, 18 * len(score_names))
        fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05},
                                 figsize=figsize)

        # If only one score is given, need to make it iterable
        try:
            iter(axes)
        except TypeError:
            axes = [axes]

        markerstyles = ["o", "+"]
        markersize = 20
        for axi, (sn, ax) in enumerate(zip(score_names, axes)):
            ax.set_ylabel(f"CV mean {sn}", fontsize=20)
            ax.get_yaxis().set_tick_params(labelsize=20)

            # Get means of scores and plot with their std
            means = {}
            for i, tt in enumerate(("train", "test")):
                markerstyle = markerstyles[i % len(markerstyles)]
                means[tt] = [r[f"{tt}_{sn}"] for r in results_tmp]
                stds = [r[f"{tt}_{sn}_std"] for r in results_tmp]
                ax.errorbar(range(len(means[tt])), means[tt], yerr=stds, ls="",
                            marker=markerstyle, markersize=markersize, label=f"{sn} ({tt})")

            # Relative deviations between test and train
            index_high_score = means["test"].index(max(means["test"]))
            dev_high_score = \
                    abs(means["test"][index_high_score] - means["train"][index_high_score]) \
                    / means["test"][index_high_score]
            index_low_score = means["test"].index(min(means["test"]))
            dev_low_score = \
                    abs(means["test"][index_low_score] - means["train"][index_low_score]) \
                    / means["test"][index_low_score]
            dev_min = [abs(test - train) / test \
                    for train, test in zip(means["train"], means["test"])]
            index_min = dev_min.index(min(dev_min))
            dev_min = min(dev_min)

            ax.axvline(index_high_score, color="red")
            y_coord = (means["test"][index_high_score] + means["train"][index_high_score]) / 2
            ax.text(index_high_score, y_coord, f"{dev_high_score:.4f}", color="red", fontsize=20)
            ax.axvline(index_low_score, color="blue")
            y_coord = (means["test"][index_low_score] + means["train"][index_low_score]) / 2
            ax.text(index_low_score, y_coord, f"{dev_low_score:.4f}", color="blue", fontsize=20)
            ax.axvline(index_min, color="green")
            y_coord = (means["test"][index_min] + means["train"][index_min]) / 2
            ax.text(index_min, y_coord, f"{dev_min:.4f}", color="green", fontsize=20)

            leg = ax.legend(loc="upper right", fontsize=20)
            if axi == 0:
                # Add another legend for highest, lowest score and min. rel. deviation between
                # test and train score
                handles = [Line2D([0], [0], color="red"),
                           Line2D([0], [0], color="blue"),
                           Line2D([0], [0], color="green")]
                labels = ["highest test score", "lowest test score", "min. rel deviation"]
                ax.legend(handles, labels, bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left',
                          ncol=3, mode="expand", borderaxespad=0., fontsize=20)
                # Add back first legend
                ax.add_artist(leg)

        axes[-1].set_xticks(range(len(results_tmp)))
        axes[-1].set_xticklabels(range(len(results_tmp)), fontsize=20)
        axes[-1].set_xlabel("# trial", fontsize=20)
        fig.suptitle("Bayesian model optimisation", fontsize=35)

        fig.tight_layout()
        out_file = join(out_dir, "results.png")
        fig.savefig(out_file)
        plt.close(fig)
Code example #12
def perform_plot_gridsearch(names, out_dirs):
    '''
    Plot grid-search scores (works with scikit-learn 0.20)
    '''
    logger = get_logger()

    for name, out_dir in zip(names, out_dirs):

        # Read written results
        gps = parse_yaml(osjoin(out_dir, "parameters.yaml"))
        score_obj = pickle.load(openfile(osjoin(out_dir, "results.pkl"), "rb"))

        param_keys = [f"param_{key}" for key in gps["params"].keys()]
        if not param_keys:
            logger.warning("Add at least 1 parameter (even just 1 value)")
            continue

        # Re-arrange scoring such that the refitted one is always on top
        score_names = gps["scoring"]
        refit_score = gps["refit"]
        del score_names[score_names.index(refit_score)]
        score_names.insert(0, refit_score)

        # Extract scores
        x_labels = []
        y_values = {}
        y_errors = {}

        for sn in score_names:
            y_values[sn] = {"train": [], "test": []}
            y_errors[sn] = {"train": [], "test": []}

        # Get indices of values to put on x-axis and identify parameter combination
        values_indices = [
            range(len(values)) for values in gps["params"].values()
        ]

        y_axis_mins = {sn: 9999 for sn in score_names}
        y_axis_maxs = {sn: -9999 for sn in score_names}
        for indices, case in zip(
                itertools.product(*values_indices),
                itertools.product(*list(gps["params"].values()))):
            df_case = score_obj.copy()
            for i_case, i_key in zip(case, param_keys):
                df_case = df_case.loc[df_case[i_key] ==
                                      df_case[i_key].dtype.type(i_case)]

            x_labels.append(",".join([str(i) for i in indices]))
            # As we just nailed it down to one value
            for sn in score_names:
                for tt in ("train", "test"):
                    y_values[sn][tt].append(
                        df_case[f"mean_{tt}_{sn}"].values[0])
                    y_errors[sn][tt].append(
                        df_case[f"std_{tt}_{sn}"].values[0])
                    y_axis_mins[sn] = min(y_axis_mins[sn],
                                          y_values[sn][tt][-1])
                    y_axis_maxs[sn] = max(y_axis_maxs[sn],
                                          y_values[sn][tt][-1])

        # Prepare text for parameters
        text_parameters = "\n".join(
            [f"{key}: {values}" for key, values in gps["params"].items()])

        # To determine fontsizes later
        figsize = (35, 18 * len(score_names))
        fig, axes = plt.subplots(len(score_names),
                                 1,
                                 sharex=True,
                                 gridspec_kw={"hspace": 0.05},
                                 figsize=figsize)
        ax_plot = dict(zip(score_names, axes))

        # The axes to put the parameter list
        ax_main = axes[-1]
        # The axes with the title being on top
        ax_top = axes[0]

        points_per_inch = 72
        markerstyles = ["o", "+"]
        markersize = 20

        for sn in score_names:
            ax = ax_plot[sn]
            ax_min = y_axis_mins[sn] - (y_axis_maxs[sn] -
                                        y_axis_mins[sn]) / 10.
            ax_max = y_axis_maxs[sn] + (y_axis_maxs[sn] -
                                        y_axis_mins[sn]) / 10.
            ax.set_ylim(ax_min, ax_max)
            ax.set_ylabel(f"mean {sn}", fontsize=20)
            ax.get_yaxis().set_tick_params(labelsize=20)

            for j, tt in enumerate(("train", "test")):
                markerstyle = markerstyles[j % len(markerstyles)]

                ax.errorbar(range(len(x_labels)),
                            y_values[sn][tt],
                            yerr=y_errors[sn][tt],
                            ls="",
                            marker=markerstyle,
                            markersize=markersize,
                            label=f"{sn} ({tt})")

                # Add values to points
                ylim = ax.get_ylim()
                plot_labels_offset = (ylim[1] - ylim[0]) / 40
                for x, y in enumerate(y_values[sn][tt]):
                    ax.text(x, y - plot_labels_offset, f"{y:.4f}", fontsize=20)

        ax_main.set_xlabel("parameter indices", fontsize=20)
        ax_top.set_title(f"Grid search {name}", fontsize=30)
        ax_main.get_xaxis().set_tick_params(labelsize=20)
        ax_main.set_xticks(range(len(x_labels)))
        ax_main.set_xticklabels(x_labels, rotation=45)

        text_point_size = int(4 * fig.dpi / points_per_inch * figsize[1] /
                              len(gps["params"]))
        xlim = ax_main.get_xlim()
        ylim = ax_main.get_ylim()

        xlow = xlim[0] + (xlim[1] - xlim[0]) / 100
        ylow = ylim[0] + (ylim[1] - ylim[0]) / 3
        ax_main.text(xlow, ylow, text_parameters, fontsize=text_point_size)

        for ax in ax_plot.values():
            ax.legend(loc="center right", fontsize=20)
        plotname = osjoin(out_dir, "GridSearchResults.png")
        plt.savefig(plotname)
        plt.close(fig)
Code example #13
    def do_significance(self):
        if self.step_done("significance"):
            return

        self.do_apply()
        self.do_test()

        self.logger.info("Doing significance optimization")
        gROOT.SetBatch(True)
        gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")
        # First extract the number of data events in the ML sample and the total number
        # of events. This might need a revisit; for now just take the numbers from the
        # ML-merged event count (i.e. from a YAML, since the actual events are not needed).
        # Before, the ML count was always taken from the ML-merged event dataframe while
        # the total number was taken from the event counter, but the latter has basically
        # not been used for a long time because "dofullevtmerge" is mostly "false" in the DBs.
        count_dict = parse_yaml(self.f_evt_count_ml)
        self.p_nevttot = count_dict["evtorig"]
        self.p_nevtml = count_dict["evt"]
        self.logger.debug("Number of data events used for ML: %d",
                          self.p_nevtml)
        self.logger.debug("Total number of data events: %d", self.p_nevttot)
        # Calculate the acceptance correction. In this case we use all the signal from
        # the MC sample, without limiting to the number of signal events used for training.
        denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
        numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
        acc, acc_err = calc_eff(numacc, denacc)
        self.logger.debug("Acceptance: %.3e +/- %.3e", acc, acc_err)
        # Calculate the expected FONLL signal
        delta_pt = self.p_binmax - self.p_binmin
        if self.is_fonll_from_root:
            df_fonll = TFile.Open(self.f_fonll)
            df_fonll_Lc = df_fonll.Get(self.p_fonllparticle + "_" +
                                       self.p_fonllband)
            bin_min = df_fonll_Lc.FindBin(self.p_binmin)
            bin_max = df_fonll_Lc.FindBin(self.p_binmax)
            prod_cross = df_fonll_Lc.Integral(
                bin_min, bin_max) * self.p_fragf * 1e-12 / delta_pt
            signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa * self.p_br \
                           / (self.p_sigmamb * self.p_fprompt)
            #now we plot the fonll expectation
            cFONLL = TCanvas('cFONLL', 'The FONLL expectation')
            df_fonll_Lc.GetXaxis().SetRangeUser(0, 16)
            df_fonll_Lc.Draw("")
            cFONLL.SaveAs("%s/FONLL_curve_%s.png" %
                          (self.dirmlplot, self.s_suffix))
        else:
            df_fonll = pd.read_csv(self.f_fonll)
            df_fonll_in_pt = \
                    df_fonll.query('(pt >= @self.p_binmin) and (pt < @self.p_binmax)')\
                    [self.p_fonllband]
            prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / delta_pt
            signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                           / (self.p_sigmamb * self.p_fprompt)
            #now we plot the fonll expectation
            fig = plt.figure(figsize=(20, 15))
            plt.subplot(111)
            plt.plot(df_fonll['pt'],
                     df_fonll[self.p_fonllband] * self.p_fragf,
                     linewidth=4.0)
            plt.xlabel('P_t [GeV/c]', fontsize=20)
            plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
            plt.title("FONLL cross section " + self.p_case, fontsize=20)
            plt.semilogy()
            plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
            plt.close(fig)

        self.logger.debug("Expected signal yield: %.3e", signal_yield)
        signal_yield = self.p_raahp * signal_yield
        self.logger.debug("Expected signal yield x RAA hp: %.3e", signal_yield)

        df_data_sideband = self.df_data.query(self.s_selbkgml)
        df_data_sideband = shuffle(df_data_sideband,
                                   random_state=self.rnd_shuffle)
        df_data_sideband = df_data_sideband.tail(
            round(len(df_data_sideband) * self.p_bkgfracopt))
        hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0],
                     self.p_mass_fit_lim[1])
        df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
        mass_array = df_mc_signal[self.v_invmass].values
        for mass_value in np.nditer(mass_array):
            hmass.Fill(mass_value)

        gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0],
                       self.p_mass_fit_lim[1])
        # Initial parameters: amplitude, mean and width of the Gaussian
        gaus_fit.SetParameter(0, hmass.Integral())
        gaus_fit.SetParameter(1, self.p_mass)
        gaus_fit.SetParameter(2, 0.02)
        self.logger.debug("To fit the signal a gaussian function is used")
        fitsucc = hmass.Fit("gaus_fit", "RQ")

        sigma = gaus_fit.GetParameter(2)
        if int(fitsucc) != 0:
            self.logger.warning("Problem in signal peak fit")
            sigma = 0.

        self.logger.debug("Mean of the gaussian: %.3e",
                          gaus_fit.GetParameter(1))
        self.logger.debug("Sigma of the gaussian: %.3e", sigma)
        sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]
        fig_signif_pevt = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
        #plt.title("Significance Per Event vs Threshold", fontsize=20)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        fig_signif = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
        #plt.title("Significance vs Threshold", fontsize=20)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)

        df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]

        for name in self.p_classname:
            eff_array, eff_err_array, x_axis = calc_sigeff_steps(
                self.p_nstepsign, df_sig, name)
            bkg_array, bkg_err_array, _ = calc_bkg(
                df_data_sideband, name, self.p_nstepsign, self.p_mass_fit_lim,
                self.p_bkg_func, self.p_bin_width, sig_region, self.p_savefit,
                self.dirmlplot, [self.p_binmin, self.p_binmax], self.v_invmass)
            sig_array = [eff * signal_yield for eff in eff_array]
            sig_err_array = [
                eff_err * signal_yield for eff_err in eff_err_array
            ]
            bkg_array = [
                bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array
            ]
            bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) \
                             for bkg_err in bkg_err_array]
            signif_array, signif_err_array = calc_signif(
                sig_array, sig_err_array, bkg_array, bkg_err_array)
            plt.figure(fig_signif_pevt.number)
            plt.errorbar(x_axis,
                         signif_array,
                         yerr=signif_err_array,
                         label=f'{name}',
                         elinewidth=2.5,
                         linewidth=5.0)
            signif_array_ml = [
                sig * sqrt(self.p_nevtml) for sig in signif_array
            ]
            signif_err_array_ml = [
                sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array
            ]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis,
                         signif_array_ml,
                         yerr=signif_err_array_ml,
                         label=f'{name}_ML_dataset',
                         elinewidth=2.5,
                         linewidth=5.0)
            plt.text(
                0.7,
                0.95,
                f" ${self.p_binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$",
                verticalalignment="center",
                transform=fig_signif.gca().transAxes,
                fontsize=30)
            #signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
            #signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array]
            #plt.figure(fig_signif.number)
            #plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot,
            #             label=f'{name}_Tot', elinewidth=2.5, linewidth=5.0)
            plt.figure(fig_signif_pevt.number)
            plt.legend(loc="upper left", prop={'size': 30})
            plt.savefig(
                f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
            plt.figure(fig_signif.number)
            plt.legend(loc="upper left", prop={'size': 30})
            mpl.rcParams.update({"text.usetex": True})
            plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
            mpl.rcParams.update({"text.usetex": False})

            with open(f'{self.dirmlplot}/Significance_{self.s_suffix}.pickle',
                      'wb') as out:
                pickle.dump(fig_signif, out)

            plt.close(fig_signif_pevt)
            plt.close(fig_signif)
Code example #14
class Optimiser:  # pylint: disable=too-many-public-methods
    #Class Attribute
    species = "optimiser"

    def __init__(self, data_param, case, typean, model_config, binmin, binmax,
                 raahp, training_var, index):

        self.logger = get_logger()

        dirmcml = data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
        dirdataml = data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
        self.v_bin = data_param["var_binning"]
        #directory
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]

        # Check here which steps have been done already
        self.steps_done = None
        self.file_steps_done = os.path.join(self.dirmlout, "steps_done.yaml")
        if os.path.exists(self.file_steps_done):
            self.steps_done = parse_yaml(self.file_steps_done)["done"]
        if self.steps_done is None \
                and (os.listdir(self.dirmlout) or os.listdir(self.dirmlplot)):
            # Backwards compatible
            print(f"rm -r {self.dirmlout}")
            print(f"rm -r {self.dirmlplot}")
            self.logger.fatal("Please remove above directories as indicated above first and " \
                    "run again")

        #ml file names
        self.n_reco = data_param["files_names"]["namefile_reco"]
        self.n_reco = self.n_reco.replace(
            ".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_evt = data_param["files_names"]["namefile_evt"]
        self.n_evt_count_ml = data_param["files_names"].get(
            "namefile_evt_count", "evtcount.yaml")
        self.n_gen = data_param["files_names"]["namefile_gen"]
        self.n_gen = self.n_gen.replace(
            ".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_treetest = data_param["files_names"]["treeoutput"]
        self.n_reco_applieddata = data_param["files_names"][
            "namefile_reco_applieddata"]
        self.n_reco_appliedmc = data_param["files_names"][
            "namefile_reco_appliedmc"]
        # ml files
        self.f_gen_mc = os.path.join(dirmcml, self.n_gen)
        self.f_reco_mc = os.path.join(dirmcml, self.n_reco)
        self.f_evt_mc = os.path.join(dirmcml, self.n_evt)
        self.f_reco_data = os.path.join(dirdataml, self.n_reco)
        self.f_evt_count_ml = os.path.join(dirdataml, self.n_evt_count_ml)
        self.f_reco_applieddata = os.path.join(self.dirmlout,
                                               self.n_reco_applieddata)
        self.f_reco_appliedmc = os.path.join(self.dirmlout,
                                             self.n_reco_appliedmc)
        #variables
        self.v_all = data_param["variables"]["var_all"]
        self.v_train = training_var
        self.v_selected = data_param["variables"].get("var_selected", None)
        if self.v_selected:
            self.v_selected = self.v_selected[index]
        self.v_bound = data_param["variables"]["var_boundaries"]
        self.v_sig = data_param["variables"]["var_signal"]
        self.v_invmass = data_param["variables"]["var_inv_mass"]
        self.v_cuts = data_param["variables"].get("var_cuts", [])
        self.v_corrx = data_param["variables"]["var_correlation"][0]
        self.v_corry = data_param["variables"]["var_correlation"][1]
        self.v_isstd = data_param["bitmap_sel"]["var_isstd"]
        self.v_ismcsignal = data_param["bitmap_sel"]["var_ismcsignal"]
        self.v_ismcprompt = data_param["bitmap_sel"]["var_ismcprompt"]
        self.v_ismcfd = data_param["bitmap_sel"]["var_ismcfd"]
        self.v_ismcbkg = data_param["bitmap_sel"]["var_ismcbkg"]
        #parameters
        self.p_case = case
        self.p_typean = typean
        self.p_nbkg = data_param["ml"]["nbkg"]
        self.p_nsig = data_param["ml"]["nsig"]
        self.p_tagsig = data_param["ml"]["sampletagforsignal"]
        self.p_tagbkg = data_param["ml"]["sampletagforbkg"]
        self.p_binmin = binmin
        self.p_binmax = binmax
        self.p_npca = None
        self.p_mltype = data_param["ml"]["mltype"]
        self.p_nkfolds = data_param["ml"]["nkfolds"]
        self.p_ncorescross = data_param["ml"]["ncorescrossval"]
        self.rnd_shuffle = data_param["ml"]["rnd_shuffle"]
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]
        self.p_plot_options = data_param["variables"].get("plot_options", {})
        self.p_dofullevtmerge = data_param["dofullevtmerge"]

        self.p_evtsel = data_param["ml"]["evtsel"]
        self.p_triggersel_mc = data_param["ml"]["triggersel"]["mc"]
        self.p_triggersel_data = data_param["ml"]["triggersel"]["data"]

        #dataframes
        self.df_mc = None
        self.df_mcgen = None
        self.df_data = None
        self.arraydf = None
        self.df_sig = None
        self.df_bkg = None
        self.df_ml = None
        self.df_mltest = None
        self.df_mltrain = None
        self.df_sigtrain = None
        self.df_sigtest = None
        self.df_bkgtrain = None
        self.df_bktest = None
        self.df_xtrain = None
        self.df_ytrain = None
        self.df_xtest = None
        self.df_ytest = None
        #selections
        self.s_selbkgml = data_param["ml"]["sel_bkgml"]
        self.s_selsigml = data_param["ml"]["sel_sigml"]
        self.p_equalise_sig_bkg = data_param["ml"].get("equalise_sig_bkg",
                                                       False)
        #model param
        self.db_model = model_config
        self.p_class = None
        self.p_classname = None
        self.p_trainedmod = None
        self.s_suffix = None

        #significance
        self.is_fonll_from_root = data_param["ml"]["opt"]["isFONLLfromROOT"]
        self.f_fonll = data_param["ml"]["opt"]["filename_fonll"]
        if self.is_fonll_from_root and "fonll_particle" not in data_param[
                "ml"]["opt"]:
            self.logger.fatal("Attempt to read FONLL from ROOT file but field " \
                    "\"fonll_particle\" not provided in database")
        self.p_fonllparticle = data_param["ml"]["opt"].get(
            "fonll_particle", "")
        self.p_fonllband = data_param["ml"]["opt"]["fonll_pred"]
        self.p_fragf = data_param["ml"]["opt"]["FF"]
        self.p_sigmamb = data_param["ml"]["opt"]["sigma_MB"]
        self.p_taa = data_param["ml"]["opt"]["Taa"]
        self.p_br = data_param["ml"]["opt"]["BR"]
        self.p_fprompt = data_param["ml"]["opt"]["f_prompt"]
        self.p_bkgfracopt = data_param["ml"]["opt"]["bkg_data_fraction"]
        self.p_nstepsign = data_param["ml"]["opt"]["num_steps"]
        self.p_bkg_func = data_param["ml"]["opt"]["bkg_function"]
        self.p_savefit = data_param["ml"]["opt"]["save_fit"]
        self.p_nevtml = None
        self.p_nevttot = None
        self.p_presel_gen_eff = data_param["ml"]["opt"]["presel_gen_eff"]
        # Potentially mask certain values (e.g. nsigma TOF of -999)
        self.p_mask_values = data_param["ml"].get("mask_values", None)
        self.p_mass_fit_lim = data_param["analysis"][
            self.p_typean]['mass_fit_lim']
        self.p_bin_width = data_param["analysis"][self.p_typean]['bin_width']
        self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \
                                     self.p_bin_width))
        self.p_mass = data_param["mass"]
        self.p_raahp = raahp
        self.create_suffix()
        self.preparesample()
        self.loadmodels()
        self.df_evt_data = None
        self.df_evttotsample_data = None

        self.f_reco_applieddata = \
                self.f_reco_applieddata.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_reco_appliedmc = \
                self.f_reco_appliedmc.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_df_ml_test_to_df = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl"
        self.f_mltest_applied = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl"
        self.df_mltest_applied = None

        print(training_var)